# Exploring Corpora with Text Normalization

In this assignment, I perform text normalization using regular expressions and then explore several corpora.

The corpora are in Arabic, and I want to find the counts of the 10 most frequent words, both without and with text normalization.

The text normalization covers:

1.   علامات الترقيم (punctuation marks)
2.   الحركات (short-vowel diacritics)
3.   التنوين (tanween / nunation)



### Functions

In [0]:
import re
from google.colab import drive

def load_corpus(file_name):
    """Read a UTF-8 encoded text file and return its full contents.

    Args:
        file_name: Path handed to the built-in ``open``.

    Returns:
        The whole file content as a single ``str``.
    """
    # The context manager closes the handle as soon as the read finishes
    # (or fails) instead of leaving it open until garbage collection.
    with open(file_name, encoding='utf-8') as corpus_file:
        return corpus_file.read()

def words_count(text):
    """Tally how often each whitespace-separated token occurs in ``text``.

    Returns:
        A list of ``(token, occurrences)`` pairs, ordered by the first
        appearance of each token.
    """
    tally = {}
    for token in text.split():
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1
    return [(token, occurrences) for token, occurrences in tally.items()]

def regex(term):
    """Build a pattern matching ``term`` with an optional mark on each side.

    Args:
        term: The literal word to look for.

    Returns:
        A regular-expression string: one optional character from the
        punctuation/diacritic class, the literal ``term``, then another
        optional character from the same class.

    The pattern is not anchored to word boundaries, so ``re.findall`` also
    reports ``term`` where it occurs inside a longer word.
    """
    # Inside the class, `,-\/` is a range from ',' to '/', so it admits
    # ',', '-', '.' and '/'. NOTE(review): confirm that '.' is intended.
    edge = '["\',-\\/:،؛ًٌٍَُِّ]?'
    # Escape the term so that tokens containing regex metacharacters
    # ('+', '(', '?', ...) are matched literally instead of being parsed
    # as pattern syntax (which could mis-match or raise re.error).
    return edge + re.escape(term) + edge

### Files

In [0]:
# Each corpus location is paired with the text loaded from it.
# NOTE(review): these strings are GitHub "blob" page URLs, not local paths,
# and they are passed to load_corpus() unchanged. A plain open() cannot read
# a URL — confirm the loader handles them, or download the files first.

# Al Jazeera titles
jsc_file = 'https://github.com/motazsaad/NLP-ICTS6361/blob/master/corpus/aljazeera.net_20190419_titles.txt'
jsc = load_corpus(jsc_file)

# CNN Arabic titles
cnn_file = 'https://github.com/motazsaad/NLP-ICTS6361/blob/master/corpus/arabic.cnn.com_20190419_titles.txt'
cnn = load_corpus(cnn_file)

# Euronews Arabic titles
euro_file = 'https://github.com/motazsaad/NLP-ICTS6361/blob/master/corpus/arabic.euronews.com_20190409_titles.txt'
euro = load_corpus(euro_file)

# RT Arabic titles
rt_file = 'https://github.com/motazsaad/NLP-ICTS6361/blob/master/corpus/arabic.rt.com_20190419_titles.txt'
rt = load_corpus(rt_file)

# BBC titles
bbc_file = 'https://github.com/motazsaad/NLP-ICTS6361/blob/master/corpus/bbc.com_20190409_titles.txt'
bbc = load_corpus(bbc_file)

### Count

In [0]:
# For every corpus: tally the raw tokens, then rank them. words_count()
# yields (word, count) pairs; flipping them to (count, word) lets a plain
# descending sort order by frequency first (ties fall back to the word,
# also descending).
jsc_count = words_count(jsc)
sorted_jsc_count = sorted(((freq, word) for word, freq in jsc_count), reverse=True)

cnn_count = words_count(cnn)
sorted_cnn_count = sorted(((freq, word) for word, freq in cnn_count), reverse=True)

euro_count = words_count(euro)
sorted_euro_count = sorted(((freq, word) for word, freq in euro_count), reverse=True)

rt_count = words_count(rt)
sorted_rt_count = sorted(((freq, word) for word, freq in rt_count), reverse=True)

bbc_count = words_count(bbc)
sorted_bbc_count = sorted(((freq, word) for word, freq in bbc_count), reverse=True)

### Explore

In [0]:
# For each corpus, take its ten top-ranked (count, word) pairs and record,
# per word, how many matches the regex() pattern for that word finds in the
# full corpus text.
norm_jsc_count = {
    word: len(re.findall(regex(word), jsc))
    for _, word in sorted_jsc_count[:10]
}

norm_cnn_count = {
    word: len(re.findall(regex(word), cnn))
    for _, word in sorted_cnn_count[:10]
}

norm_euro_count = {
    word: len(re.findall(regex(word), euro))
    for _, word in sorted_euro_count[:10]
}

norm_rt_count = {
    word: len(re.findall(regex(word), rt))
    for _, word in sorted_rt_count[:10]
}

norm_bbc_count = {
    word: len(re.findall(regex(word), bbc))
    for _, word in sorted_bbc_count[:10]
}

### Results

#### JSC

without text normalization

In [0]:
# Ten highest-ranked (count, word) pairs of the JSC corpus, raw tokens.
print(sorted_jsc_count[:10])

[(21042, 'في'), (10199, 'من'), (6902, 'على'), (4616, 'مصر'), (3732, 'عن'), (3335, 'مقتل'), (2711, 'قتلى'), (2597, 'غزة'), (2359, 'مع'), (2349, 'إلى')]

with text normalization

[('في', 28108), ('من', 19135), ('على', 7163), ('مصر', 9002), ('عن', 5510), ('مقتل', 3463), ('قتلى', 3113), ('غزة', 3546), ('مع', 7303), ('إلى', 2349)]

In [0]:
# First ten (word, regex match count) entries recorded for the JSC corpus.
print(list(norm_jsc_count.items())[:10])

#### Euro

without text normalization

In [0]:
# Ten highest-ranked (count, word) pairs of the Euronews corpus, raw tokens.
print(sorted_euro_count[:10])

[(21101, 'في'), (8009, 'من'), (7730, 'على'), (3033, 'مع'), (2753, 'إلى'), (2701, 'شاهد:'), (2663, 'بعد'), (2550, 'عن'), (1725, 'ترامب'), (1321, 'السعودية')]

with text normalization

In [0]:
# First ten (word, regex match count) entries recorded for the Euronews corpus.
print(list(norm_euro_count.items())[:10])

[('في', 26389), ('من', 13929), ('على', 8098), ('مع', 5970), ('إلى', 2769), ('شاهد:', 2718), ('بعد', 2845), ('عن', 3709), ('ترامب', 2149), ('السعودية', 1518)]

#### CNN

without text normalization

In [0]:
# Ten highest-ranked (count, word) pairs of the CNN corpus, raw tokens.
print(sorted_cnn_count[:10])

[(8905, 'في'), (6401, 'من'), (5167, 'على'), (2318, 'عن'), (2276, 'بعد'), (1937, 'إلى'), (1476, 'داعش'), (1470, 'لـCNN:'), (1433, 'مع'), (1172, 'هل')]

with text normalization

In [0]:
# First ten (word, regex match count) entries recorded for the CNN corpus.
print(list(norm_cnn_count.items())[:10])

[('في', 13652), ('من', 12166), ('على', 5321), ('عن', 3507), ('بعد', 2536), ('إلى', 1954), ('داعش', 3144), ('لـCNN:', 1472), ('مع', 4165), ('هل', 1942)]

#### RT

without text normalization

In [0]:
# Ten highest-ranked (count, word) pairs of the RT corpus, raw tokens.
print(sorted_rt_count[:10])

[(119313, 'في'), (49560, 'من'), (48777, 'على'), (20839, 'عن'), (20554, 'إلى'), (16939, 'روسيا'), (16372, 'مع'), (9323, 'بعد'), (9322, 'الروسية'), (8982, 'سوريا')]

with text normalization

In [0]:
# First ten (word, regex match count) entries recorded for the RT corpus.
print(list(norm_rt_count.items())[:10])

[('في', 170496), ('من', 89792), ('على', 49696), ('عن', 27359), ('إلى', 20620), ('روسيا', 19939), ('مع', 34430), ('بعد', 10746), ('الروسية', 11311), ('سوريا', 10872)]

#### BBC

without text normalization

In [0]:
# Ten highest-ranked (count, word) pairs of the BBC corpus, raw tokens.
print(sorted_bbc_count[:10])

[(43957, 'في'), (14610, 'على'), (14051, 'من'), (5865, 'عن'), (4184, 'مقتل'), (4111, 'إلى'), (3648, 'مع'), (3423, 'بعد'), (2635, 'بين'), (2236, 'سوريا')]

with text normalization

In [0]:
# First ten (word, regex match count) entries recorded for the BBC corpus.
print(list(norm_bbc_count.items())[:10])

[('في', 54271), ('على', 14877), ('من', 26774), ('عن', 8263), ('مقتل', 4586), ('إلى', 4127), ('مع', 10640), ('بعد', 3831), ('بين', 3919), ('سوريا', 3386)]