In [23]:
import os
import re

import nltk
import pandas as pd
from nltk.stem.isri import ISRIStemmer
from tqdm import tqdm

In [24]:
# Load the training data; later cells read its 'stance' and 'text' columns.
train = pd.read_csv('dataset/train.csv')

In [25]:
# Show how many rows carry each distinct value of the 'stance' column.
train['stance'].value_counts()

 1    5538
 0    1012
-1     438
Name: stance, dtype: int64

In [26]:
# Cleaning Arabic text is complex: it touches many Unicode ranges and has to be
# done in several ordered steps, so it lives in its own function.
def clean_arabic(text):
    """Normalise Arabic text so only basic Arabic letters and whitespace remain.

    The passes run in this order:
      1. Delete U+0600-U+061F and U+064B-U+066D (Arabic signs and
         punctuation, the tashkeel marks, Arabic-Indic digits).
      2. Delete the ASCII ranges U+0024-U+003F, U+005B-U+0060 and
         U+007B-U+007E. They are deleted rather than replaced by a space so
         that words split with symbols to dodge spam filters (e.g. كو.ر.ونا)
         are joined back together.
      3. Fold non-standard letter variants onto their standard letter.
      4. Delete the tatweel (ـ) used to stretch words.
      5. Replace every remaining non-whitespace character outside
         U+0620-U+064A with a single space.

    Unicode chart used as reference while writing these ranges:
    https://ar.wikipedia.org/wiki/%D8%A7%D9%84%D8%AE%D8%B7_%D8%A7%D9%84%D8%B9%D8%B1%D8%A8%D9%8A_%D9%81%D9%8A_%D9%8A%D9%88%D9%86%D9%8A%D9%83%D9%88%D8%AF

    Whitespace is not collapsed here; the caller is expected to do that.
    """
    # Passes 1 and 2: pure deletions, Arabic ranges first, then ASCII symbols.
    deletions = (
        r'[\u0600-\u061F]',
        r'[\u064B-\u066D]',
        r'[\u0024-\u003F]',
        r'[\u005B-\u0060]',
        r'[\u007B-\u007E]',
    )
    for pattern in deletions:
        text = re.sub(pattern, '', text)

    # Pass 3: (variants, standard letter) pairs, applied in the listed order.
    letter_folds = (
        (('چ',), 'ج'),
        (('ڤ', 'ڨ'), 'ف'),
        # ڠ is folded onto غ (not ق)
        (('ڠ',), 'غ'),
        (('ٱ', 'ٲ', 'ٳ', 'ٴ', 'ٵ', 'آ', 'أ', 'إ'), 'ا'),
        (('ٶ', 'ٷ'), 'و'),
        (('ٸ', 'ی'), 'ي'),
        (('پ',), 'ب'),
        (('ژ',), 'ز'),
        (('ک', 'ڪ', 'ګ', 'ڬ', 'ڭ', 'ڮ', 'گ', 'ڰ', 'ڱ', 'ڲ', 'ڳ', 'ڴ'), 'ك'),
        (('ھ',), 'ه'),
    )
    for variants, standard in letter_folds:
        for variant in variants:
            text = re.sub(variant, standard, text)

    # Pass 4: drop the tatweel / kashida (shift + ت on an Arabic keyboard).
    without_tatweel = re.sub(r'ـ', '', text)

    # Pass 5: whatever is still not a basic Arabic letter or whitespace
    # becomes a space.
    return re.sub(r'[^\u0620-\u064A\s]', ' ', without_tatweel)

In [27]:
def clean_text(text):
    """Drop hashtags, normalise the Arabic letters and squeeze whitespace.

    Every whitespace-delimited word that contains '#' is replaced by a space,
    the remainder is passed through clean_arabic, and finally each run of
    whitespace is collapsed into one space. The result is not stripped, so it
    can still begin or end with a single space.
    """
    # A "hashtag word" is any run of non-whitespace characters containing '#'.
    without_hashtags = re.sub(r'[^\s]*#[^\s]*', ' ', text)

    # Letter-level normalisation; leaves only Arabic letters and whitespace.
    arabic_only = clean_arabic(without_hashtags)

    return re.sub(r'\s+', ' ', arabic_only)

In [28]:
# Clean every training text; clean_text is passed directly, no lambda wrapper needed.
cleaned_train_data = train['text'].apply(clean_text)

In [29]:
# This notebook needs NLTK's 'punkt' and 'stopwords' resources.
# ! If they are missing, run the following lines in an empty cell first:
# nltk.download('punkt')
# nltk.download('stopwords')

# Tokenize each cleaned text with NLTK's word tokenizer.
# NOTE(review): the cleaned text holds only Arabic letters and single spaces,
# so this presumably amounts to splitting on spaces - confirm.
tokenized_train_data = list(map(nltk.word_tokenize, cleaned_train_data))

In [30]:
# Peek at one raw text (row label 3185) with a single explicit .loc lookup.
train.loc[3185, 'text']

'الموت أرحم عندي من انو  آخد لقاح موهوب من دولة الاجرام الامارات وبوساطة سعد الحريري.'

In [31]:
def read_stop_words(stopwords_extracted_from_dataset, path="stopwords/arabic.txt"):
    """Add every non-empty line of a stop-word file to the given set, in place.

    Args:
        stopwords_extracted_from_dataset: set that receives the words; it is
            mutated in place.
        path: UTF-8 text file holding one stop word per line. Defaults to
            "stopwords/arabic.txt", the file this function always read before
            the parameter existed.

    Returns:
        None. The words are added to the set that was passed in.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version opened the file and never closed it.
    with open(path, encoding='utf-8') as file:
        for line in file:
            word = line.rstrip('\n')
            # Skip blank lines so the empty string never becomes a stop word.
            if word:
                stopwords_extracted_from_dataset.add(word)



In [32]:
# remove stopwords
stopwords = nltk.corpus.stopwords.words('arabic')

# The tokens were produced from text normalised by clean_text (alef/hamza
# variants folded to ا, diacritics stripped, ...). A stop word that is still in
# its raw spelling can never equal such a token, so every NLTK word is pushed
# through the same normalisation. The raw spellings are kept as well, so nothing
# that was removed before is kept now.
# NOTE(review): assumes NLTK's list contains un-normalised spellings; if it is
# already normalised this step is simply a no-op.
# A set is used because membership is tested once per token.
nltk_stopword_set = set(stopwords)
nltk_stopword_set.update(
    normalized
    for normalized in (clean_text(word).strip() for word in stopwords)
    if normalized
)
tokenized_no_stopwords_train_data = [
    [word for word in text if word not in nltk_stopword_set]
    for text in tokenized_train_data
]

# Extra stop words found by inspecting the dataset's short (two- and
# three-character) tokens by hand. The duplicated 'اي' entry was dropped;
# it made no difference inside a set.
stopwords_extracted_from_dataset = {
    'ال', 'اي', 'ان', 'تم', 'بن',
    'او', 'عم', 'ام', 'رض',
    'في', 'فى', 'رب', 'سم', 'خط',
    'ول', 'زي', 'دي', 'اذ', 'ده',
    'دى', 'انه', 'ابو', 'احد',
}
# Extend the hand-made set with the words from the stop-word file.
read_stop_words(stopwords_extracted_from_dataset)
tokenized_no_stopwords_train_data_v2 = [
    [word for word in text if word not in stopwords_extracted_from_dataset]
    for text in tokenized_no_stopwords_train_data
]

In [33]:
# Collect every remaining token of at most two characters together with the
# position of the text it came from, as candidates for further stop words.
# enumerate() gives each text its own position; list.index() rescanned the whole
# corpus for every hit and reported the FIRST matching text, so duplicated
# texts were all attributed to the same index.
# (Despite its name, stopwords_set is a list of (word, index-as-string) tuples.)
stopwords_set = [
    (word, str(text_index))
    for text_index, text in enumerate(tokenized_no_stopwords_train_data_v2)
    for word in text
    if len(word) <= 2
]

# Sort by word, then by the index string, so equal words sit next to each other.
stopwords_set.sort()

In [34]:
# Stemming with NLTK's ISRIStemmer (this is stemming, not lemmatization).
# The stemmer is built once and reused; constructing a new ISRIStemmer for every
# single word was loop-invariant work.
stemmer = ISRIStemmer()

# One list of stems per text, in the same order as the input texts.
stemmed_train = [
    [stemmer.stem(word) for word in text]
    for text in tqdm(tokenized_no_stopwords_train_data_v2)
]

100%|██████████| 6988/6988 [00:00<00:00, 9352.52it/s]


In [36]:
# Write every stage of the pipeline into its own file for visual comparison
def _write_items(path, items, separator='\n\n'):
    """Write str(item) followed by `separator` for each item to a UTF-8 file.

    The parent directory of `path` is created when it does not exist yet, so
    the cell also works on a fresh checkout. The `with` block closes the file;
    no explicit close() is needed afterwards.
    """
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'w', encoding="utf-8") as f:
        f.writelines(str(item) + separator for item in items)


_write_items('output/original_train.txt', train['text'])
_write_items('output/cleaned_up_train.txt', cleaned_train_data)
_write_items('output/tokens_train.txt', tokenized_train_data)
_write_items('output/tokens_no_stopwords_train.txt', tokenized_no_stopwords_train_data)
# One (word, text index) tuple per line, without the blank line between entries.
_write_items('output/words_less_than_2.txt', stopwords_set, separator='\n')
_write_items('output/stemmed_train.txt', stemmed_train)
# use alt + z to toggle word wrap in vscode 