## `1) Libraries`

In [1]:
import subprocess
import psycopg2
import pandas as pd

from sklearn.metrics import classification_report

## `2) Getting Data`

In [2]:

creds = {'dbname': 'd1mikus7g5uss8',
 'host': 'ec2-99-81-68-240.eu-west-1.compute.amazonaws.com',
 'port': '5432',
 'user': 'iohznziolcottb',
 'password': '5a812ea29f6142328bc2afab03e48e6462939babe87610342cdf12e2d357a4f0',
 'sslmode': 'require'}

creds

{'dbname': 'd1mikus7g5uss8',
 'host': 'ec2-99-81-68-240.eu-west-1.compute.amazonaws.com',
 'port': '5432',
 'user': 'iohznziolcottb',
 'password': '5a812ea29f6142328bc2afab03e48e6462939babe87610342cdf12e2d357a4f0',
 'sslmode': 'require'}

In [3]:
conn = psycopg2.connect(host=creds['host'], database=creds['dbname'],
                                user=creds['user'],
                                password=creds['password'])
cur = conn.cursor()

In [4]:
select_Query = 'select q.description, q.question_id, a.answer_category_num, a.answer_justification, a.answer_upvote, \
              a.account_id_id from public."DiscH_prototype_question" q RIGHT JOIN public."DiscH_prototype_answer" a \
ON a.question_id_id=q.question_id'
cur.execute(select_Query)
QAs = cur.fetchall() 
QA_col = ["question_description", "question_id", "answer_category_num", "answer_justification", "answer_upvote", "account_id"]
QA_df = pd.DataFrame(QAs, columns=QA_col)

## `3) Preparation and Cleaning`

### - Preparing DF

In [90]:
import pandas as pd

# import the dediacritization tool
from camel_tools.utils.dediac import dediac_ar

# Reducing Orthographic Ambiguity
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar

# toknenization
from camel_tools.tokenizers.word import simple_word_tokenize

# Morphological Disambiguation (Maximum Likelihood Disambiguator)
from camel_tools.disambig.mle import MLEDisambiguator
mle = MLEDisambiguator.pretrained() # instantiation fo MLE disambiguator

# tokenization / lemmatization (choosing approach that best fit the project)
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [5]:
def prepare_df(df):
    df = df.copy().rename(columns={"description":'question_description'})
    df  = df[["question_id", 'question_description', 'answer_category_num']]
    df.drop_duplicates(subset='question_description', inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    # checking how many comments have different votes
    for i, row in df.iterrows():

        comment = row.question_description

        # getting the dataframe for that comment
        temp_df = df[df.question_description == comment]

        # how many unique answers that comment has
        n_ = temp_df.answer_category_num.nunique()

        # changing the answer_category_num to the most frequent one
        if n_ >= 2:
            most_voted_label = temp_df.answer_category_num.value_counts().index[0]
            df.at[i, 'answer_category_num'] = most_voted_label
            
    return df

In [7]:
# QA_df_xlsx = pd.read_excel("../data/responses_data.xlsx")
# QA_df_xlsx = QA_df_xlsx.copy().rename(columns={"description":'question_description'})
# QA_df_xlsx

In [8]:
QA_df_clean = prepare_df(QA_df)
QA_df_clean

Unnamed: 0,question_id,question_description,answer_category_num
0,1,وزير الخارجية اللبناني جبران باسيل قال في سلسل...,Religious affiliation
1,2,سورية بلد الحضارات تربطها بعلية او بحيوان,Violent
2,4246,تقتلون وسام الحسن وتترحموعلية من أي أصناف المخ...,Racist
3,5304,معك خبر انو بلدة قطر متل ما سميتا مساحتها اكبر...,Normal
4,1706,للامانه قوت الموسم اللي طاف كان هوا بس متحمس ح...,Normal
...,...,...,...
3132,3720,كلامك هراء من دون اي قيمة تذكر انت بلوة من الب...,Violent
3133,7784,راح خبرك شو شايفة ب جبران باسيل,Normal
3134,1318,كلمة جبران باسيل أخجلته وأخجلت كل الأمة العربية,Sexual harrasment
3135,3153,انا مش عم خوفك يا قراع انا عم قلك انقبر انقلع ...,Violent


### - Text Cleaning

In [10]:
def remove_urls(text):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ', text)

def remove_html(text):
    return BeautifulSoup(text, "html.parser").text

symb_re = re.compile(r"""[!"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~،؟…«“\":\"…”]""")
def remove_symbols(text: str) -> str:
    return symb_re.sub(repl="", string=text)

multiple_space_re = re.compile("\s{2,}")
def remove_multiple_whitespace(text):
    return multiple_space_re.sub(repl=" ", string=text)

In [11]:
stop_word_list = pd.read_csv('../Data/stop_words/list.csv')['words'].to_list()
tokenizer = MorphologicalTokenizer(mle, scheme='atbtok', diac=False) # atbseg scheme 
def text_clean(txt):
    txt = remove_urls(txt)
    txt = remove_html(txt)
    
    # remove stopwords
    txt = ' '.join(word for word in txt.split() if word not in stop_word_list)
    
    # dediacritization
    txt = dediac_ar(txt)
    
    # normalization: Reduce Orthographic Ambiguity and Dialectal Variation
    txt = normalize_alef_maksura_ar(txt)
    txt = normalize_alef_ar(txt)
    txt = normalize_teh_marbuta_ar(txt)
    
    # normalization: Reducing Morphological Variation
    tokens = simple_word_tokenize(txt)
    disambig = mle.disambiguate(tokens)
    lemmas = [d.analyses[0].analysis['lex'] for d in disambig]
    tokens = tokenizer.tokenize(lemmas)
    txt = ' '.join(tokens)
    
    # remove longation
    txt = re.sub("[إأآا]", "ا", txt)
    txt = re.sub("ى", "ي", txt)
    txt = re.sub("ؤ", "ء", txt)
    txt = re.sub("ئ", "ء", txt)
    txt = re.sub("ة", "ه", txt)
    txt = re.sub("گ", "ك", txt)
    
    # remove non-arabic words, or non-numbers, or non-english words in the text
    txt = re.sub(r'[^a-zA-Z\s0-9\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD.0-9]+'
                 ,' ', txt)
    
    # remove symbols
    txt = remove_symbols(txt)
    
    # remove multiple whitespace
    txt = remove_multiple_whitespace(txt)
    
    
    return txt

In [12]:
QA_df_clean['question_description_clean'] = QA_df_clean['question_description'].apply(text_clean)

In [92]:
# i = 0

In [91]:
# display(QA_df_clean["question_description"][i])
# display(QA_df_clean["question_description_clean"][i])
# i += 1

### - removing frequent words, forming vocabulary

In [93]:
from sklearn.feature_extraction.text import CountVectorizer

In [94]:
count_vectorizer = CountVectorizer()
bag_of_words = count_vectorizer.fit_transform(QA_df_clean['question_description_clean'])
feature_names = count_vectorizer.get_feature_names()
word_frequencies = bag_of_words.toarray().sum(axis=0) 



In [95]:
# vocabulary = count_vectorizer.vocabulary_
# vocabulary_terms = list(vocabulary.keys())
# terms_to_remove = []

# for term in vocabulary_terms:
#     frequency = vocabulary[term]
#     print("term: {} / frequency: {}".format(term, frequency))
#     print()
#     break

In [97]:
feature_names = count_vectorizer.get_feature_names()
word_frequencies = bag_of_words.toarray().sum(axis=0) 

In [235]:
word_occurence_set = list(zip(feature_names, word_frequencies))
word_occurence_set_asc = sorted(word_occurence_set, key=lambda x: x[1], reverse=True)

vocabulary = []
for wos in word_occurence_set_asc:
    word, count = wos 
    
    # frequently occuring words are filetered
    if count > bag_of_words.shape[0] * 0.5:
        continue
    elif count <= 1:
        continue
    else:
        vocabulary.append(word)

print('len vocab (before filtering): {}'.format(len(word_occurence_set_asc)))
print('len vocab (after filtering): {}'.format(len(vocabulary)))

len vocab (before filtering): 7565
len vocab (after filtering): 3214


In [284]:
# QA_df_clean.to_csv("../moh_test/test_data/QA_clean.csv", index=False)

## 4) `TF-IDF (testing model performance with cleaned data)`

In [236]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [237]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report

In [285]:
clean_data = QA_df_clean[["question_description_clean", "answer_category_num"]]

In [314]:
train_data, test_data = train_test_split(QA_df_clean, test_size=0.15, random_state=1)

In [315]:
train_data.head()

Unnamed: 0,question_id,question_description,answer_category_num,question_description_clean
2459,8807,اااه يافلسطين ااااه من ضعفنا اللهم لا تحسبنا و...,Normal,اااه يافلسطين ااااه ضعف اللهم تحسب سال ضعف سام...
1365,6182,الله اكبر ع كل كلب بيكره وليد بيك بدنا نقصفلو ...,Racist,الله اكبر كلب بيكره وليد بك بدن نقصفلو رقبتوا
630,3639,انت رئس الفساد,Violent,رءس فساد
1151,1152,ياريت شعبك بيسمع واذا سمع يوقف واذا وقف يحدث ا...,Sexual harrasment,ياريت شعب بيسمع اذا سمع وقف اذا حدث فرقه شعب ل...
2060,7891,من وطأة التراث الديني مثلا المعركة الحاصلة الآ...,Religious affiliation,وطاه تراث ديني مثل معركه حاصل ان ماء ي زكاه فط...


In [323]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train_data["question_description_clean"])

In [324]:
# tfidf_vectorizer = TfidfVectorizer()
tfidf_tr = tfidf_vectorizer.transform(train_data['question_description_clean'])
tfidf_val = tfidf_vectorizer.transform(test_data['question_description_clean'])

In [325]:
# Model definitions
LogReg_model = LogisticRegression()
RandomForestClassifier_model = RandomForestClassifier(max_depth=3, random_state=0)

MultinomialNB_model = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
SGDClassifier_model = SGDClassifier(class_weight='balanced', penalty='l1')
KNeighborsClassifier_model = KNeighborsClassifier(n_neighbors=3)
DecisionTreeClassifier_model = DecisionTreeClassifier(random_state=0)

models = [LogReg_model, RandomForestClassifier_model, DecisionTreeClassifier_model,
          SGDClassifier_model,   
          KNeighborsClassifier_model,  MultinomialNB_model]
model_names = ['LogisticRegression', 'RandomForestClassifier', 'DecisionTreeClassifier',
               'SGDClassifier', 'KNeighborsClassifier', 'MultinomialNB']

In [326]:
X_train, y_train = tfidf_tr, train_data['answer_category_num']
X_train.shape, y_train.shape

((2666, 7002), (2666,))

In [327]:
X_test, y_test = tfidf_val, test_data['answer_category_num']
X_test.shape, y_test.shape

((471, 7002), (471,))

In [328]:
def train_models(X_tr, X_te, y_tr, y_te):
    for i, model in enumerate(models):
        print(f"Model: {model_names[i]}")
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_te)
        print("----- METRIC -----")
        print('val accuracy %s' % accuracy_score(y_te, y_pred))
#         print("----- PREDICTION DISTRIBUTION -----")
#         print(pd.Series(y_te).value_counts())
#         print(pd.Series(y_pred).value_counts())
#         print("----- CLASSIFICATION REPORT -----")
#         print(classification_report(y_te, y_pred))
        print()

In [329]:
train_models(X_train, X_test, y_train, y_test)

Model: LogisticRegression
----- METRIC -----
val accuracy 0.33970276008492567

Model: RandomForestClassifier
----- METRIC -----
val accuracy 0.2760084925690021

Model: DecisionTreeClassifier
----- METRIC -----
val accuracy 0.267515923566879

Model: SGDClassifier
----- METRIC -----
val accuracy 0.2505307855626327

Model: KNeighborsClassifier
----- METRIC -----
val accuracy 0.2653927813163482

Model: MultinomialNB
----- METRIC -----
val accuracy 0.32908704883227174



### conclusions

- cleaning text did increase accuracy, though not by much
- limiting vocabulary by removing frequent and occuring-once terms did not improve the results