In [7]:
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
QA_df = pd.read_csv("../moh_test/test_data/QA_clean.csv")

## `1) Exploration`

In [3]:
QA_df

Unnamed: 0,question_id,question_description,answer_category_num,question_description_clean
0,1,وزير الخارجية اللبناني جبران باسيل قال في سلسل...,Religious affiliation,وزير خارجي لبناني جبران باسيل سلسله تغريد عقب ...
1,2,سورية بلد الحضارات تربطها بعلية او بحيوان,Violent,سوري بلد حضاره ربط عليه حيوان
2,4246,تقتلون وسام الحسن وتترحموعلية من أي أصناف المخ...,Racist,قتل وسام حسن وتترحموعليه اي صنف مخلوق انتم
3,5304,معك خبر انو بلدة قطر متل ما سميتا مساحتها اكبر...,Normal,معك نوي بلد قطر متل سمي مساحه اكبر لبنان ل عيب...
4,1706,للامانه قوت الموسم اللي طاف كان هوا بس متحمس ح...,Normal,امانه قوت موسم ل ي طاف هوا متحمس حق الجاي ان ا...
...,...,...,...,...
3132,3720,كلامك هراء من دون اي قيمة تذكر انت بلوة من الب...,Violent,كلام هراء قيمه ذكر بلا البلوات منتقل قتل قتيل ...
3133,7784,راح خبرك شو شايفة ب جبران باسيل,Normal,خبر شو شايفه جبران باسيل
3134,1318,كلمة جبران باسيل أخجلته وأخجلت كل الأمة العربية,Sexual harrasment,كلمه جبران باسيل اخجل اخجل امه عربي
3135,3153,انا مش عم خوفك يا قراع انا عم قلك انقبر انقلع ...,Violent,مش عم خوف يا قراع عم قل انقبر انقلع خلق يا قرد...


In [5]:
QA_df["answer_category_num"].value_counts()

Normal                   783
Mockery                  761
Violent                  572
Religious affiliation    374
Sexual harrasment        328
Racist                   319
Name: answer_category_num, dtype: int64

## `2) ML`

### 2.1 definitions

In [10]:
# Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

In [11]:
# Model definitions
LogReg_model = LogisticRegression()
RandomForestClassifier_model = RandomForestClassifier(max_depth=3, random_state=0)

MultinomialNB_model = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
SGDClassifier_model = SGDClassifier(class_weight='balanced', penalty='l1')
KNeighborsClassifier_model = KNeighborsClassifier(n_neighbors=3)
DecisionTreeClassifier_model = DecisionTreeClassifier(random_state=0)

models = [LogReg_model, RandomForestClassifier_model, DecisionTreeClassifier_model,
          SGDClassifier_model,   
          KNeighborsClassifier_model,  MultinomialNB_model]
model_names = ['LogisticRegression', 'RandomForestClassifier', 'DecisionTreeClassifier',
               'SGDClassifier', 'KNeighborsClassifier', 'MultinomialNB']

### 2.2 training

In [43]:
def train_and_evaluate(df, model_names):
    def train_models(X_tr, X_te, y_tr, y_te):
        accuracies = []
        for i, model in enumerate(models):
            print(f"Model: {model_names[i]}")
            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_te)
            acc = accuracy_score(y_te, y_pred)
            print('val accuracy %s' % acc)
            accuracies.append(acc)
#             print("----- PREDICTION DISTRIBUTION -----")
#             print(pd.Series(y_te).value_counts())
#             print(pd.Series(y_pred).value_counts())
#             print("----- CLASSIFICATION REPORT -----")
#             print(classification_report(y_te, y_pred))
            print()
        
        ind_max_acc = accuracies.index(max(accuracies))
        main_predictions = models[ind_max_acc].predict(X_te)
        return main_predictions
    
    train_data, test_data = train_test_split(df, test_size=0.15, random_state=1)
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    tfidf_tr = tfidf_vectorizer.fit_transform(train_data['question_description_clean'])
    tfidf_val = tfidf_vectorizer.transform(test_data['question_description_clean'])
    
    X_train, y_train = tfidf_tr, train_data['answer_category_num']
    X_test, y_test = tfidf_val, test_data['answer_category_num']
    predictions = train_models(X_train, X_test, y_train, y_test)
    test_data['predictions'] = predictions
    return test_data

In [40]:
test_data = train_and_evaluate(QA_df, model_names)

Model: LogisticRegression
val accuracy 0.33970276008492567

Model: RandomForestClassifier
val accuracy 0.2760084925690021

Model: DecisionTreeClassifier
val accuracy 0.267515923566879

Model: SGDClassifier
val accuracy 0.25902335456475584

Model: KNeighborsClassifier
val accuracy 0.2653927813163482

Model: MultinomialNB
val accuracy 0.32908704883227174



`note` best model in terms of accuracy is **logisitic regression**, so we're going to assume its results for the predictions

### 2.3 analyzing results

In [45]:
true_label_dist = test_data["answer_category_num"].value_counts()
predictions_dist = test_data["predictions"].value_counts()

print("----- TRUE LABEL -----")
display(true_label_dist)
print("----- PREDICTION -----")
display(predictions_dist)

----- TRUE LABEL -----


Mockery                  118
Normal                   118
Violent                   89
Religious affiliation     59
Racist                    48
Sexual harrasment         39
Name: answer_category_num, dtype: int64

----- PREDICTION -----


Normal                   193
Mockery                  180
Violent                   83
Racist                    10
Religious affiliation      5
Name: predictions, dtype: int64

`note` we can see a difference in class distribution between predictions and true labels, especially in 'Racist', 'Religious affiliation', and 'Sexual harrasment' classes

In [55]:
correct_predictions_test_data = test_data[test_data["answer_category_num"] == test_data["predictions"]]
incorrect_predictions_test_data = test_data[test_data["answer_category_num"] != test_data["predictions"]]

print("----- COUNT ------")
print("number of correct predictions {}".format(correct_predictions_test_data.shape[0]))
print("number of incorrect predictions {}".format(incorrect_predictions_test_data.shape[0]))
print()

print("----- DISTRIBUTION ------")
print("# predictions predicted correctly:")
display(correct_predictions_test_data["predictions"].value_counts())
print("# predictions predicted incorrectly:")
display(incorrect_predictions_test_data["answer_category_num"].value_counts())
print("original distribution:")
display(test_data["answer_category_num"].value_counts())

----- COUNT ------
number of correct predictions 160
number of incorrect predictions 311

----- DISTRIBUTION ------
# predictions predicted correctly:


Normal                   74
Mockery                  55
Violent                  28
Religious affiliation     2
Racist                    1
Name: predictions, dtype: int64

# predictions predicted incorrectly:


Mockery                  63
Violent                  61
Religious affiliation    57
Racist                   47
Normal                   44
Sexual harrasment        39
Name: answer_category_num, dtype: int64