In [41]:
import pandas as pd

In [42]:
responses_df = pd.read_excel("../Data/responses_data.xlsx")

In [43]:
def prepare_df(df):
    df = df.copy().rename(columns={"description":'question_description'})
    df  = df[['question_description', 'answer_category_num']]
    df.drop_duplicates(subset='question_description', inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    # checking how many comments have different votes
    for i, row in df.iterrows():

        comment = row.question_description

        # getting the dataframe for that comment
        temp_df = df[df.question_description == comment]

        # how many unique answers that comment has
        n_ = temp_df.answer_category_num.nunique()

        # changing the answer_category_num to the most frequent one
        if n_ >= 2:
            most_voted_label = temp_df.answer_category_num.value_counts().index[0]
            df.at[i, 'answer_category_num'] = most_voted_label
            
    return df

In [44]:
responses_df_clean = prepare_df(responses_df)

## `turning the problem into binary classification`

### hate / not hate

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [46]:
# Model definitions
LogReg_model = LogisticRegression()
RandomForestClassifier_model = RandomForestClassifier(max_depth=3, random_state=0)

MultinomialNB_model = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
SGDClassifier_model = SGDClassifier(class_weight='balanced', penalty='l1')
KNeighborsClassifier_model = KNeighborsClassifier(n_neighbors=3)
DecisionTreeClassifier_model = DecisionTreeClassifier(random_state=0)

models = [LogReg_model, RandomForestClassifier_model, DecisionTreeClassifier_model,
          SGDClassifier_model,   
          KNeighborsClassifier_model,  MultinomialNB_model]
model_names = ['LogisticRegression', 'RandomForestClassifier', 'DecisionTreeClassifier',
               'SGDClassifier', 'KNeighborsClassifier', 'MultinomialNB']

In [47]:
def binarize_classes(df, class_chosen="hate"):
    responses_df_binary = df.copy()
    
    
    data_classes = responses_df_binary["answer_category_num"].unique()
    if class_chosen not in data_classes:
        mapping_classes = {'Religious affiliation': "hate", 
                           'Violent': "hate", 
                           'Racist': "hate", 
                           'Mockery': "hate",
                           'Sexual harrasment': "hate", 
                           'Normal': "not hate"}
    else:
        mapping_classes = {}
        for class_i in data_classes:
            if class_i == class_chosen:
                mapping_classes[class_i] = "{}".format(class_chosen)
            else:
                mapping_classes[class_i] = "not {}".format(class_chosen)
    
    responses_df_binary = responses_df_binary.replace({"answer_category_num": mapping_classes})
    return responses_df_binary
        

In [48]:
def train_and_evaluate(df, model_names):
    def train_models(X_tr, X_te, y_tr, y_te):
        for i, model in enumerate(models):
            print(f"Model: {model_names[i]}")
            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_te)
            print('val accuracy %s' % accuracy_score(y_te, y_pred))
            print(classification_report(y_te, y_pred))  # for further evaluation
            print()
    
    train_data, test_data = train_test_split(df, test_size=0.15, random_state=42)
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    tfidf_tr = tfidf_vectorizer.fit_transform(train_data['question_description'])
    tfidf_val = tfidf_vectorizer.transform(test_data['question_description'])
    
    X_train, y_train = tfidf_tr, train_data['answer_category_num']
    X_test, y_test = tfidf_val, test_data['answer_category_num']
    
    train_models(X_train, X_test, y_train, y_test)

---

In [49]:
# hate/not hate
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="hate")
display(responses_df_binary_main.head())
print()

train_and_evaluate(responses_df_binary_main, model_names)

Unnamed: 0,question_description,answer_category_num
0,وزير الخارجية اللبناني جبران باسيل قال في سلسل...,hate
1,سورية بلد الحضارات تربطها بعلية او بحيوان,hate
2,تقتلون وسام الحسن وتترحموعلية من أي أصناف المخ...,hate
3,معك خبر انو بلدة قطر متل ما سميتا مساحتها اكبر...,not hate
4,للامانه قوت الموسم اللي طاف كان هوا بس متحمس ح...,not hate



Model: LogisticRegression
val accuracy 0.7219827586206896
              precision    recall  f1-score   support

        hate       0.73      0.98      0.84       336
    not hate       0.45      0.04      0.07       128

    accuracy                           0.72       464
   macro avg       0.59      0.51      0.45       464
weighted avg       0.65      0.72      0.63       464


Model: RandomForestClassifier
val accuracy 0.7241379310344828
              precision    recall  f1-score   support

        hate       0.72      1.00      0.84       336
    not hate       0.00      0.00      0.00       128

    accuracy                           0.72       464
   macro avg       0.36      0.50      0.42       464
weighted avg       0.52      0.72      0.61       464


Model: DecisionTreeClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


val accuracy 0.6530172413793104
              precision    recall  f1-score   support

        hate       0.74      0.81      0.77       336
    not hate       0.33      0.25      0.28       128

    accuracy                           0.65       464
   macro avg       0.53      0.53      0.53       464
weighted avg       0.63      0.65      0.64       464


Model: SGDClassifier
val accuracy 0.6508620689655172
              precision    recall  f1-score   support

        hate       0.75      0.79      0.77       336
    not hate       0.35      0.30      0.32       128

    accuracy                           0.65       464
   macro avg       0.55      0.54      0.54       464
weighted avg       0.64      0.65      0.64       464


Model: KNeighborsClassifier
val accuracy 0.6745689655172413
              precision    recall  f1-score   support

        hate       0.77      0.78      0.78       336
    not hate       0.41      0.39      0.40       128

    accuracy                       

In [50]:
# Religious affiliation / not Religious affiliation
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Religious affiliation")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.8771551724137931
                           precision    recall  f1-score   support

    Religious affiliation       0.00      0.00      0.00        57
not Religious affiliation       0.88      1.00      0.93       407

                 accuracy                           0.88       464
                macro avg       0.44      0.50      0.47       464
             weighted avg       0.77      0.88      0.82       464


Model: RandomForestClassifier
val accuracy 0.8771551724137931


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

    Religious affiliation       0.00      0.00      0.00        57
not Religious affiliation       0.88      1.00      0.93       407

                 accuracy                           0.88       464
                macro avg       0.44      0.50      0.47       464
             weighted avg       0.77      0.88      0.82       464


Model: DecisionTreeClassifier
val accuracy 0.8318965517241379
                           precision    recall  f1-score   support

    Religious affiliation       0.14      0.07      0.09        57
not Religious affiliation       0.88      0.94      0.91       407

                 accuracy                           0.83       464
                macro avg       0.51      0.50      0.50       464
             weighted avg       0.79      0.83      0.81       464


Model: SGDClassifier
val accuracy 0.8081896551724138
                           precision    recall  f1-score   support

    R

In [51]:
# Violent / not Violent
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Violent")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.8168103448275862
              precision    recall  f1-score   support

     Violent       0.00      0.00      0.00        84
 not Violent       0.82      1.00      0.90       380

    accuracy                           0.82       464
   macro avg       0.41      0.50      0.45       464
weighted avg       0.67      0.82      0.74       464


Model: RandomForestClassifier
val accuracy 0.8189655172413793
              precision    recall  f1-score   support

     Violent       0.00      0.00      0.00        84
 not Violent       0.82      1.00      0.90       380

    accuracy                           0.82       464
   macro avg       0.41      0.50      0.45       464
weighted avg       0.67      0.82      0.74       464


Model: DecisionTreeClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


val accuracy 0.7478448275862069
              precision    recall  f1-score   support

     Violent       0.20      0.13      0.16        84
 not Violent       0.82      0.88      0.85       380

    accuracy                           0.75       464
   macro avg       0.51      0.51      0.50       464
weighted avg       0.71      0.75      0.73       464


Model: SGDClassifier
val accuracy 0.7413793103448276
              precision    recall  f1-score   support

     Violent       0.26      0.23      0.24        84
 not Violent       0.83      0.86      0.84       380

    accuracy                           0.74       464
   macro avg       0.55      0.54      0.54       464
weighted avg       0.73      0.74      0.73       464


Model: KNeighborsClassifier
val accuracy 0.7607758620689655
              precision    recall  f1-score   support

     Violent       0.15      0.07      0.10        84
 not Violent       0.82      0.91      0.86       380

    accuracy                       

In [52]:
# Racist / not Racist
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Racist")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.8987068965517241
              precision    recall  f1-score   support

      Racist       0.00      0.00      0.00        47
  not Racist       0.90      1.00      0.95       417

    accuracy                           0.90       464
   macro avg       0.45      0.50      0.47       464
weighted avg       0.81      0.90      0.85       464


Model: RandomForestClassifier
val accuracy 0.8987068965517241
              precision    recall  f1-score   support

      Racist       0.00      0.00      0.00        47
  not Racist       0.90      1.00      0.95       417

    accuracy                           0.90       464
   macro avg       0.45      0.50      0.47       464
weighted avg       0.81      0.90      0.85       464


Model: DecisionTreeClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


val accuracy 0.8318965517241379
              precision    recall  f1-score   support

      Racist       0.12      0.11      0.11        47
  not Racist       0.90      0.91      0.91       417

    accuracy                           0.83       464
   macro avg       0.51      0.51      0.51       464
weighted avg       0.82      0.83      0.83       464


Model: SGDClassifier
val accuracy 0.8146551724137931
              precision    recall  f1-score   support

      Racist       0.15      0.17      0.16        47
  not Racist       0.90      0.89      0.90       417

    accuracy                           0.81       464
   macro avg       0.53      0.53      0.53       464
weighted avg       0.83      0.81      0.82       464


Model: KNeighborsClassifier
val accuracy 0.8814655172413793
              precision    recall  f1-score   support

      Racist       0.17      0.04      0.07        47
  not Racist       0.90      0.98      0.94       417

    accuracy                       

In [53]:
# Mockery / not Mockery
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Mockery")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.771551724137931
              precision    recall  f1-score   support

     Mockery       0.50      0.02      0.04       106
 not Mockery       0.77      0.99      0.87       358

    accuracy                           0.77       464
   macro avg       0.64      0.51      0.45       464
weighted avg       0.71      0.77      0.68       464


Model: RandomForestClassifier
val accuracy 0.771551724137931
              precision    recall  f1-score   support

     Mockery       0.00      0.00      0.00       106
 not Mockery       0.77      1.00      0.87       358

    accuracy                           0.77       464
   macro avg       0.39      0.50      0.44       464
weighted avg       0.60      0.77      0.67       464


Model: DecisionTreeClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


val accuracy 0.6724137931034483
              precision    recall  f1-score   support

     Mockery       0.29      0.31      0.30       106
 not Mockery       0.79      0.78      0.79       358

    accuracy                           0.67       464
   macro avg       0.54      0.55      0.54       464
weighted avg       0.68      0.67      0.68       464


Model: SGDClassifier
val accuracy 0.6939655172413793
              precision    recall  f1-score   support

     Mockery       0.30      0.25      0.27       106
 not Mockery       0.79      0.83      0.81       358

    accuracy                           0.69       464
   macro avg       0.54      0.54      0.54       464
weighted avg       0.67      0.69      0.68       464


Model: KNeighborsClassifier
val accuracy 0.7262931034482759
              precision    recall  f1-score   support

     Mockery       0.35      0.23      0.27       106
 not Mockery       0.79      0.87      0.83       358

    accuracy                       

In [54]:
# Sexual harrasment / not Sexual harrasment
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Sexual harrasment")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.9094827586206896
                       precision    recall  f1-score   support

    Sexual harrasment       0.00      0.00      0.00        42
not Sexual harrasment       0.91      1.00      0.95       422

             accuracy                           0.91       464
            macro avg       0.45      0.50      0.48       464
         weighted avg       0.83      0.91      0.87       464


Model: RandomForestClassifier
val accuracy 0.9094827586206896
                       precision    recall  f1-score   support

    Sexual harrasment       0.00      0.00      0.00        42
not Sexual harrasment       0.91      1.00      0.95       422

             accuracy                           0.91       464
            macro avg       0.45      0.50      0.48       464
         weighted avg       0.83      0.91      0.87       464


Model: DecisionTreeClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


val accuracy 0.8814655172413793
                       precision    recall  f1-score   support

    Sexual harrasment       0.24      0.14      0.18        42
not Sexual harrasment       0.92      0.95      0.94       422

             accuracy                           0.88       464
            macro avg       0.58      0.55      0.56       464
         weighted avg       0.86      0.88      0.87       464


Model: SGDClassifier
val accuracy 0.834051724137931
                       precision    recall  f1-score   support

    Sexual harrasment       0.14      0.17      0.15        42
not Sexual harrasment       0.92      0.90      0.91       422

             accuracy                           0.83       464
            macro avg       0.53      0.53      0.53       464
         weighted avg       0.85      0.83      0.84       464


Model: KNeighborsClassifier
val accuracy 0.8922413793103449
                       precision    recall  f1-score   support

    Sexual harrasment       

In [55]:
# Racist / not Racist
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Racist")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.8987068965517241
              precision    recall  f1-score   support

      Racist       0.00      0.00      0.00        47
  not Racist       0.90      1.00      0.95       417

    accuracy                           0.90       464
   macro avg       0.45      0.50      0.47       464
weighted avg       0.81      0.90      0.85       464


Model: RandomForestClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


val accuracy 0.8987068965517241
              precision    recall  f1-score   support

      Racist       0.00      0.00      0.00        47
  not Racist       0.90      1.00      0.95       417

    accuracy                           0.90       464
   macro avg       0.45      0.50      0.47       464
weighted avg       0.81      0.90      0.85       464


Model: DecisionTreeClassifier
val accuracy 0.8318965517241379
              precision    recall  f1-score   support

      Racist       0.12      0.11      0.11        47
  not Racist       0.90      0.91      0.91       417

    accuracy                           0.83       464
   macro avg       0.51      0.51      0.51       464
weighted avg       0.82      0.83      0.83       464


Model: SGDClassifier
val accuracy 0.8125
              precision    recall  f1-score   support

      Racist       0.14      0.17      0.16        47
  not Racist       0.90      0.88      0.89       417

    accuracy                           0.81  