In [1]:
import pandas as pd

In [49]:
responses_df = pd.read_excel("../Data/responses_data_500.xlsx")

In [50]:
def prepare_df(df):
    df = df.copy().rename(columns={"description":'question_description'})
    df  = df[['question_description', 'answer_category_num']]
    df.drop_duplicates(subset='question_description', inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    # checking how many comments have different votes
    for i, row in df.iterrows():

        comment = row.question_description

        # getting the dataframe for that comment
        temp_df = df[df.question_description == comment]

        # how many unique answers that comment has
        n_ = temp_df.answer_category_num.nunique()

        # changing the answer_category_num to the most frequent one
        if n_ >= 2:
            most_voted_label = temp_df.answer_category_num.value_counts().index[0]
            df.at[i, 'answer_category_num'] = most_voted_label
            
    return df

In [51]:
responses_df_clean = prepare_df(responses_df)

## `turning the problem into binary classification`

### hate / not hate

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [63]:
# Model definitions
LogReg_model = LogisticRegression()
RandomForestClassifier_model = RandomForestClassifier(max_depth=3, random_state=0)

MultinomialNB_model = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
SGDClassifier_model = SGDClassifier(class_weight='balanced', penalty='l1')
KNeighborsClassifier_model = KNeighborsClassifier(n_neighbors=3)
DecisionTreeClassifier_model = DecisionTreeClassifier(random_state=0)

models = [LogReg_model, RandomForestClassifier_model, DecisionTreeClassifier_model,
          SGDClassifier_model,   
          KNeighborsClassifier_model,  MultinomialNB_model]
model_names = ['LogisticRegression', 'RandomForestClassifier', 'DecisionTreeClassifier',
               'SGDClassifier', 'KNeighborsClassifier', 'MultinomialNB']

In [56]:
def binarize_classes(df, class_chosen="hate"):
    responses_df_binary = df.copy()
    
    
    data_classes = responses_df_binary["answer_category_num"].unique()
    if class_chosen not in data_classes:
        mapping_classes = {'Religious affiliation': "hate", 
                           'Violent': "hate", 
                           'Racist': "hate", 
                           'Mockery': "hate",
                           'Sexual harrasment': "hate", 
                           'Normal': "not hate"}
    else:
        mapping_classes = {}
        for class_i in data_classes:
            if class_i == class_chosen:
                mapping_classes[class_i] = "{}".format(class_chosen)
            else:
                mapping_classes[class_i] = "not {}".format(class_chosen)
    
    responses_df_binary = responses_df_binary.replace({"answer_category_num": mapping_classes})
    return responses_df_binary
        

In [122]:
def train_and_evaluate(df, model_names):
    def train_models(X_tr, X_te, y_tr, y_te):
        for i, model in enumerate(models):
            print(f"Model: {model_names[i]}")
            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_te)
            print('val accuracy %s' % accuracy_score(y_te, y_pred))
            # print(classification_report(y_te, y_pred))  # for further evaluation
            print()
    
    train_data, test_data = train_test_split(df, test_size=0.15, random_state=42)
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    tfidf_tr = tfidf_vectorizer.fit_transform(train_data['question_description'])
    tfidf_val = tfidf_vectorizer.transform(test_data['question_description'])
    
    X_train, y_train = tfidf_tr, train_data['answer_category_num']
    X_test, y_test = tfidf_val, test_data['answer_category_num']
    
    train_models(X_train, X_test, y_train, y_test)

---

In [123]:
# hate/not hate
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="hate")
responses_df_binary_main.head()

Unnamed: 0,question_description,answer_category_num
0,وزير الخارجية اللبناني جبران باسيل قال في سلسل...,hate
1,سورية بلد الحضارات تربطها بعلية او بحيوان,hate
2,اخي الحاج اذا شعرت انك محرجا من الانتقادات لتص...,not hate
3,ما فيك تعيش بلا ما تكب فتن ليل نهار وبكرة قلهم...,not hate
4,هذا البطل الذي قاتل وجاذف بحياته لتحيا انت يا ...,hate


In [124]:
# Religious affiliation / not Religious affiliation
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Religious affiliation")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.8421052631578947

Model: RandomForestClassifier
val accuracy 0.8421052631578947

Model: DecisionTreeClassifier
val accuracy 0.8070175438596491

Model: SGDClassifier
val accuracy 0.6491228070175439

Model: KNeighborsClassifier
val accuracy 0.8070175438596491

Model: MultinomialNB
val accuracy 0.8245614035087719



In [125]:
# Violent / not Violent
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Violent")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.7543859649122807

Model: RandomForestClassifier
val accuracy 0.7543859649122807

Model: DecisionTreeClassifier
val accuracy 0.7543859649122807

Model: SGDClassifier
val accuracy 0.6666666666666666

Model: KNeighborsClassifier
val accuracy 0.7368421052631579

Model: MultinomialNB
val accuracy 0.7368421052631579



In [126]:
# Racist / not Racist
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Racist")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.9122807017543859

Model: RandomForestClassifier
val accuracy 0.9122807017543859

Model: DecisionTreeClassifier
val accuracy 0.8070175438596491

Model: SGDClassifier
val accuracy 0.7192982456140351

Model: KNeighborsClassifier
val accuracy 0.8771929824561403

Model: MultinomialNB
val accuracy 0.9122807017543859



In [127]:
# Mockery / not Mockery
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Mockery")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.7719298245614035

Model: RandomForestClassifier
val accuracy 0.7719298245614035

Model: DecisionTreeClassifier
val accuracy 0.6666666666666666

Model: SGDClassifier
val accuracy 0.6140350877192983

Model: KNeighborsClassifier
val accuracy 0.6491228070175439

Model: MultinomialNB
val accuracy 0.7017543859649122



In [119]:
# Sexual harrasment / not Sexual harrasment
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Sexual harrasment")
train_and_evaluate(responses_df_binary_main, model_names)

Model: LogisticRegression
val accuracy 0.8771929824561403

Model: RandomForestClassifier
val accuracy 0.8771929824561403

Model: DecisionTreeClassifier
val accuracy 0.8070175438596491

Model: SGDClassifier
val accuracy 0.8070175438596491

Model: KNeighborsClassifier
val accuracy 0.8596491228070176

Model: MultinomialNB
val accuracy 0.8771929824561403



In [120]:
# Racist / not Racist
responses_df_binary_main = binarize_classes(responses_df_clean, class_chosen="Racist")
train_and_evaluate(responses_df_binary_main)

Model: LogisticRegression
val accuracy 0.9122807017543859

Model: RandomForestClassifier
val accuracy 0.9122807017543859

Model: DecisionTreeClassifier
val accuracy 0.8070175438596491

Model: SGDClassifier
val accuracy 0.6491228070175439

Model: KNeighborsClassifier
val accuracy 0.8771929824561403

Model: MultinomialNB
val accuracy 0.9122807017543859

