# Import Libraries

In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [33]:
# Reading the data
data = pd.read_excel('../data/responses_data.xlsx')

In [34]:
# Checking the head of the data
data.head()

Unnamed: 0,description,answer_category_num,question_id_id
0,ูุฒูุฑ ุงูุฎุงุฑุฌูุฉ ุงููุจูุงูู ุฌุจุฑุงู ุจุงุณูู ูุงู ูู ุณูุณู...,Religious affiliation,1
1,ูุฒูุฑ ุงูุฎุงุฑุฌูุฉ ุงููุจูุงูู ุฌุจุฑุงู ุจุงุณูู ูุงู ูู ุณูุณู...,Religious affiliation,1
2,ูุฒูุฑ ุงูุฎุงุฑุฌูุฉ ุงููุจูุงูู ุฌุจุฑุงู ุจุงุณูู ูุงู ูู ุณูุณู...,Religious affiliation,1
3,ูุฒูุฑ ุงูุฎุงุฑุฌูุฉ ุงููุจูุงูู ุฌุจุฑุงู ุจุงุณูู ูุงู ูู ุณูุณู...,Racist,1
4,ุณูุฑูุฉ ุจูุฏ ุงูุญุถุงุฑุงุช ุชุฑุจุทูุง ุจุนููุฉ ุงู ุจุญููุงู,Violent,2


In [35]:
# The shape of the data
data.shape

(4825, 3)

In [36]:
data.description = data.description.apply(str.strip)

In [37]:
# The columns of the data
data.columns

Index(['description', 'answer_category_num', 'question_id_id'], dtype='object')

In [38]:
data.description.duplicated().sum()

1737

In [39]:
clean_data = data.copy()
clean_data  = clean_data[['description', 'answer_category_num']]
clean_data.head(2)

Unnamed: 0,description,answer_category_num
0,ูุฒูุฑ ุงูุฎุงุฑุฌูุฉ ุงููุจูุงูู ุฌุจุฑุงู ุจุงุณูู ูุงู ูู ุณูุณู...,Religious affiliation
1,ูุฒูุฑ ุงูุฎุงุฑุฌูุฉ ุงููุจูุงูู ุฌุจุฑุงู ุจุงุณูู ูุงู ูู ุณูุณู...,Religious affiliation


In [40]:
clean_data.description.value_counts()

ููุง ุงูุนุงูุฑุฉ ุชุงุถุฑ ุจุงูุนูุฉ                                                      9
ููุงูู ุตุญูุญ                                                                   9
ูู ููููู ุงูุดูุนุฉ ูููุช ู ุงูุจู ูุงููุฏุฏ                                           8
ูุง ุฎุตู ูุง ุฏุฎูู ุงูุฑููุน                                                        7
ูุจูู ุงุฏู ุงูุช ูุนูู ููุณุฎ ูุธู ุญุงูู ููุณุงูู ุงูุถู ูู ุงูู ุชูุดุฑ ูุณุงุฎุชู ุนูุฏ ุงููุธุงู    7
                                                                            ..
ูุง ุจุณ ุงููุทู ุจุฏู ุชูููุณ ูู ู ูุชูู                                              1
ูุฌุจ ุฃู ุชุนูู ุตุญูุญ                                                             1
ุชุญูุงุชู ููุดุนุจ ุงูุนุฑูู ูู ุงูุฃุฑุฏู ๐ฏ๐ด๐ต๐ธ                                           1
ุงุฑุฏูุบุงู ุงูู ููุง                                                      

In [41]:
sum(clean_data.description.value_counts() > 1)

1138

In [42]:
# removing duplicated descrptions/ comments
clean_data.drop_duplicates(subset='description', inplace=True)
clean_data.reset_index(drop=True, inplace=True)

In [43]:
clean_data

Unnamed: 0,description,answer_category_num
0,ูุฒูุฑ ุงูุฎุงุฑุฌูุฉ ุงููุจูุงูู ุฌุจุฑุงู ุจุงุณูู ูุงู ูู ุณูุณู...,Religious affiliation
1,ุณูุฑูุฉ ุจูุฏ ุงูุญุถุงุฑุงุช ุชุฑุจุทูุง ุจุนููุฉ ุงู ุจุญููุงู,Violent
2,ุชูุชููู ูุณุงู ุงูุญุณู ูุชุชุฑุญููุนููุฉ ูู ุฃู ุฃุตูุงู ุงููุฎ...,Racist
3,ูุนู ุฎุจุฑ ุงูู ุจูุฏุฉ ูุทุฑ ูุชู ูุง ุณููุชุง ูุณุงุญุชูุง ุงูุจุฑ...,Normal
4,ููุงูุงูู ููุช ุงูููุณู ุงููู ุทุงู ูุงู ููุง ุจุณ ูุชุญูุณ ุญ...,Normal
...,...,...
3083,ูู ูุจุงุฑุญ ุนู ุชุชูุฌ ุจูุงุฑูุณ ูู ููู ุฌุงุจุช ุงูุดูุณ,Normal
3084,ุงูุนูู ุจููุจู ุดู ููุถูู,Normal
3085,ูุฏููุฉ,Mockery
3086,ุงููู ูุญููู ูุง ุจุทู,Normal


In [44]:
# checking how many comments have different votes
for i, row in clean_data.iterrows():

    comment = row.description

    # getting the dataframe for that comment
    temp_df = data[data.description == comment]
    
    # how many unique answers that comment has
    n_ = temp_df.answer_category_num.nunique()

    # changing the answer_category_num to the most frequent one
    if n_ >= 2:
        most_voted_label = temp_df.answer_category_num.value_counts().index[0]
        clean_data.at[i, 'answer_category_num'] = most_voted_label

In [45]:
clean_data

Unnamed: 0,description,answer_category_num
0,ูุฒูุฑ ุงูุฎุงุฑุฌูุฉ ุงููุจูุงูู ุฌุจุฑุงู ุจุงุณูู ูุงู ูู ุณูุณู...,Religious affiliation
1,ุณูุฑูุฉ ุจูุฏ ุงูุญุถุงุฑุงุช ุชุฑุจุทูุง ุจุนููุฉ ุงู ุจุญููุงู,Violent
2,ุชูุชููู ูุณุงู ุงูุญุณู ูุชุชุฑุญููุนููุฉ ูู ุฃู ุฃุตูุงู ุงููุฎ...,Racist
3,ูุนู ุฎุจุฑ ุงูู ุจูุฏุฉ ูุทุฑ ูุชู ูุง ุณููุชุง ูุณุงุญุชูุง ุงูุจุฑ...,Normal
4,ููุงูุงูู ููุช ุงูููุณู ุงููู ุทุงู ูุงู ููุง ุจุณ ูุชุญูุณ ุญ...,Normal
...,...,...
3083,ูู ูุจุงุฑุญ ุนู ุชุชูุฌ ุจูุงุฑูุณ ูู ููู ุฌุงุจุช ุงูุดูุณ,Normal
3084,ุงูุนูู ุจููุจู ุดู ููุถูู,Normal
3085,ูุฏููุฉ,Mockery
3086,ุงููู ูุญููู ูุง ุจุทู,Normal


In [46]:
# Generated Class
clean_data.answer_category_num.value_counts()

Normal                   776
Mockery                  755
Violent                  571
Religious affiliation    368
Racist                   323
Sexual harrasment        295
Name: answer_category_num, dtype: int64

In [47]:
# Checking the info
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3088 entries, 0 to 3087
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   description          3088 non-null   object
 1   answer_category_num  3088 non-null   object
dtypes: object(2)
memory usage: 48.4+ KB


In [48]:
# Removing stop words for each tweet
clean_data['tweet_no_stopwords'] = 'x'
stop_words = set(stopwords.words('arabic')) 
for count, tweet in enumerate(clean_data.description):
    word_tokens = word_tokenize(tweet)
    filtered_tweet = []
    for word in word_tokens:
        if word not in stop_words:
            filtered_tweet.append(word)
    joined_filtered_tweet = " ".join(filtered_tweet)
    clean_data.tweet_no_stopwords[count] = joined_filtered_tweet

In [49]:
clean_data.head(2)

Unnamed: 0,description,answer_category_num,tweet_no_stopwords
0,ูุฒูุฑ ุงูุฎุงุฑุฌูุฉ ุงููุจูุงูู ุฌุจุฑุงู ุจุงุณูู ูุงู ูู ุณูุณู...,Religious affiliation,ูุฒูุฑ ุงูุฎุงุฑุฌูุฉ ุงููุจูุงูู ุฌุจุฑุงู ุจุงุณูู ูุงู ุณูุณูุฉ ุช...
1,ุณูุฑูุฉ ุจูุฏ ุงูุญุถุงุฑุงุช ุชุฑุจุทูุง ุจุนููุฉ ุงู ุจุญููุงู,Violent,ุณูุฑูุฉ ุจูุฏ ุงูุญุถุงุฑุงุช ุชุฑุจุทูุง ุจุนููุฉ ุงู ุจุญููุงู


# Training the Model

In [50]:
# y is Class which is dependent on X Tweet
X = clean_data['tweet_no_stopwords']
y = clean_data['answer_category_num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [51]:
y_train.shape, y_test.shape

((2470,), (618,))

In [52]:
y_train.value_counts()

Normal                   628
Mockery                  607
Violent                  453
Religious affiliation    272
Racist                   271
Sexual harrasment        239
Name: answer_category_num, dtype: int64

In [53]:
y_test.value_counts()

Mockery                  148
Normal                   148
Violent                  118
Religious affiliation     96
Sexual harrasment         56
Racist                    52
Name: answer_category_num, dtype: int64

In [54]:
def show_results(y_true, y_pred):
    print("Confusion Matrix")
    print(metrics.confusion_matrix(y_true, y_pred))

    print("Classification Report")
    print(metrics.classification_report(y_true, y_pred)) 

In [55]:
# Using pipiles for machine learning flow

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),])


text_clf.fit(X_train, y_train)  


predictions = text_clf.predict(X_test)

In [56]:
show_results(y_test, predictions)

Confusion Matrix
[[49 45 13  5 10 26]
 [37 61 13 21  5 11]
 [12 11  4  5  5 15]
 [12 45  7  9  6 17]
 [13 18  4  2  2 17]
 [39 20  6 13  6 34]]
Classification Report
                       precision    recall  f1-score   support

              Mockery       0.30      0.33      0.32       148
               Normal       0.30      0.41      0.35       148
               Racist       0.09      0.08      0.08        52
Religious affiliation       0.16      0.09      0.12        96
    Sexual harrasment       0.06      0.04      0.04        56
              Violent       0.28      0.29      0.29       118

             accuracy                           0.26       618
            macro avg       0.20      0.21      0.20       618
         weighted avg       0.24      0.26      0.24       618



In [57]:
# Using pipiles for machine learning flow
from sklearn.naive_bayes import MultinomialNB
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),])


text_clf.fit(X_train, y_train)  


predictions = text_clf.predict(X_test)

In [58]:
show_results(y_test, predictions)

Confusion Matrix
[[ 70  64   0   0   0  14]
 [ 44 102   0   0   0   2]
 [ 23  23   0   0   0   6]
 [ 21  65   0   0   0  10]
 [ 21  26   0   0   0   9]
 [ 69  32   0   0   0  17]]
Classification Report
                       precision    recall  f1-score   support

              Mockery       0.28      0.47      0.35       148
               Normal       0.33      0.69      0.44       148
               Racist       0.00      0.00      0.00        52
Religious affiliation       0.00      0.00      0.00        96
    Sexual harrasment       0.00      0.00      0.00        56
              Violent       0.29      0.14      0.19       118

             accuracy                           0.31       618
            macro avg       0.15      0.22      0.17       618
         weighted avg       0.20      0.31      0.23       618



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
tst =  "ุงูุช  ูุงุญุฏ ุญููุฑ ููุง ุจุชุณุชุญู ุง"

In [60]:
text_clf.predict([tst])[0]

'Violent'

In [61]:
wrong = 0
mock_viol = 0
correct_classified = 0

mock_viol_list = ['Violent', 'Mockery']

for count, test_tweet in enumerate(X_test):
    
    y_test_answer = y_test.values[count]
    y_test_prediction = predictions[count]

    # getting the correct classified comments
    if y_test_answer == y_test_prediction:
        correct_classified += 1

    if y_test_answer != y_test_prediction:
        print("Tweet: {}".format(test_tweet))
        print("Real Prediction: {} ||  Model Prediction: {}\n".format(y_test_answer, y_test_prediction))

        if y_test_answer in mock_viol_list and y_test_prediction in mock_viol_list:
            mock_viol += 1
        wrong += 1

print("Correct Classified {}".format(correct_classified))        
print("Misclassified {}".format(wrong))
mock_viol

Tweet: ูุช ุจุบูุถู
Real Prediction: Mockery ||  Model Prediction: Normal

Tweet: ุขุฎ ุฌุจุฑุงู ุจุงุณูู ุดู ูุงูุฑ ุนุงูู ุงูุช ูุนูู ููุง ูุฌุฑ
Real Prediction: Religious affiliation ||  Model Prediction: Violent

Tweet: ุงููู ูููููุง
Real Prediction: Religious affiliation ||  Model Prediction: Normal

Tweet: ุฑุฏูุฏ ุงูุดุนุจ ุงููุจูุงูู ููุงูุฉ ุฑุฆูุณ ุงูุญุฒุจ ุชูุญูุฏ ุงูุนุฑุจ ูุงู ุงูุช ูุนุฑูู ูุฐูุจู ูุฑู ููุฑู ุถุฏ ููู
Real Prediction: Normal ||  Model Prediction: Mockery

Tweet: ุงูู ุจุงุณูู ุงุฑุชูุจ ุฌุฑููุฉ ุงู ุญูู ุดู ุจุฑุงุช ุงูุณูุงู
Real Prediction: Violent ||  Model Prediction: Mockery

Tweet: ุณูุจูู ูุนูู ูุชุญููู ููุทู ุงูุนุฏุงูุฉ ูุฌููุน ุฃุจูุงุก ุงูุทุงุฆูุฉ ุงููุนุฑูููุฉ
Real Prediction: Racist ||  Model Prediction: Normal

Tweet: ุฌูุด ุงุจู ุดุญุงุทุฉ ุงููุตูุฑู ุงููุฐุฑ ุงูุฌูุด ุงููุญูุฏ ุจุงูุนุงูู ููู ุจููุชู ุดุนุจู ูุจูู ุง

83

In [71]:
X_test.iloc[0]

1236    ุดู ุชูุจุฑููุ ููุจุฑูููู ุงูุฏูุนุฉ ุงูุฅูุงุฑุงุชูุฉุ ุจูู ุชุณุช...
1062                                             ูุช ุจุบูุถู
2530         ุขุฎ ุฌุจุฑุงู ุจุงุณูู ุดู ูุงูุฑ ุนุงูู ุงูุช ูุนูู ููุง ูุฌุฑ
619                                            ุงูุช ุตุฑูุงูู
773                                           ุงููู ูููููุง
                              ...                        
299                            ุฌุจุฑุงู ุจุงุณูู ุณููู ุฌุงุจ ููุฑุจุง
2000                                     ููุงูู ุตุญูุญ ููุงุณู
2432                ููุฉ ุงูููุงุญุฉ ุนูุฏูุง ุชุญุงุตุฑ ุงูุนุงูุฑุฉ ุงูุนูุฉ
1539                     ุญูุง ุงููู ูุจูุงู ูุดุนุจ ูุจูุงู ุงูุนุธูู
705     ูุนูู ุจูู ูุฒูุฑ ุงููุตุงูุญุฉ ุนูู ุญูุฏุฑ ุจุงุน ููุงุฏูุง ุงู ...
Name: tweet_no_stopwords, Length: 618, dtype: object

# Getting questions id's for correctly classified questions by model

In [86]:
list_of_correct_ids = []

for i, test_tweet in enumerate(X_test):
    y_test_answer = y_test.values[i]
    y_test_prediction = predictions[i]

    # getting the correct classified comments
    if y_test_answer == y_test_prediction:
        # getting the original tweet before doing any preprocessing ex BOW removal
        description = clean_data[clean_data['tweet_no_stopwords'] == X_test.iloc[i]]['description'].values[0]

        # getting the question id
        question_id_id = data[data['description'] == description]['question_id_id'].values[0]

        list_of_correct_ids.append(question_id_id)

In [87]:
list_of_correct_ids

[2832,
 1076,
 1212,
 1222,
 4896,
 3562,
 3575,
 2617,
 7852,
 809,
 775,
 4376,
 5035,
 2372,
 2049,
 2835,
 853,
 3450,
 489,
 5063,
 713,
 4806,
 4481,
 4200,
 6033,
 5011,
 3460,
 3968,
 4462,
 7892,
 5334,
 1801,
 8430,
 6906,
 7241,
 3064,
 2465,
 7515,
 4935,
 4224,
 2204,
 3314,
 7426,
 7971,
 6994,
 7976,
 5185,
 5927,
 3584,
 2311,
 4859,
 4663,
 8354,
 5041,
 2456,
 2642,
 4181,
 1270,
 1785,
 7156,
 8832,
 8613,
 3605,
 5076,
 2463,
 8297,
 5203,
 4395,
 4616,
 8237,
 4356,
 3747,
 5521,
 7713,
 4522,
 2093,
 4388,
 4627,
 6037,
 2245,
 3656,
 1788,
 5676,
 3393,
 1703,
 8321,
 8006,
 3782,
 6705,
 7593,
 8769,
 7310,
 4087,
 4839,
 6216,
 1185,
 1932,
 585,
 3138,
 6601,
 65,
 7816,
 7407,
 5922,
 5488,
 4164,
 8002,
 3135,
 4892,
 931,
 6537,
 268,
 2580,
 2254,
 4947,
 1906,
 6920,
 3927,
 8123,
 8232,
 6785,
 1608,
 3699,
 1720,
 7296,
 7576,
 5363,
 7727,
 3601,
 3517,
 4247,
 4503,
 408,
 3604,
 4708,
 3799,
 2031,
 923,
 1853,
 5647,
 6938,
 8885,
 692,
 8044,
 3882

# Saving the Model

In [27]:
import pickle
# save the model to disk
filename = 'finalized_model_SVC.sav'
pickle.dump(text_clf, open(filename, 'wb'))

In [74]:
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.8393162393162393
