# Comments classification: offensive/non-offensive

1. Import Dataset

In [1]:
import pandas as pd
df = pd.read_csv('Dataset/CVMAD_04.csv', encoding='UTF-8')### if it doesn't work try 'utf-8-sig'
df.head()

Unnamed: 0,content,decision
0,یخ علی الخماح مخسر للسیده وجها مازال کتهضرشوهت...,1
1,یاک مطلقها علاش کیحاسبها داب,1
2,یاخوفي من بدعة التفاخر بالقتل والتشرمیل والمعص...,1
3,يوم يكون الصوت سعره يتحدى 30درهم هنا سيصبح الت...,0
4,يوسف زروالي هو لقتقصدك لدخلو معاه جميعة شباب ا...,0


In [None]:
import seaborn as sns # visulization
sns.countplot(x=df['decision'])

2. Data pre-processing

In [3]:
import re
def removeWeirdChars(text):
    weirdPatterns = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u'\U00010000-\U0010ffff'
                               u"\u200d"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               u"\ufe0f"
                               u"\u2069"
                               u"\u2066"
                               u"\u200c"
                               u"\u2068"
                               u"\u2067"
                               "]+", flags=re.UNICODE)
    return weirdPatterns.sub(r'', text)

In [4]:
df.content = [removeWeirdChars(d) for d in df.content]

In [4]:
df.content[0:10]

0    یخ علی الخماح مخسر للسیده وجها مازال کتهضرشوهت...
1                         یاک مطلقها علاش کیحاسبها داب
2    یاخوفي من بدعة التفاخر بالقتل والتشرمیل والمعص...
3    يوم يكون الصوت سعره يتحدى 30درهم هنا سيصبح الت...
4    يوسف زروالي هو لقتقصدك لدخلو معاه جميعة شباب ا...
5                                             يوسف رجل
6        يوسف الزروالي هو لي دخلو لي جمعية شباب الملكي
7                        يوزع شكون فيق سكيزو من السبات
8      يوتيب تسبب ليا بمرض نفسي وليت كنخاف ومرضت نفسيا
9    يهدر واش تتوقع منو وجاية ملمريخ نتا واش قودك ي...
Name: content, dtype: object

3. Data Encoding

Word Embedding

In [6]:
import numpy as np
word_embeddings = {}
f = open('ma_model_cbow_mix.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()
print('Total %s word vectors in MA CBOW model 300d.' % len(word_embeddings))

Total 82906 word vectors in MA CBOW model 300d.


In [7]:
def get_sentence_vectors(sentence):
    sentence_vector = np.zeros((300,))
    if len(sentence)==0:
        return sentence_vector
    for word in sentence:
        if word in word_embeddings:
            sentence_vector += word_embeddings[word]
    sentence_vector = sentence_vector/len(sentence)
    return sentence_vector

In [8]:
len(word_embeddings)

82906

In [9]:
word_embeddings['علاش']

array([-0.04573694,  0.10975436, -0.54744524,  0.10996349,  0.20633675,
        0.90674895,  0.11491557,  0.72445756,  0.58139473, -0.2946483 ,
       -0.7712647 , -0.17335273,  0.33888417, -0.20815212,  0.46175632,
        0.38756797,  0.28463918, -0.03309639, -0.08293769, -0.13326049,
       -0.58587915, -0.48430493, -0.16314602, -0.38404787,  0.08127248,
       -0.8705088 ,  0.33157548,  0.3544283 ,  0.5581337 ,  0.08885915,
       -1.1427025 ,  0.92506343, -0.09592154,  0.5934387 ,  0.3629863 ,
       -0.21132165,  0.30229187,  0.16288331, -0.31264427, -0.04881828,
       -0.08142379, -1.4429231 , -0.66444516, -1.4426838 , -1.0821352 ,
       -0.05486058,  0.8075012 ,  0.6802713 ,  1.0687793 , -1.7807748 ,
       -1.0383372 ,  1.5321866 , -0.8323125 ,  0.3871929 , -0.6774203 ,
        1.2722178 ,  0.6946758 , -0.8461339 ,  1.0099423 ,  0.37826127,
        0.580605  , -0.18887214, -0.8269939 ,  0.958509  ,  0.4126865 ,
       -2.1811097 ,  1.2467979 , -0.53286076, -1.0175996 , -1.90

In [9]:
df["content"] = df["content"].apply(get_sentence_vectors)
df.head()

Unnamed: 0,content,decision
0,"[0.03822386347585254, 0.030719361233490484, -0...",1
1,"[-0.016081052499690225, -0.011788641102612019,...",1
2,"[0.014637194904140555, 0.01521033041895582, -0...",1
3,"[0.027888019090971432, 0.02286620584929043, -0...",0
4,"[0.04122190000306742, 0.028468514847404817, -0...",0


4. Modeling

5. Data splitting

In [17]:
X = df.content
y = df.decision

In [18]:
X[0:3]

0    [0.03822386347585254, 0.030719361233490484, -0...
1    [-0.016081052499690225, -0.011788641102612019,...
2    [0.014637194904140555, 0.01521033041895582, -0...
Name: content, dtype: object

In [20]:
X.shape

(23000,)

In [13]:
y[0:3]

0    1
1    1
2    1
Name: decision, dtype: int64

In [26]:
""" Sckaling and Splitting the dataset into training, validation and test sets """
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#scaler = StandardScaler()
#X_sacaled=scaler.fit_transform(X)

train_X, test_X, train_y, test_y = train_test_split(X.to_list(), y.to_list(), random_state=50, test_size=0.3)

In [35]:
from sklearn import svm
clf = svm.SVC(C=1000, gamma=0.1, kernel='rbf')
clf.fit(train_X, train_y)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [36]:
pred_y=clf.predict(test_X)

In [37]:
# import the metrics class
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(test_y, pred_y)
cnf_matrix

array([[1618, 1807],
       [ 911, 2564]], dtype=int64)

In [38]:
from sklearn.metrics import classification_report
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

           0       0.64      0.47      0.54      3425
           1       0.59      0.74      0.65      3475

   micro avg       0.61      0.61      0.61      6900
   macro avg       0.61      0.61      0.60      6900
weighted avg       0.61      0.61      0.60      6900



In [32]:
from sklearn.model_selection import GridSearchCV
  
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3, cv=5)
  
# fitting the model for grid search
grid.fit(train_X, train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.5694056269796907, total= 1.3min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min remaining:    0.0s


[CV]  C=0.1, gamma=1, kernel=rbf, score=0.5600894354387926, total= 1.3min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.8min remaining:    0.0s


[CV]  C=0.1, gamma=1, kernel=rbf, score=0.5687663063734626, total= 1.1min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.5526364822060742, total= 1.1min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.5438792621576299, total= 1.1min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.5415579575102497, total= 1.1min
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.5220793739519285, total= 1.1min
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.5204024594745668, total= 1.1min
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.5175177040626164, total= 1.1min
[CV] C=0.1, gamma=0.001, kernel=rbf .....

[CV]  C=100, gamma=0.0001, kernel=rbf, score=0.5347493944475499, total= 1.1min
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV]  C=100, gamma=0.0001, kernel=rbf, score=0.5307491613865076, total= 1.1min
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV]  C=1000, gamma=1, kernel=rbf, score=0.5992174399105645, total= 2.6min
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV]  C=1000, gamma=1, kernel=rbf, score=0.5882243338923048, total= 2.5min
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV]  C=1000, gamma=1, kernel=rbf, score=0.603056280283265, total= 2.5min
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV]  C=1000, gamma=0.1, kernel=rbf, score=0.6159865846841811, total= 1.5min
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV]  C=1000, gamma=0.1, kernel=rbf, score=0.6189677659772685, total= 1.5min
[CV] C=1000, gamma=0.1, kernel=rbf ...

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 134.4min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [33]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [39]:
grid_predictions = grid.predict(test_X)
  
# print classification report
print(classification_report(test_y, grid_predictions))

              precision    recall  f1-score   support

           0       0.64      0.47      0.54      3425
           1       0.59      0.74      0.65      3475

   micro avg       0.61      0.61      0.61      6900
   macro avg       0.61      0.61      0.60      6900
weighted avg       0.61      0.61      0.60      6900

