# Comments classification: offensive/non-offensive

1. Import Dataset

In [None]:
import pandas as pd
# Load the labelled comments. 'utf-8-sig' decodes plain UTF-8 and also strips a
# leading byte-order mark, so the first column name is not prefixed with
# '\ufeff' when the CSV was saved with a BOM.
df = pd.read_csv('Dataset/CVMAD_04.csv', encoding='utf-8-sig')
df.head()

In [None]:
import seaborn as sns  # visualization
# Bar chart of how many rows carry each value of the 'decision' label.
sns.countplot(data=df, x='decision')

2. Data Encoding

    Word Embedding

In [None]:
import numpy as np
# Load the pre-trained word vectors into a {word: float32 vector} dictionary.
# Each data line is expected to be "<word> <v1> <v2> ... <vN>".
word_embeddings = {}
# The context manager closes the file even if parsing a line raises.
with open('ma_model_cbow_mix.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Skip blank lines and a possible "<vocab_size> <dim>" header line:
        # neither is a word vector, and a 1-element "vector" would silently
        # broadcast into every sentence sum that contains that token.
        if len(values) <= 2:
            continue
        word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
print('Total %s word vectors in MA CBOW model 300d.' % len(word_embeddings))

In [None]:
def get_sentence_vectors(sentence, embeddings=None, dim=300):
    """Encode a sentence as the average of its word-embedding vectors.

    Parameters
    ----------
    sentence : str or iterable of str
        Raw text (tokenised on whitespace) or an already-tokenised sequence
        of words. ``None`` and float values such as NaN (what pandas yields
        for missing cells) are treated as an empty sentence.
    embeddings : mapping of str -> array, optional
        Word-vector lookup table. Defaults to the notebook-level
        ``word_embeddings`` dictionary.
    dim : int, optional
        Length of the embedding vectors (default 300).

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(dim,)``: the sum of the vectors of all known words
        divided by the total number of tokens (unknown words contribute
        zeros). An all-zero vector is returned for an empty sentence.
    """
    if embeddings is None:
        embeddings = word_embeddings

    # A string must be split into words first; iterating it directly would
    # look up single characters and divide by the character count.
    if isinstance(sentence, str):
        tokens = sentence.split()
    elif sentence is None or isinstance(sentence, float):
        tokens = []
    else:
        tokens = list(sentence)

    sentence_vector = np.zeros((dim,))
    if not tokens:
        return sentence_vector

    for word in tokens:
        vector = embeddings.get(word)
        if vector is not None:
            sentence_vector += vector
    return sentence_vector / len(tokens)

In [None]:
# Replace each comment's text with its averaged embedding vector.
# The cell overwrites its own input column, so it is guarded: running
# get_sentence_vectors again on rows that are already vectors would look up
# floats as words and silently replace every vector with zeros.
already_encoded = df["content"].map(lambda value: isinstance(value, np.ndarray)).all()
if not already_encoded:
    df["content"] = df["content"].apply(get_sentence_vectors)
df.head()

3. Data splitting

In [None]:
# Features are the 'content' column, labels are the 'decision' column.
X, y = df["content"], df["decision"]

In [None]:
""" Scaling and Splitting the dataset into training, validation and test sets """
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set (fixed seed for reproducibility).
# Note: this cell applies no feature scaling and creates no separate
# validation split.
# stratify=y keeps the class proportions of 'decision' the same in the
# training and test parts, so the evaluation is not skewed by an unlucky draw.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=50, stratify=y
)

In [None]:
# Stack the per-row vectors into one 2-D array (rows = samples).
# list(...) works on both the original Series and an already-converted array,
# so re-running this cell does not fail the way .to_list() does on an ndarray.
train_X = np.array(list(train_X))

In [None]:
# Stack the per-row vectors into one 2-D array (rows = samples).
# list(...) works on both the original Series and an already-converted array,
# so re-running this cell does not fail the way .to_list() does on an ndarray.
test_X = np.array(list(test_X))

In [None]:
# Sanity check: display the dimensions of the training array.
train_X.shape

4. Modeling

In [None]:
# Configure the XGBoost classifier that is fitted on the training set.
from xgboost import XGBClassifier

# Trailing "#" values are the alternatives noted in the original cell.
clf = XGBClassifier(
    n_estimators=200,  # 100
    verbosity=0,  # replaces the removed `silent=True` flag (0 = silent)
    max_delta_step=1,
    min_child_weight=1,
    max_depth=5,
    objective='binary:logistic',
    gamma=1,
    learning_rate=0.1,
    colsample_bytree=1,  # 0.3
    subsample=1,
    random_state=10,  # 1; scikit-learn-style name for the former `seed`
    reg_alpha=0.1,  # 0
    reg_lambda=0.1,  # 1
    scale_pos_weight=1,  # 0.8
)

In [None]:
# Train the classifier on the training features and labels.
clf.fit(train_X, train_y)

In [None]:
# Predict labels for the held-out test features.
pred_y=clf.predict(test_X)

In [None]:
# Confusion matrix of true vs. predicted test labels.
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_true=test_y, y_pred=pred_y)
cnf_matrix

In [None]:
from sklearn.metrics import classification_report
# Text report of the per-class metrics for the test predictions.
print(classification_report(y_true=test_y, y_pred=pred_y))

5. Hyperparameter tuning (grid search with 5-fold cross-validation)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values tried for each hyper-parameter (every combination is fitted).
param_grid = dict(
    n_estimators=[100, 200, 300],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.05],
)

# Exhaustive search over param_grid with 5-fold cross-validation on 10 workers.
grid = GridSearchCV(clf, param_grid, n_jobs=10, cv=5, verbose=True)

# Run the search on the training data.
grid.fit(train_X, train_y)

In [None]:
# Show the winning parameter combination, then the estimator configured with it.
for tuned_result in (grid.best_params_, grid.best_estimator_):
    print(tuned_result)

In [None]:
# Predict the test set through the fitted grid search and report the metrics.
grid_predictions = grid.predict(test_X)
print(classification_report(y_true=test_y, y_pred=grid_predictions))