# XGBoost Implementation for the Toxic Comment Detector

In [2]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pylab as plb
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [11]:
TRAIN_DATA_PATH = "../../../data/wikipedia/train.csv"
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
comment_col = 'comment_text'
target_names = ['Predicted negative', 'Predicted positive']

In [6]:
train = pd.read_csv(TRAIN_DATA_PATH)

In [7]:
def text_hot_encoding(X):
    label_encoder = LabelEncoder()
    label_encoder = label_encoder.fit(X)
    label_encoded_x = label_encoder.transform(X)
    return label_encoded_x

In [17]:
X = text_hot_encoding(train[comment_col])
accuracy_dict = dict()
seed = 7
test_size = 0.2

for i in range(len(label_cols)):
    Y = train[label_cols[i]]
    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)
    
    # fit model no training data
    model = XGBClassifier()
    model.fit(X_train[:,np.newaxis],y_train)
    
    #testing with X_test data
    #X_test = text_hot_encoding(X_test)
    
    # make predictions for test data
    y_pred = model.predict(X_test[:,np.newaxis])
    
    predictions = [round(value) for value in y_pred]
    
    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    auc = roc_auc_score(y_test, predictions)
    print(classification_report(y_test, predictions, target_names=target_names))
    
    print("\nAccuracy with label", label_cols[i], str(accuracy * 100.0))
    print("\nAUC with label", label_cols[i], str(auc))
    print("-------------------------------------------------------\n")
    
    accuracy_dict[label_cols[i]] = accuracy


  if diff:


                    precision    recall  f1-score   support

Predicted negative       0.91      1.00      0.95     28836
Predicted positive       0.79      0.06      0.12      3079

       avg / total       0.90      0.91      0.87     31915


Accuracy with label toxic 90.79743067523108

AUC with label toxic 0.5306020842645045
-------------------------------------------------------



  if diff:
  'precision', 'predicted', average, warn_for)


                    precision    recall  f1-score   support

Predicted negative       0.99      1.00      0.99     31587
Predicted positive       0.00      0.00      0.00       328

       avg / total       0.98      0.99      0.98     31915


Accuracy with label severe_toxic 98.97227009243302

AUC with label severe_toxic 0.5
-------------------------------------------------------



  if diff:


                    precision    recall  f1-score   support

Predicted negative       0.95      1.00      0.97     30228
Predicted positive       0.98      0.07      0.13      1687

       avg / total       0.95      0.95      0.93     31915


Accuracy with label obscene 95.07441641861195

AUC with label obscene 0.5346438594051343
-------------------------------------------------------



  if diff:
  'precision', 'predicted', average, warn_for)


                    precision    recall  f1-score   support

Predicted negative       1.00      1.00      1.00     31832
Predicted positive       0.00      0.00      0.00        83

       avg / total       0.99      1.00      1.00     31915


Accuracy with label threat 99.73993420021934

AUC with label threat 0.5
-------------------------------------------------------



  if diff:


                    precision    recall  f1-score   support

Predicted negative       0.95      1.00      0.97     30289
Predicted positive       0.81      0.05      0.09      1626

       avg / total       0.94      0.95      0.93     31915


Accuracy with label insult 95.0963496788344

AUC with label insult 0.5242866007847242
-------------------------------------------------------

                    precision    recall  f1-score   support

Predicted negative       0.99      1.00      1.00     31641
Predicted positive       0.00      0.00      0.00       274

       avg / total       0.98      0.99      0.99     31915


Accuracy with label identity_hate 99.1414695284349

AUC with label identity_hate 0.5
-------------------------------------------------------



  if diff:
  'precision', 'predicted', average, warn_for)
