# Abusive Comment classifier

In [1]:
import pickle
import string
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [33]:
# Preview the first ten rows of the training frame.
train_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,is_abusive
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,1
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,0


In [3]:
# Take the columns at positions 2..7 as the multi-label target frame.
# NOTE(review): positional slice -- presumably the six label columns
# 'toxic'..'identity_hate'; verify against the CSV column order.
train_data_ALL_CLASSES=train_data.iloc[:,2:8]

In [4]:
# Names of the six per-category label columns.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [5]:
# Flag a comment as abusive (1) when any of the six category labels is set.
# The row-wise max is restricted to label_cols: reducing over the whole frame
# would also compare the string columns ('id', 'comment_text'), which raises
# a TypeError on current pandas, and would fold 'is_abusive' into itself when
# the cell is re-run.
train_data['is_abusive'] = train_data[label_cols].max(axis=1)

In [8]:
# Number of rows (comments) in the training frame.
len(train_data)

159571

In [9]:
# Count the comments flagged with each category and print one line per category.
category_titles = (
    ('Toxic', 'toxic'),
    ('Severe Toxic', 'severe_toxic'),
    ('Obscene', 'obscene'),
    ('Threat', 'threat'),
    ('Insult', 'insult'),
    ('Identity Hate', 'identity_hate'),
)
print("\n".join(
    "{}: {}".format(title, len(train_data[train_data[column] == 1]))
    for title, column in category_titles
))

Toxic: 15294
Severe Toxic: 1595
Obscene: 8449
Threat: 478
Insult: 7877
Identity Hate: 1405


In [10]:
# Report how many comments have is_abusive == 0 and how many have is_abusive == 1.
abusive_flag = train_data['is_abusive']
print("Total Relevant comments = {}".format((abusive_flag == 0).sum()))
print("Total Irrelevant comments = {}".format((abusive_flag == 1).sum()))

Total Relevant comments = 143346
Total Irrelevant comments = 16225


## Cleaning the data and using TF-IDF

Removing punctuation, digits and some other symbols

In [6]:
# Extra symbols to strip from comments, in addition to string.punctuation and string.digits.
del_symbols = ['«', '»', '®', '´', '·','º', '½', '¾', '¿', '¡', '§', '£', '₤']

In [7]:
def remove_punc(txt):
    """Return ``txt`` with punctuation, digits and extra symbols removed.

    Every character found in ``string.punctuation``, ``string.digits`` or the
    module-level ``del_symbols`` list is dropped; all other characters are
    kept in their original order.

    Parameters
    ----------
    txt : str
        Raw comment text.

    Returns
    -------
    str
        The filtered text.
    """
    # One str.translate pass replaces the per-character Python loop with its
    # three linear membership scans; this matters because the function is
    # applied to every comment in both data frames.
    unwanted = string.punctuation + string.digits + ''.join(del_symbols)
    return txt.translate(str.maketrans('', '', unwanted))

**Removing punctuation**

In [9]:
# Do not run this cell in normal use: it applies remove_punc to every comment
# in both frames and overwrites 'comment_text' in place.
# Load the pickled, already-cleaned frames instead.
train_data['comment_text'] = train_data['comment_text'].apply(remove_punc)
test_data['comment_text'] = test_data['comment_text'].apply(remove_punc)

KeyboardInterrupt: 

Making **Pickle files**

In [8]:
#pickle.dump(train_data,open('train_data_after_removing_punc_pickle.pkl','wb'))
#pickle.dump(test_data,open('test_data_after_removing_punc_pickle.pkl','wb'))

**Loading pickle files**

In [8]:
# Load the cleaned frames cached by an earlier run.
# Context managers close the file handles; the bare open() calls left them open.
# NOTE: pickle can execute arbitrary code on load -- only open files you produced yourself.
with open('train_data_after_removing_punc_pickle.pkl', 'rb') as train_pickle:
    train_data_wthout_punc = pickle.load(train_pickle)
with open('test_data_after_removing_punc_pickle.pkl', 'rb') as test_pickle:
    test_data_wthout_punc = pickle.load(test_pickle)

Applying the **TF-IDF algorithm** to the comments, i.e. converting ***words to vectors***

In [9]:
n = train_data_wthout_punc.shape[0]  # number of rows in the cleaned training frame
# Unigram + bigram TF-IDF features. Terms must appear in at least 3 documents
# and in at most 90% of them.
# use_idf, smooth_idf and sublinear_tf are boolean switches, so they are passed
# as True rather than as numbers, which scikit-learn's parameter validation rejects.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9,
                             strip_accents='unicode', use_idf=True,
                             smooth_idf=True, sublinear_tf=True)

In [24]:
# Expensive: learns the vocabulary from the training comments and vectorises both splits.
# Normally skip this cell and load the pickled artefacts; re-run it only if
# those pickles are missing or give wrong results.

# fit_transform learns the vocabulary and vectorises the training set in one pass.
train_data_sparse = vectorizer.fit_transform(train_data_wthout_punc['comment_text'])
tfidf = vectorizer  # the fitted vectorizer (fit() returned this same object)
# Vectorise the *test* comments; previously the training comments were
# transformed a second time here. This matrix has one row per test comment,
# so it must not be scored against the training labels.
test_data_sparse = vectorizer.transform(test_data_wthout_punc['comment_text'])

# we got a sparse matrix of vectors

#### Making pickle

In [25]:
#pickle.dump(tfidf,open('tfidf_V','wb'))
#pickle.dump(train_data_sparse,open('train_data_in_sparse_matrix.pkl','wb'))
#pickle.dump(test_data_sparse,open('test_data_in_sparse_matrix.pkl','wb'))

#### loading pickle

In [10]:
# Load the fitted vectorizer and the pre-computed sparse matrices.
# Context managers close the file handles; the bare open() calls left them open.
# NOTE: pickle can execute arbitrary code on load -- only open files you produced yourself.
with open('tfidf_V', 'rb') as vectorizer_pickle:
    tfidf_V = pickle.load(vectorizer_pickle)
with open('train_data_in_sparse_matrix.pkl', 'rb') as train_matrix_pickle:
    train_data_in_sparse_matrix = pickle.load(train_matrix_pickle)
with open('test_data_in_sparse_matrix.pkl', 'rb') as test_matrix_pickle:
    test_data_in_sparse_matrix = pickle.load(test_matrix_pickle)

## Making model with Naive Bayes

### ==>> Binary Classification (abusive vs. not abusive)

In [260]:
from sklearn.naive_bayes import MultinomialNB

In [261]:
# Fit a multinomial naive Bayes model on the TF-IDF training matrix,
# using the binary 'is_abusive' column as the target.
classifier = MultinomialNB()
classifier.fit(train_data_in_sparse_matrix,train_data['is_abusive'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [262]:
# Predict on the training matrix: these predictions are compared against
# train_data['is_abusive'] below, so they must have one row per training
# comment. The test matrix has no matching labels to score against.
y_pred = classifier.predict(train_data_in_sparse_matrix)

In [263]:
# Class balance of the ground-truth 'is_abusive' labels.
for label_value in (1, 0):
    print('Total True {} labels = {}'.format(label_value, sum(train_data['is_abusive'] == label_value)))

Total True 1 labels = 16225
Total True 0 labels = 143346


In [264]:
# Class balance of the predicted labels.
for label_value in (1, 0):
    print('Total Pred {} labels = {}'.format(label_value, sum(y_pred == label_value)))

Total Pred 1 labels = 4704
Total Pred 0 labels = 154867


In [265]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of y_pred against the training 'is_abusive' labels.
print(classification_report(train_data['is_abusive'], y_pred ))

             precision    recall  f1-score   support

          0       0.93      1.00      0.96    143346
          1       0.99      0.29      0.45     16225

avg / total       0.93      0.93      0.91    159571



#### Prediction

In [266]:
# Classify one comment typed by the user as fine / abusive.
user_input = input()
user_input = remove_punc(user_input)
user_input = pd.Series(user_input)
# Use the fitted vectorizer loaded from disk. The bare `vectorizer` is only
# fitted when the optional fit cell has been run, and the classifier was
# trained on the pickled matrix.
user_input = tfidf_V.transform(user_input)
result = classifier.predict(user_input)
# predict() returns a one-element array; test its single entry explicitly.
if result[0] == 0:
    print("Comment is fine")
else:
    print("Comment can't be posted due to inappropriate language")

hello
Comment is fine


### ==>> Multi-Label Classification

In [11]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

In [12]:
# Wrap MultinomialNB in a one-vs-rest classifier and fit it on the TF-IDF
# training matrix, with the label frame train_data_ALL_CLASSES as the target.
one_vs_rest_classifier= OneVsRestClassifier(MultinomialNB())
one_vs_rest_classifier.fit(train_data_in_sparse_matrix,train_data_ALL_CLASSES)

OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          n_jobs=1)

In [13]:
# Predict the labels for the same matrix the model was fitted on.
predict = one_vs_rest_classifier.predict(train_data_in_sparse_matrix)

In [14]:
from sklearn.metrics import classification_report
# Per-label precision / recall / F1 of the predictions against train_data_ALL_CLASSES.
print(classification_report(train_data_ALL_CLASSES, predict))

             precision    recall  f1-score   support

          0       0.99      0.27      0.43     15294
          1       1.00      0.01      0.01      1595
          2       0.98      0.22      0.36      8449
          3       0.00      0.00      0.00       478
          4       0.95      0.14      0.25      7877
          5       0.00      0.00      0.00      1405

avg / total       0.93      0.20      0.33     35098



  'precision', 'predicted', average, warn_for)


#### Prediction

In [37]:
# Read one comment, clean it and vectorise it with the loaded TF-IDF model.
user_input = tfidf_V.transform(pd.Series(remove_punc(input())))
result = one_vs_rest_classifier.predict(user_input)
#print(result[0],'\n')
print('--------------RESULT---------------')
# Count the categories predicted as 0; six zeros means no category fired.
count = sum(1 for flag in result[0] if flag == 0)
if count == 6:
    print("comment is fine")

# Print the display name of every category predicted as 1, in column order.
category_names = ('Toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
for i, category_name in enumerate(category_names):
    if result[0][i] == 1:
        print(category_name)




bastard
--------------RESULT---------------
Toxic
obscene


# =>TKinter for GUI

In [15]:
from tkinter import *
from tkinter import ttk

In [38]:
def GUI_function(event1):
    """Classify the text in the ``comment`` entry and show the verdict in ``result``.

    Bound as a Tk event handler; ``event1`` is the event object and is not used.
    Reads the module-level ``comment`` and ``result`` Entry widgets, the loaded
    ``tfidf_V`` vectorizer and the fitted ``classifier``.
    """
    user_input = comment.get()

    user_input = remove_punc(user_input)
    user_input = pd.Series(user_input)
    # Use the fitted vectorizer loaded from disk; the bare `vectorizer` is only
    # fitted when the optional fit cell has been run.
    user_input = tfidf_V.transform(user_input)
    output = classifier.predict(user_input)

    # predict() returns a one-element array; test its single entry explicitly.
    if output[0] == 0:
        message = "Text is fine"
    else:
        message = "Comment can't be posted due to inappropriate language"

    # Replace whatever the result box currently shows.
    result.delete(0, "end")
    result.insert(0, message)

def clear_screen(event2):
    """Empty the ``result`` entry widget; ``event2`` is the unused Tk event."""
    result.delete(0, "end")
        
    
###########################################################################################################################

# Build the main window and lay its widgets out on a grid.
main = Tk()
main.title('Online Abusive Comment Detector')

frame = Frame(main)
frame.grid()

# Row 0: label plus the entry where the user types a comment.
label1 = Label(frame, text="Post a Comment")
label1.grid(row=0, column=20, padx=4)
comment = Entry(frame, width=60)
comment.grid(row=0, column=21, padx=4)

# Row 2: the 'post' button; a mouse-button event on it runs GUI_function.
button_check = Button(frame,text='post')
button_check.bind("<Button>",GUI_function)
button_check.grid(row=2, column=44)

# The string literal below is disabled code for a "clear" button; it has no effect.
'''
button_clear = Button(frame, text='clear result screen')
button_clear.bind("<Button-2>", clear_screen)
button_clear.grid(row=2, column=10)
'''

# Row 4: label plus the entry that GUI_function writes its verdict into.
label2 = Label(frame, text="Result")
label2.grid(row=4, column=20, padx=4)
result = Entry(frame,width=60)
result.grid(row=4, column=21, padx=4)


# The string literal below is disabled layout-experiment code; it has no effect.
'''
Button(frame, text="LEFT button").pack(side=LEFT, fill=X)
Button(frame, text="RIGHT button").pack(side=RIGHT, fill=X)
Button(frame, text="TOP button").pack(side=TOP, fill=X)
Button(frame, text="BOTTOM").pack(side=BOTTOM, fill=X)
'''

 
# Start the Tk event loop.
main.mainloop()