In [1]:
import nltk
# Download the NLTK 'stopwords' corpus that nltk.corpus.stopwords reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
# English stop words from NLTK; passed to TfidfVectorizer so they are dropped at vectorization time.
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [3]:
# Mount Google Drive at /content/gdrive (Colab only) so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# Load balanced_train.csv from the mounted Drive folder into a DataFrame.
df = pd.read_csv(r'/content/gdrive/MyDrive/balanced_train.csv')

In [18]:
# Label columns of df; the evaluation loop fits and scores one model per entry.
categories = ['toxic', 'obscene', 'insult']

### Clean the data: lowercase, expand contractions, and strip punctuation (stop words are removed later by the TF-IDF vectorizer)

In [6]:
# Ordered (pattern, replacement) rewrite rules applied by clean_text.
# Order matters: a longer pattern must come before any shorter pattern that
# would otherwise consume part of it ("what's" and "'scuse" before "'s",
# "can't" before "n't").
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text):
    """Normalize a raw comment into lowercase, space-separated word tokens.

    Steps: lowercase, expand the contractions listed in ``_CONTRACTION_RULES``,
    replace every non-word character with a space, collapse runs of
    whitespace, and strip leading/trailing spaces.

    Parameters
    ----------
    text : str
        The raw comment. ``None`` and NaN (how pandas represents an empty
        CSV cell) are treated as an empty comment; any other non-string
        value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # Missing values would otherwise crash on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # Raw strings: '\W' / '\s' in a plain string are invalid escape sequences.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")

In [19]:
# Run every comment through clean_text; the function is passed directly, no lambda wrapper needed.
df['comment_text'] = df['comment_text'].map(clean_text)

In [20]:
# Hold out 25% of the rows as the test set; the fixed random_state keeps the shuffled split reproducible.
train, test = train_test_split(df, random_state=42, test_size=0.25, shuffle=True)

In [21]:
# Model inputs are the cleaned comment strings of each split.
X_train, X_test = train['comment_text'], test['comment_text']

# Report the size of each split (train first, then test).
for split in (X_train, X_test):
    print(split.shape)

(30918,)
(10307,)


# Naive Bayes

In [22]:
# Define a pipeline combining a TF-IDF text feature extractor with a
# one-vs-rest Multinomial Naive Bayes classifier.
# TfidfVectorizer's stop_words accepts only 'english', a list, or None;
# scikit-learn's parameter validation rejects a set, so pass a (sorted) list.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

In [11]:
# Fit the pipeline on the training comments against the 'toxic' label only.
NB_pipeline.fit(X_train, train['toxic'])

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', 'can',
                                             'couldn', "couldn't", ...})),
                ('clf', OneVsRestClassifier(estimator=MultinomialNB()))])

In [12]:
# Predict labels for the held-out test comments with the fitted pipeline.
prediction = NB_pipeline.predict(X_test)

In [13]:
# Sanity check: number of true 'toxic' labels in the test split.
len(test['toxic'])

10307

In [14]:
# Sanity check: number of predictions; should equal the number of test labels.
len(prediction)

10307

In [15]:
# Fraction of test comments whose predicted label equals the true 'toxic' label.
accuracy_score(test['toxic'], prediction)

0.8452508004268944

In [23]:
from sklearn import metrics

# Fit and evaluate one binary Naive Bayes model per label column.
# NB_pipeline is refit in place on each iteration, so after the loop it
# holds the model for the last category only.
for category in categories:
    print("Naive Bayes -",category)

    # Refit the whole pipeline (TF-IDF + classifier) on this category's labels.
    NB_pipeline.fit(X_train, train[category])

    # Score on the held-out split.
    y_true = test[category]
    prediction = NB_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(y_true, prediction)))
    print(confusion_matrix(y_true, prediction))
    print("----")

    # zero_division=0 makes an undefined metric (e.g. no positive
    # predictions) come back as 0.0 without a warning, so the three lines
    # below are always printed instead of being silently skipped.
    precision = metrics.precision_score(y_true, prediction, zero_division=0)
    recall = metrics.recall_score(y_true, prediction, zero_division=0)
    F1 = metrics.f1_score(y_true, prediction, zero_division=0)

    print("Precision:",precision)
    print("Recall:",recall)
    print("F1 Score - ",F1)

    print("===============")
    print()

Naive Bayes - toxic
Test accuracy is 0.8452508004268944
[[6249  204]
 [1391 2463]]
----
Precision: 0.9235095613048369
Recall: 0.6390762843798651
F1 Score -  0.7554056126360987

Naive Bayes - obscene
Test accuracy is 0.8526244299990298
[[8139   32]
 [1487  649]]
----
Precision: 0.9530102790014684
Recall: 0.3038389513108614
F1 Score -  0.460773872914448

Naive Bayes - insult
Test accuracy is 0.837586106529543
[[8259   51]
 [1623  374]]
----
Precision: 0.88
Recall: 0.1872809213820731
F1 Score -  0.30883567299752274

