In [13]:
import random
import pandas as pd
import numpy as np 
import seaborn as sns
from tensorflow import keras
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import preprocessing 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import re
import tensorflow as tf

# NAÏVE BAYES CLASSIFICATION

## 1:Demonstrate application of Naïve Bayes Using Python

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
no_comment = train_df[train_df['comment_text'].isnull()]
no_toxic = train_df[train_df['toxic'].isnull()]
no_severe_toxic = train_df[train_df['severe_toxic'].isnull()]
no_obscene = train_df[train_df['obscene'].isnull()]
no_threat = train_df[train_df['threat'].isnull()]
no_insult = train_df[train_df['insult'].isnull()]
no_identity_hate = train_df[train_df['identity_hate'].isnull()]

print(len(no_comment),len(no_toxic), len(no_severe_toxic), len(no_obscene), len(no_threat), len(no_insult), len(no_identity_hate))

0 0 0 0 0 0 0


In [4]:
no_comment = test_df[test_df['comment_text'].isnull()]
len(no_comment)

0

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

# clean comment_text in train_df 
train_df['comment_text'] = train_df['comment_text'].map(lambda com : clean_text(com))

# clean comment_text in test_df 
test_df['comment_text'] = test_df['comment_text'].map(lambda com : clean_text(com))

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

train_X = train_df['comment_text']
test_X = test_df['comment_text']

vectorizer = TfidfVectorizer(max_features=5000) 
train_vect = vectorizer.fit_transform(train_X)
test_vect = vectorizer.transform(test_X)

train_vect.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
from collections import Counter
print('toxic',sorted(Counter(train_df['toxic']).items()))
print('severe_toxic',sorted(Counter(train_df['severe_toxic']).items()))
print('obscene',sorted(Counter(train_df['obscene']).items()))
print('threat',sorted(Counter(train_df['threat']).items()))
print('insult',sorted(Counter(train_df['insult']).items()))
print('identity_hate',sorted(Counter(train_df['identity_hate']).items()))

toxic [(0, 144277), (1, 15294)]
severe_toxic [(0, 157976), (1, 1595)]
obscene [(0, 151122), (1, 8449)]
threat [(0, 159093), (1, 478)]
insult [(0, 151694), (1, 7877)]
identity_hate [(0, 158166), (1, 1405)]


In [8]:
from imblearn.over_sampling import SMOTE

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
oversampled_data = {}

smote = SMOTE(sampling_strategy='auto', random_state=42)

for label in labels:
    y = train_df[label].values  

    # SMOTE 
    X_resampled, y_resampled = smote.fit_resample(train_vect, y)

    # oversampled
    oversampled_data[label] = (X_resampled, y_resampled)

    unique, counts = np.unique(y_resampled, return_counts=True)
    class_counts = dict(zip(unique, counts))
    print(f'label : {label}')
    print('0 =>', class_counts[0])
    print('1 =>', class_counts[1])

label : toxic
0 => 144277
1 => 144277
label : severe_toxic
0 => 157976
1 => 157976
label : obscene
0 => 151122
1 => 151122
label : threat
0 => 159093
1 => 159093
label : insult
0 => 151694
1 => 151694
label : identity_hate
0 => 158166
1 => 158166


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
models = {}

In [10]:
from sklearn.naive_bayes import MultinomialNB
models = {}

for label in labels:
    X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(oversampled_data[label][0],oversampled_data[label][1],test_size=0.2, random_state=42) 
    
    nb_model = MultinomialNB()
    nb_model.fit(X_train_label, y_train_label)
    
    y_pred_label = nb_model.predict(X_test_label)
    accuracy = accuracy_score(y_test_label, y_pred_label)
    report = classification_report(y_test_label, y_pred_label)
    
    print(f'Category: {label}')
    print(f'Test accuracy: {accuracy}')
    print(report)

    models[label] = nb_model

Category: toxic
Test accuracy: 0.8838349708027933
              precision    recall  f1-score   support

           0       0.87      0.90      0.89     28854
           1       0.90      0.86      0.88     28857

    accuracy                           0.88     57711
   macro avg       0.88      0.88      0.88     57711
weighted avg       0.88      0.88      0.88     57711

Category: severe_toxic
Test accuracy: 0.9599943029861847
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     31656
           1       0.95      0.97      0.96     31535

    accuracy                           0.96     63191
   macro avg       0.96      0.96      0.96     63191
weighted avg       0.96      0.96      0.96     63191

Category: obscene
Test accuracy: 0.9058545219937468
              precision    recall  f1-score   support

           0       0.90      0.92      0.91     30250
           1       0.92      0.89      0.90     30199

    accuracy           

In [11]:
submission_nb = pd.DataFrame({'id': test_df['id']})
for label in labels:
    y_prob_label = nb_model.predict_proba(test_vect)[:, 1]
    submission_nb[label] = y_prob_label

submission_nb.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.997129,0.997129,0.997129,0.997129,0.997129,0.997129
1,0000247867823ef7,0.011009,0.011009,0.011009,0.011009,0.011009,0.011009
2,00013b17ad220c46,0.156403,0.156403,0.156403,0.156403,0.156403,0.156403
3,00017563c3f7919a,0.008143,0.008143,0.008143,0.008143,0.008143,0.008143
4,00017695ad8997eb,0.188677,0.188677,0.188677,0.188677,0.188677,0.188677


In [12]:
submission_nb.to_csv('submission_nb.csv', index=False)