In [2]:
pip install bayesian-optimization

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

/kaggle/input\jigsaw-toxic-comment-classification-challenge\sample_submission.csv.zip
/kaggle/input\jigsaw-toxic-comment-classification-challenge\test.csv.zip
/kaggle/input\jigsaw-toxic-comment-classification-challenge\test_labels.csv.zip
/kaggle/input\jigsaw-toxic-comment-classification-challenge\train.csv.zip


### **Load dataset**

In [4]:
# Change into the project directory; later cells use paths relative to it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
%cd E:\Toxic-comment-classification

E:\Toxic-comment-classification


In [5]:
# Read the four zipped competition CSVs, in this order, into their data frames.
train, test, test_label, sample_submission = map(pd.read_csv, (
    'kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
    'kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
    'kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip',
    'kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip',
))

In [6]:
def read_from_file(filename, encoding='utf-8'):
    """Read a text file and return its lines as a NumPy array of strings.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``'utf-8'`` so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    numpy.ndarray
        One element per line, in file order, with the line terminators
        removed (lines are split with ``str.splitlines``).
    """
    with open(filename, 'r', encoding=encoding) as f:
        return np.array(f.read().splitlines())

In [7]:
# Overwrite the comment_text column of each frame with the lines read from
# the corresponding pre-cleaned text file.
test['comment_text'] = read_from_file('clean_data/data_test_cleaned_light2.txt')
train['comment_text'] = read_from_file('clean_data/data_train_cleaned_light2.txt')

In [8]:
# Display the test frame now that comment_text holds the cleaned text.
test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule much succesful ever whats hat...
1,0000247867823ef7,rfc title fine imo
2,00013b17ad220c46,source zawe ashton lapland
3,00017563c3f7919a,look back source information update correct fo...
4,00017695ad8997eb,anonymously edit article
...,...,...
153159,fffcd0960ee309b5,totally agree stuff nothing long crap
153160,fffd7a9a6eb32c16,throw field home plate doe get fast throw cut ...
153161,fffda9e8d6fafa9e,okinotorishima category see change agree much ...
153162,fffe8f1340a79fc2,one found nation eu germany law return quite s...


In [9]:
# Display the train frame now that comment_text holds the cleaned text.
train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edit make username hardcore metall...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww match background colour seemingly stick t...,0,0,0,0,0,0
2,000113f07ec002fd,hey man really try edit war guy constantly rem...,0,0,0,0,0,0
3,0001b41b1c6bb37e,much cannot make real suggestion improvement w...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,time ask view completely contradict coverage r...,0,0,0,0,0,0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0
159569,fff125370e4aaaf3,look like actually put speedy version delete look,0,0,0,0,0,0


### **Modeling**

In [10]:
# For clean light2 dataset, bow
def _bow_nb_pipeline(alpha, **vectorizer_options):
    """Build a CountVectorizer ('bow') -> MultinomialNB ('clf') pipeline."""
    return Pipeline(steps=[
        ('bow', CountVectorizer(**vectorizer_options)),
        ('clf', MultinomialNB(alpha=alpha)),
    ])


# (MultinomialNB alpha, CountVectorizer options) for each pipeline, in list order.
_BOW_NB_SETTINGS = [
    (1.5661036084959452,
     dict(max_df=0.7, max_features=20000, min_df=5, ngram_range=(1, 3))),
    (0.537899295687526,
     dict(max_df=0.7, max_features=20000, min_df=4)),
    (0.45187360114562564,
     dict(max_df=0.7, max_features=20000, min_df=5)),
    (0.19167368926195816,
     dict(max_df=0.7100206098404588, max_features=20000, min_df=4, ngram_range=(1, 2))),
    (0.39470274844235514,
     dict(max_df=0.7, max_features=20000, min_df=4)),
    (0.36450987810825203,
     dict(max_df=0.7, max_features=20000, min_df=4)),
]

models = [_bow_nb_pipeline(alpha, **options) for alpha, options in _BOW_NB_SETTINGS]

In [11]:
# Names of the target label columns.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [12]:
# Split the training frame into the text column and the label columns.
train_comments, train_labels = train.comment_text, train[classes]

In [13]:
# Fit one pipeline per label: models[k] is trained on the column classes[k].
for position, target_name in enumerate(classes):
    print(target_name)
    pipeline = models[position]
    pipeline.fit(train_comments, train_labels[target_name])

toxic
severe_toxic
obscene
threat
insult
identity_hate


### **Save the model**

In [14]:
from joblib import dump

# Persist each pipeline to its own .joblib file, numbered by list position.
model_dir = 'src/NaiveBayes/models/naive_bayes_bow'
# Create the target directory up front so dump() does not fail when it is missing.
os.makedirs(model_dir, exist_ok=True)

for i, model in enumerate(models):
    filename = f'{model_dir}/naive_bayes_bow{i}.joblib'
    dump(model, filename)
    print(f'Model {i} save to {filename}')

Model 0 save to src/NaiveBayes/models/naive_bayes_bow/naive_bayes_bow0.joblib
Model 1 save to src/NaiveBayes/models/naive_bayes_bow/naive_bayes_bow1.joblib
Model 2 save to src/NaiveBayes/models/naive_bayes_bow/naive_bayes_bow2.joblib
Model 3 save to src/NaiveBayes/models/naive_bayes_bow/naive_bayes_bow3.joblib
Model 4 save to src/NaiveBayes/models/naive_bayes_bow/naive_bayes_bow4.joblib
Model 5 save to src/NaiveBayes/models/naive_bayes_bow/naive_bayes_bow5.joblib


### **Load the model**

In [15]:
from joblib import load

# Rebuild the models list from the six saved .joblib files, in index order.
models = [
    load(f'src/NaiveBayes/models/naive_bayes_bow/naive_bayes_bow{i}.joblib')
    for i in range(6)
]

In [16]:
# Zero-filled result matrix: one row per test row, one column per label.
n_rows, n_labels = test.shape[0], len(classes)
predictions = np.zeros(shape=(n_rows, n_labels))

In [17]:
# Fill column k of `predictions` with the output of models[k].
for column, fitted in enumerate(models):
    probabilities = fitted.predict_proba(test.comment_text)
    # Keep only the second probability column (index 1), i.e. the toxicity probability.
    predictions[:, column] = probabilities[:, 1]

In [18]:
# Assemble the submission frame: the id column from the sample submission,
# followed by one prediction column per label.
submid = pd.DataFrame({'id': sample_submission["id"]})
submission = pd.concat([submid, pd.DataFrame(predictions, columns=classes)], axis=1)

submission_path = 'src/NaiveBayes/submission/submission_naive_bayes_bow.csv'
# to_csv does not create missing directories, so make sure the folder exists first.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)