In [13]:
pip install bayesian-optimization

Note: you may need to restart the kernel to use updated packages.


In [14]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input\jigsaw-toxic-comment-classification-challenge\sample_submission.csv.zip
/kaggle/input\jigsaw-toxic-comment-classification-challenge\test.csv.zip
/kaggle/input\jigsaw-toxic-comment-classification-challenge\test_labels.csv.zip
/kaggle/input\jigsaw-toxic-comment-classification-challenge\train.csv.zip


### **Load dataset**

In [15]:
%cd E:\Toxic-comment-classification

E:\Toxic-comment-classification


In [16]:
train = pd.read_csv('kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
test = pd.read_csv('kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
test_label = pd.read_csv('kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')
sample_submission = pd.read_csv('kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')

In [17]:
def read_from_file(filename):
    with open(filename, 'r') as f:
        return np.array(f.read().splitlines())

In [18]:
test.comment_text = read_from_file('clean_data/data_test_cleaned_light2.txt')
train.comment_text = read_from_file('clean_data/data_train_cleaned_light2.txt')

In [19]:
test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule much succesful ever whats hat...
1,0000247867823ef7,rfc title fine imo
2,00013b17ad220c46,source zawe ashton lapland
3,00017563c3f7919a,look back source information update correct fo...
4,00017695ad8997eb,anonymously edit article
...,...,...
153159,fffcd0960ee309b5,totally agree stuff nothing long crap
153160,fffd7a9a6eb32c16,throw field home plate doe get fast throw cut ...
153161,fffda9e8d6fafa9e,okinotorishima category see change agree much ...
153162,fffe8f1340a79fc2,one found nation eu germany law return quite s...


In [20]:
train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edit make username hardcore metall...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww match background colour seemingly stick t...,0,0,0,0,0,0
2,000113f07ec002fd,hey man really try edit war guy constantly rem...,0,0,0,0,0,0
3,0001b41b1c6bb37e,much cannot make real suggestion improvement w...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,time ask view completely contradict coverage r...,0,0,0,0,0,0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0
159569,fff125370e4aaaf3,look like actually put speedy version delete look,0,0,0,0,0,0


### **Split the data**

In [21]:
comments = train['comment_text']
labels = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

In [22]:
train_comments, val_comments, train_labels, val_labels = train_test_split(comments, labels, test_size = 0.2, random_state=42)

In [23]:
train_labels

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
140030,1,0,0,0,0,0
159124,0,0,0,0,0,0
60006,0,0,0,0,0,0
65432,0,0,0,0,0,0
154979,0,0,0,0,0,0
...,...,...,...,...,...,...
119879,0,0,0,0,0,0
103694,0,0,0,0,0,0
131932,1,0,0,0,0,0
146867,0,0,0,0,0,0


### **Hyperparameter tuning**

In [24]:
# Define the pipeline
def build_pipeline(ngram_range, min_df, max_df, alpha):
    return Pipeline([
        ('bow', CountVectorizer(
            ngram_range=ngram_range,
            min_df=int(min_df),
            max_df=min(max_df, 1.0),  # max_df should be <= 1.0
            max_features=20000,
            binary=False
            # binary=True
        )),
        ('clf', MultinomialNB(alpha=max(alpha, 0)))  # alpha should be > 0
    ])

# Define the objective function
def objective(ngram_range_min, ngram_range_max, min_df, max_df, alpha):
    ngram_range = (int(ngram_range_min), int(ngram_range_max))
    pipeline = build_pipeline(ngram_range, min_df, max_df, alpha)
    scores = cross_val_score(pipeline, train_comments, train_labels[label], cv=3, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

# Define the parameter bounds
param_bounds = {
    'ngram_range_min': (1, 1),  # lower bound of ngram_range
    'ngram_range_max': (1, 3),  # upper bound of ngram_range
    'min_df': (1, 5),
    'max_df': (0.7, 1.0),
    'alpha': (0.01, 10),
}

models = []
for label in train_labels.columns:
    print(f"Optimizing for label: {label}")
    
    # Create the BayesianOptimization object
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=param_bounds,
        random_state=42,
        verbose=2
    )

    # Maximize the objective function
    optimizer.maximize(init_points=10, n_iter=50)

    # Get the best parameters
    best_parameters = optimizer.max['params']

    # Convert the parameters to the correct format
    best_parameters['ngram_range'] = (int(best_parameters.pop('ngram_range_min')), int(best_parameters.pop('ngram_range_max')))
    best_parameters['min_df'] = int(best_parameters['min_df'])
    best_parameters['alpha'] = float(best_parameters['alpha'])

    # Output the best parameters
    print(f"Best parameters for label {label}: {best_parameters}")

    # Refit the pipeline with the best parameters if needed
    pipeline = build_pipeline(**best_parameters)
    models.append(pipeline.fit(train_comments, train_labels[label]))

Optimizing for label: toxic
|   iter    |  target   |   alpha   |  max_df   |  min_df   | ngram_... | ngram_... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9204   [0m | [0m3.752    [0m | [0m0.9852   [0m | [0m3.928    [0m | [0m2.197    [0m | [0m1.0      [0m |
| [95m2        [0m | [95m0.9256   [0m | [95m1.568    [0m | [95m0.7174   [0m | [95m4.465    [0m | [95m2.202    [0m | [95m1.0      [0m |
| [0m3        [0m | [0m0.9245   [0m | [0m0.2156   [0m | [0m0.991    [0m | [0m4.33     [0m | [0m1.425    [0m | [0m1.0      [0m |
| [0m4        [0m | [0m0.9239   [0m | [0m1.842    [0m | [0m0.7913   [0m | [0m3.099    [0m | [0m1.864    [0m | [0m1.0      [0m |
| [0m5        [0m | [0m0.9013   [0m | [0m6.122    [0m | [0m0.7418   [0m | [0m2.169    [0m | [0m1.733    [0m | [0m1.0      [0m |
| [0m6        [0m | [0m0.9084   [0m | [0m7.854    [0m | [0m0.7599   [0m | 

In [25]:
models

[Pipeline(steps=[('bow',
                  CountVectorizer(max_df=0.7, max_features=20000, min_df=5,
                                  ngram_range=(1, 3))),
                 ('clf', MultinomialNB(alpha=1.5661036084959452))]),
 Pipeline(steps=[('bow',
                  CountVectorizer(max_df=0.7, max_features=20000, min_df=4)),
                 ('clf', MultinomialNB(alpha=0.537899295687526))]),
 Pipeline(steps=[('bow',
                  CountVectorizer(max_df=0.7, max_features=20000, min_df=5)),
                 ('clf', MultinomialNB(alpha=0.45187360114562564))]),
 Pipeline(steps=[('bow',
                  CountVectorizer(max_df=0.7100206098404588, max_features=20000,
                                  min_df=4, ngram_range=(1, 2))),
                 ('clf', MultinomialNB(alpha=0.19167368926195816))]),
 Pipeline(steps=[('bow',
                  CountVectorizer(max_df=0.7, max_features=20000, min_df=4)),
                 ('clf', MultinomialNB(alpha=0.39470274844235514))]),
 Pipeline(steps

In [26]:
# # For clean light2 dataset, binary bow 
# models=[Pipeline(steps=[('bow',
#                 CountVectorizer(binary=True, max_features=20000, min_df=3)),
#                 ('clf', MultinomialNB(alpha=0.4270130129826107))]),
# Pipeline(steps=[('bow',
#                 CountVectorizer(binary=True, max_df=0.7, max_features=20000,
#                                 min_df=3, ngram_range=(1, 3))),
#                 ('clf', MultinomialNB(alpha=0.4975355206730156))]),
# Pipeline(steps=[('bow',
#                 CountVectorizer(binary=True, max_df=0.7, max_features=20000,
#                                 min_df=2, ngram_range=(1, 3))),
#                 ('clf', MultinomialNB(alpha=1.2298381626178778))]),
# Pipeline(steps=[('bow',
#                 CountVectorizer(binary=True, max_df=0.7, max_features=20000,
#                                 min_df=3, ngram_range=(1, 2))),
#                 ('clf', MultinomialNB(alpha=0.14313711598103757))]),
# Pipeline(steps=[('bow',
#                 CountVectorizer(binary=True, max_df=0.7, max_features=20000,
#                                 min_df=3, ngram_range=(1, 3))),
#                 ('clf', MultinomialNB(alpha=0.8921676778066149))]),
# Pipeline(steps=[('bow',
#                 CountVectorizer(binary=True, max_df=0.7, max_features=20000,
#                                 min_df=2, ngram_range=(1, 3))),
#                 ('clf', MultinomialNB(alpha=0.3935245328516555))])]

# For clean light2 dataset, bow
models=[Pipeline(steps=[('bow',
                CountVectorizer(max_df=0.7, max_features=20000, min_df=5,
                                ngram_range=(1, 3))),
                ('clf', MultinomialNB(alpha=1.5661036084959452))]),
Pipeline(steps=[('bow',
                CountVectorizer(max_df=0.7, max_features=20000, min_df=4)),
                ('clf', MultinomialNB(alpha=0.537899295687526))]),
Pipeline(steps=[('bow',
                CountVectorizer(max_df=0.7, max_features=20000, min_df=5)),
                ('clf', MultinomialNB(alpha=0.45187360114562564))]),
Pipeline(steps=[('bow',
                CountVectorizer(max_df=0.7100206098404588, max_features=20000,
                                min_df=4, ngram_range=(1, 2))),
                ('clf', MultinomialNB(alpha=0.19167368926195816))]),
Pipeline(steps=[('bow',
                CountVectorizer(max_df=0.7, max_features=20000, min_df=4)),
                ('clf', MultinomialNB(alpha=0.39470274844235514))]),
Pipeline(steps=[('bow',
                CountVectorizer(max_df=0.7, max_features=20000, min_df=4)),
                ('clf', MultinomialNB(alpha=0.36450987810825203))])]

In [27]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [28]:
for idx, label in enumerate(classes):
    print(label)
    models[idx].fit(train_comments, train_labels[label])
    preds = models[idx].predict_proba(val_comments)[:,1].reshape(-1, 1)
    print(roc_auc_score(val_labels, preds))

toxic
0.9261705314283984
severe_toxic
0.8077091549017257
obscene
0.897953954540482
threat
0.7977899496541135
insult
0.8990460013744386
identity_hate
0.8324892499531484


In [29]:
enum = enumerate(classes)
list(enum)

[(0, 'toxic'),
 (1, 'severe_toxic'),
 (2, 'obscene'),
 (3, 'threat'),
 (4, 'insult'),
 (5, 'identity_hate')]

In [30]:
preds = np.zeros((len(test), len(classes)))

for i, label in enumerate(classes):
    print(label)
    
    # Access the TfidfVectorizer step in the pipeline
    bow_transformer = models[i].named_steps['bow']
    
    # Transform the test data
    test_term_doc = bow_transformer.transform(test.comment_text)
    
    # Access the classifier step in the pipeline
    classifier = models[i].named_steps['clf']
    
    # Get predictions
    preds[:, i] = classifier.predict_proba(test_term_doc)[:, 1]


toxic
severe_toxic
obscene
threat
insult
identity_hate


In [31]:
submid = pd.DataFrame({'id': sample_submission["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = classes)], axis=1)
submission.to_csv('kaggle/working/submission_pipeline_light2_bow.csv', index=False)
# submission.to_csv('kaggle/working/submission_pipeline_light2_binary_bow.csv', index=False)