In [1]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier

/kaggle/input\jigsaw-toxic-comment-classification-challenge\sample_submission.csv.zip
/kaggle/input\jigsaw-toxic-comment-classification-challenge\test.csv.zip
/kaggle/input\jigsaw-toxic-comment-classification-challenge\test_labels.csv.zip
/kaggle/input\jigsaw-toxic-comment-classification-challenge\train.csv.zip


### **Load dataset**

In [2]:
from os import chdir, getcwd, path

# Walk up the directory tree (at most 10 levels) until the working directory
# contains the marker file "checkcwd", so that the relative paths used by the
# following cells resolve from that directory.
levels_climbed = 0
while levels_climbed < 10 and not path.isfile("checkcwd"):
    chdir(path.pardir)
    levels_climbed += 1

# Abort if the marker was not found even after climbing the maximum depth.
if not path.isfile("checkcwd"):
    raise Exception("Something went wrong. cwd=" + getcwd())

E:\Toxic-comment-classification


In [3]:
# Load the competition files. The paths are relative, so they depend on the
# current working directory at the time this cell runs.
input_dir = 'kaggle/input/jigsaw-toxic-comment-classification-challenge'

train = pd.read_csv(f'{input_dir}/train.csv.zip')
test = pd.read_csv(f'{input_dir}/test.csv.zip')
test_label = pd.read_csv(f'{input_dir}/test_labels.csv.zip')
sample_submission = pd.read_csv(f'{input_dir}/sample_submission.csv.zip')

In [4]:
def read_from_file(filename, encoding=None):
    """Read a text file into a NumPy array with one element per line.

    Lines are split on newline characters only. ``str.splitlines()`` is
    deliberately avoided: it also breaks on form feeds, vertical tabs,
    ``\\x1c``-``\\x1e``, ``\\x85``, ``\\u2028`` and ``\\u2029``, which would turn
    a single record containing such a character into several array elements
    and shift every following row.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default encoding.

    Returns
    -------
    numpy.ndarray
        Array of strings, one per line, without the trailing newline.
        Empty lines are kept as empty strings.
    """
    with open(filename, 'r', encoding=encoding) as f:
        # Iterating a text-mode file yields one '\n'-terminated chunk per
        # line; drop only that terminator so in-line whitespace is preserved.
        return np.array([line.rstrip('\n') for line in f])

In [5]:
# Replace the raw comment text with the pre-cleaned version stored one comment
# per line; the files must therefore contain exactly one line per DataFrame row.
test['comment_text'] = read_from_file('clean_data/data_test_cleaned_light2.txt')
train['comment_text'] = read_from_file('clean_data/data_train_cleaned_light2.txt')

In [6]:
# Display the test frame to inspect its comment_text column.
test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule much succesful ever whats hat...
1,0000247867823ef7,rfc title fine imo
2,00013b17ad220c46,source zawe ashton lapland
3,00017563c3f7919a,look back source information update correct fo...
4,00017695ad8997eb,anonymously edit article
...,...,...
153159,fffcd0960ee309b5,totally agree stuff nothing long crap
153160,fffd7a9a6eb32c16,throw field home plate doe get fast throw cut ...
153161,fffda9e8d6fafa9e,okinotorishima category see change agree much ...
153162,fffe8f1340a79fc2,one found nation eu germany law return quite s...


In [7]:
# Display the train frame to inspect its comment_text column and label columns.
train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edit make username hardcore metall...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww match background colour seemingly stick t...,0,0,0,0,0,0
2,000113f07ec002fd,hey man really try edit war guy constantly rem...,0,0,0,0,0,0
3,0001b41b1c6bb37e,much cannot make real suggestion improvement w...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,time ask view completely contradict coverage r...,0,0,0,0,0,0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0
159569,fff125370e4aaaf3,look like actually put speedy version delete look,0,0,0,0,0,0


In [8]:
# Target label columns; this order fixes the column order of every
# prediction matrix built from `classes`.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

### **K-Fold Cross-Validation**

In [9]:
# For clean light2 dataset, tfidf
# One TF-IDF + Multinomial Naive Bayes pipeline per entry; the K-fold cell
# looks pipelines up by position (tools[i]), so the order here matters.
# NOTE(review): the hyper-parameters look like the output of a prior search
# that is not part of this notebook -- confirm their provenance.


def _nb_tfidf_pipeline(alpha, min_df, ngram_range, max_df=1.0):
    """Build a ('vec', 'clf') pipeline: TfidfVectorizer -> MultinomialNB.

    Every pipeline shares max_features=20000, strip_accents='unicode' and
    sublinear_tf=True; only the arguments of this function vary.
    ``max_df=1.0`` is scikit-learn's own default for ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(
        max_df=max_df,
        max_features=20000,
        min_df=min_df,
        ngram_range=ngram_range,
        strip_accents='unicode',
        sublinear_tf=True,
    )
    classifier = MultinomialNB(alpha=alpha)
    return Pipeline(steps=[('vec', vectorizer), ('clf', classifier)])


_label_settings = [
    dict(max_df=0.7240907832327527, min_df=4, ngram_range=(1, 2), alpha=0.500284537385457),
    dict(max_df=0.8523586112847981, min_df=4, ngram_range=(1, 2), alpha=0.07081443444614416),
    dict(min_df=4, ngram_range=(1, 2), alpha=0.40501768233734997),
    dict(min_df=5, ngram_range=(1, 3), alpha=0.01),
    dict(max_df=0.7, min_df=5, ngram_range=(1, 3), alpha=0.2092609543216633),
    dict(max_df=0.9474985572987579, min_df=3, ngram_range=(1, 2), alpha=0.13788393353862147),
]

tools = [_nb_tfidf_pipeline(**settings) for settings in _label_settings]

In [10]:
# K-fold training with one pipeline per label (tools[i] <-> classes[i]).
#
# For every fold and every label the pipeline is refit on the fold's training
# split, then used to
#   * score the held-out split  -> out-of-fold (OOF) predictions for `train`
#   * score the full test set   -> averaged over folds into `predict`
# Both results are written to CSV at the end.
X_t = train['comment_text'].values
y = train[classes].values
X_te = test['comment_text'].values

num_folds = 10
output_dir = 'kaggle/working/k-fold'

# Create the output directory up front so that a missing or unwritable
# directory fails before the long training loop, not after it.
os.makedirs(output_dir, exist_ok=True)

# One column per label, in the order given by `classes`.
oof_predict = np.zeros((train.shape[0], len(classes)))
predict = np.zeros((test.shape[0], len(classes)))

# Fixed seed so the fold assignment is reproducible between runs.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(X_t), start=1):
    kfold_X_train, kfold_X_valid = X_t[train_index], X_t[test_index]
    kfold_y_train = y[train_index]

    for i, label in enumerate(classes):
        # Pipeline.fit fits the TF-IDF step on the training split only and
        # then the classifier on the resulting matrix; predict_proba applies
        # the fitted vectorizer before scoring. The validation and test text
        # therefore never influence the fitted vocabulary.
        pipeline = tools[i]
        pipeline.fit(kfold_X_train, kfold_y_train[:, i])

        # Column 1 of predict_proba is the probability of the positive class.
        predict[:, i] += pipeline.predict_proba(X_te)[:, 1] / num_folds
        oof_predict[test_index, i] = pipeline.predict_proba(kfold_X_valid)[:, 1]

    print(f'Fold {fold} done')
print('Done')

# Test-set predictions: mean of the per-fold probabilities.
sample_submission[classes] = predict
sample_submission.to_csv(f'{output_dir}/8_sub.csv', index=False)

# OOF predictions: each training row is scored by the fold that held it out.
oof = pd.DataFrame(oof_predict, columns=classes, index=train.index)
oof.insert(0, 'id', train['id'])
oof.to_csv(f'{output_dir}/8_oof.csv', index=False)

Fold 1 done
Fold 2 done
Fold 3 done
Fold 4 done
Fold 5 done
Fold 6 done
Fold 7 done
Fold 8 done
Fold 9 done
Fold 10 done
Done
