In [43]:
# quick and dirty way to change the current working directory to root (/toxic-comment-classification)
# you should run this at least once just to be certain
import os
from os import chdir, path, getcwd
# Walk upwards from the starting directory until the "checkcwd" marker file is
# found. The starting directory and up to 10 of its ancestors are checked.
_start_dir = getcwd()
for _level in range(10):
    if path.isfile("checkcwd"):
        break
    chdir(path.pardir)

if not path.isfile("checkcwd"):
    # Remember where the search stopped, then undo the directory changes so a
    # failed search does not leave the kernel stranded far above its start.
    _reached_dir = getcwd()
    chdir(_start_dir)
    raise FileNotFoundError(
        "Something went wrong. cwd=" + _reached_dir
        + " (search started from " + _start_dir + ")"
    )

# The directory that contains the marker file; the process cwd now points at it.
root_path = os.getcwd()

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from gc import collect
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [45]:
# Options common to both TF-IDF vectorizers: sublinear term-frequency scaling
# and unicode-based accent stripping.
_shared_tfidf_options = dict(sublinear_tf=True, strip_accents='unicode')

# Word-level features: unigrams and bigrams over tokens of one or more word
# characters, with the built-in English stop-word list, capped at 10000 features.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
    **_shared_tfidf_options,
)

# Character-level features: 3- to 6-character n-grams, capped at 40000 features.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 6),
    max_features=40000,
    **_shared_tfidf_options,
)

In [46]:
# All four competition archives sit in the same input directory; the path is
# relative to the current working directory.
_input_dir = 'kaggle/input/jigsaw-toxic-comment-classification-challenge'

train = pd.read_csv(f'{_input_dir}/train.csv.zip')
test = pd.read_csv(f'{_input_dir}/test.csv.zip')
test_label = pd.read_csv(f'{_input_dir}/test_labels.csv.zip')
sample_submission = pd.read_csv(f'{_input_dir}/sample_submission.csv.zip')

In [47]:
def read_from_file(filename, encoding=None):
    """Read a text file into a ``pd.Series`` holding one entry per line.

    Lines are split on newline characters only, so other characters that
    ``str.splitlines`` treats as line boundaries (form feed, U+2028, ...)
    stay inside their line instead of producing extra entries.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default encoding.

    Returns
    -------
    pd.Series
        The file's lines in order, without their trailing newline. An empty
        file yields an empty Series.
    """
    with open(filename, 'r', encoding=encoding) as f:
        # Iterating a text-mode file yields newline-terminated lines; each one
        # carries at most a single trailing '\n', which is removed here.
        return pd.Series([line.rstrip('\n') for line in f])

In [48]:
# Replace the raw comments with the pre-cleaned text, one cleaned line per row.
for _frame_name, _frame, _cleaned_path in (
    ('test', test, 'clean_data/data_test_cleaned_light2.txt'),
    ('train', train, 'clean_data/data_train_cleaned_light2.txt'),
):
    _cleaned = read_from_file(_cleaned_path)
    # Column assignment aligns on the index, so a line-count mismatch would
    # silently yield NaN or dropped comments; fail loudly instead.
    if len(_cleaned) != len(_frame):
        raise ValueError(
            f'{_cleaned_path} has {len(_cleaned)} lines but the {_frame_name} '
            f'frame has {len(_frame)} rows'
        )
    _frame['comment_text'] = _cleaned

In [49]:
# Display the test frame to inspect comment_text after the cleaned text was assigned.
test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule much succesful ever whats hat...
1,0000247867823ef7,rfc title fine imo
2,00013b17ad220c46,source zawe ashton lapland
3,00017563c3f7919a,look back source information update correct fo...
4,00017695ad8997eb,anonymously edit article
...,...,...
153159,fffcd0960ee309b5,totally agree stuff nothing long crap
153160,fffd7a9a6eb32c16,throw field home plate doe get fast throw cut ...
153161,fffda9e8d6fafa9e,okinotorishima category see change agree much ...
153162,fffe8f1340a79fc2,one found nation eu germany law return quite s...


In [50]:
# Display the train frame to inspect comment_text and the label columns.
train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edit make username hardcore metall...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww match background colour seemingly stick t...,0,0,0,0,0,0
2,000113f07ec002fd,hey man really try edit war guy constantly rem...,0,0,0,0,0,0
3,0001b41b1c6bb37e,much cannot make real suggestion improvement w...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,time ask view completely contradict coverage r...,0,0,0,0,0,0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0
159569,fff125370e4aaaf3,look like actually put speedy version delete look,0,0,0,0,0,0


### **Split the data**

In [51]:
# Model input is the comment text; the targets are the six label columns.
_label_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
comments, labels = train['comment_text'], train[_label_columns]

In [52]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
_split_parts = train_test_split(
    comments,
    labels,
    test_size=0.2,
    random_state=42,
)
train_comments, val_comments, train_labels, val_labels = _split_parts

### **Hyperparameter tuning**

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import GridSearchCV

# Combine the word-level and character-level TF-IDF features into one matrix.
vectorizer = FeatureUnion([
    ('word', word_vectorizer),
    ('char', char_vectorizer)
])
# Construct the pipeline: vectorize the text, then fit a logistic regression.
# The saga solver shuffles the data, so random_state is pinned to make the
# search results repeatable between runs.
pipeline = Pipeline([
    ('vec', vectorizer),
    ('clf', LogisticRegression(solver='saga', random_state=42))
])

# Best C per label; 0 is only a placeholder until that label has been tuned.
best_C_dict = {label: 0 for label in labels.columns}
# Candidate values for C (inverse regularization strength) of the classifier step
param_grid = {
    "clf__C": [0.1, 1, 2, 10, 20, 100]
}

# 3-fold cross-validated grid search scored by ROC AUC
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='roc_auc', verbose=2)

# Each label is tuned independently as its own binary classification problem.
for label in labels.columns:
    # Fit the grid search to the data
    grid_search.fit(train_comments, train_labels[label])

    # Get the best parameters
    best_parameter = grid_search.best_params_

    best_C_dict[label] = best_parameter['clf__C']
    # Output the best parameters
    print('#######################################')
    print(f'Best C for {label}', best_parameter)
    print('#######################################')

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] END .........................................clf__C=0.1; total time= 1.4min


KeyboardInterrupt: 