In [1]:
import pandas as pd
import numpy as np

In [2]:
# Locate the project root: climb towards the filesystem root (at most 10
# levels) until the current directory contains the "checkcwd" marker file.
# Run this cell at least once so the relative paths used later resolve.
import os
from os import chdir, path, getcwd

_levels_left = 10
while _levels_left > 0 and not path.isfile("checkcwd"):
    chdir(path.pardir)
    _levels_left -= 1

# Fail loudly if the marker was never found, reporting where the search ended.
if not path.isfile("checkcwd"):
    raise Exception("Something went wrong. cwd=" + getcwd())

root_path = os.getcwd()

In [3]:
# Locations of the raw Kaggle archives and of the pre-cleaned text files,
# all relative to the current working directory.
# NOTE(review): `path` rebinds the name brought in by `from os import path`
# in the cwd-check cell; that cell re-imports it each time it runs.
path = 'kaggle/input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
clean_data_path = 'clean_data/'

# Raw competition inputs.
TRAIN_DATA_FILE = path + comp + 'train.csv.zip'
TEST_DATA_FILE = path + comp + 'test.csv.zip'
SAMPLE_SUBMISSION = path + comp + 'sample_submission.csv.zip'

# Pre-cleaned comment texts and the label table.
CLEAN_TRAIN_DATA_FILE = clean_data_path + 'data_train_cleaned_light2.txt'
CLEAN_TEST_DATA_FILE = clean_data_path + 'data_test_cleaned_light2.txt'
LABELS = clean_data_path + 'labels.csv'

In [4]:
def write_to_file(data, filename):
    """Write every item of ``data`` to ``filename``, one item per line.

    The file is opened in text mode ``'w'``, so any existing content is
    replaced. Each item is concatenated with a newline before being written.
    """
    with open(filename, 'w') as out_file:
        out_file.writelines(entry + '\n' for entry in data)

def read_from_file(filename):
    """Read a text file into a NumPy array with one element per line.

    Lines are delimited by newlines only, so a file with one record per line
    comes back with exactly one element per record. The trailing newline of
    each line is removed; all other characters are kept.

    Parameters
    ----------
    filename : path of the text file to read.

    Returns
    -------
    numpy.ndarray holding the lines of the file, in order.
    """
    with open(filename, 'r') as f:
        # Iterating the file splits on newlines only. ``str.splitlines`` also
        # treats form feeds, vertical tabs, U+2028 and similar characters as
        # line boundaries, which splits a record containing one of them and
        # makes the number of elements differ from the number of lines.
        return np.array([line.rstrip('\n') for line in f])

# Original dataframes

In [5]:
# Load the raw train and test tables from the zipped competition CSVs.
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

In [6]:

# Sample submission table; its "id" column is reused when the final
# submission file is written.
submission = pd.read_csv(SAMPLE_SUBMISSION)

In [7]:
# Label table stored next to the cleaned data; its column names are used as
# the list of target classes.
y_train_origin = pd.read_csv(LABELS)
classes = y_train_origin.columns.values
# Show the first class name as a quick sanity check.
classes[0]

'toxic'

# Transform tool


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def chooseVector(i=1):
    """Build one of the notebook's preconfigured text vectorizers.

    Parameters
    ----------
    i : int, default 1
        Which vectorizer to build:
        1 -- TfidfVectorizer on uni- and bigrams with sublinear TF scaling;
        2 -- CountVectorizer on uni- and bigrams, capped at 20000 features,
             binary indicators;
        3 -- same as 2 but with raw counts instead of binary indicators.

    Returns
    -------
    A new, unfitted vectorizer instance.

    Raises
    ------
    ValueError
        If ``i`` is not 1, 2 or 3 (previously this surfaced as an
        UnboundLocalError on the return statement).
    """
    if i == 1:
        return TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True)
    if i in (2, 3):
        # Options 2 and 3 differ only in whether counts are binarised.
        return CountVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, max_features=20000, binary=(i == 2))
    raise ValueError(f"Unknown vectorizer option {i!r}; expected 1, 2 or 3")
# Vectorizer used by the rest of the notebook: option 1 (TF-IDF).
vec = chooseVector(1)


In [10]:
# Pre-cleaned training comments, one array element per line of the file.
X_train_origin = read_from_file(CLEAN_TRAIN_DATA_FILE)

In [None]:
# Pre-cleaned test comments, one array element per line of the file.
X_test_origin = read_from_file(CLEAN_TEST_DATA_FILE)

In [12]:
# Replace the raw comment text with the pre-cleaned text.
# Optional speed-up: uncomment the slicing lines below to work on a subset of
# the rows. All cells below then run faster, but accuracy is not guaranteed.

# Fail early, before either frame is modified, if a cleaned file does not have
# exactly one line per dataframe row.
for _frame_name, _frame, _texts in (('train', train, X_train_origin),
                                    ('test', test, X_test_origin)):
    if len(_frame) != len(_texts):
        raise ValueError(
            f"Cleaned {_frame_name} text has {len(_texts)} lines but the "
            f"{_frame_name} dataframe has {len(_frame)} rows"
        )

train.loc[:, 'comment_text'] = X_train_origin
# train = train[:1000]
test.loc[:, 'comment_text'] = X_test_origin
# test = test[:1000]

# Model

In [13]:
# Suppress FutureWarning messages for the remainder of the session.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# Test-set probabilities: one row per test comment, one column per class.
# trainGrid fills in one column per call.
preds = np.zeros(shape=(len(test), len(classes)))

# Per-label grid-search result; 0 is a placeholder until trainGrid stores the
# best parameters found for that label.
best_C_dict = dict.fromkeys(
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
    0,
)
def trainGrid(train, num_col, label, vectorizer=vec):
    """Tune, fit and apply a Naive-Bayes-weighted linear SVM for one label.

    Workflow:
      1. Take a 70% split of ``train`` (fixed random_state) and fit
         ``vectorizer`` on its ``comment_text``.
      2. Compute the smoothed log-count ratio ``r`` between rows where
         ``label`` is 1 and rows where it is 0, and scale the features by it.
      3. Grid-search ``C`` and ``loss`` for ``LinearSVC`` (3-fold CV) on the
         scaled split.
      4. Fit a calibrated ``LinearSVC`` with the best parameters on ALL rows
         of ``train`` (transformed with the vectorizer and ``r`` from the
         70% split) and predict probabilities for the global ``test`` frame.

    Parameters
    ----------
    train : DataFrame with a ``comment_text`` column and a ``label`` column.
    num_col : column index of the global ``preds`` array to fill.
    label : name of the target column in ``train``.
    vectorizer : text vectorizer; it is (re)fitted in place by this call.

    Side effects
    ------------
    * ``best_C_dict[label]`` is set to the grid search's ``best_params_``.
    * ``preds[:, num_col]`` is overwritten with the positive-class
      probabilities for ``test.comment_text``.
    """
    # Only the 70% portion is used for fitting the vectorizer, the ratio and
    # the grid search; the held-out portion is not used by this function.
    X_train, _, y_train, _ = train_test_split(train, train[label], test_size=0.3, random_state=50)

    # Vectorize the split.
    X_train_trans = vectorizer.fit_transform(X_train['comment_text'])
    y = y_train.values

    # Naive Bayes part: smoothed per-feature sums for each class.
    def pr(y_i):
        p = X_train_trans[y == y_i].sum(0)
        return (p + 1) / ((y == y_i).sum() + 1)

    r = np.log(pr(1) / pr(0))
    x_nb = X_train_trans.multiply(r)

    # Grid search for the best LinearSVC parameters on the scaled features.
    param_grid = {'C': [0.1, 1, 10, 100, 1000], 'loss': ['hinge', 'squared_hinge']}
    grid = GridSearchCV(LinearSVC(), param_grid, refit=True, cv=3, verbose=2)
    grid.fit(x_nb, y)

    # Remember the best parameters for this label.
    best_C_dict[label] = grid.best_params_

    # Train the final calibrated model with those parameters. ``penalty`` is
    # passed by keyword: scikit-learn no longer accepts it positionally.
    model = CalibratedClassifierCV(
        LinearSVC(penalty='l2', loss=grid.best_params_["loss"], C=grid.best_params_["C"]),
        cv=3,
    )
    train_trans = vectorizer.transform(train.comment_text)
    test_trans = vectorizer.transform(test.comment_text)
    fitted_model = model.fit(train_trans.multiply(r), train[label])

    # Store the positive-class probabilities in this label's column.
    preds[:, num_col] = fitted_model.predict_proba(test_trans.multiply(r))[:, 1]

In [None]:
# Show the best grid-search parameters recorded for each label.
# NOTE(review): this cell sits before the cell that calls trainGrid, so on a
# top-to-bottom run it prints the initial 0 placeholders -- consider running
# it after the training loop.
print(best_C_dict)

In [None]:
# Train one model per class; each call fills the matching column of `preds`.
for col_idx, label in enumerate(classes):
    print(label)
    trainGrid(train, num_col=col_idx, label=label, vectorizer=vec)

In [None]:
# Assemble the submission: the ids from the sample submission, followed by
# one probability column per class, then write it without the index.
submid = pd.DataFrame({'id': submission["id"]})
pred_frame = pd.DataFrame(preds, columns=classes)
SUB = pd.concat([submid, pred_frame], axis=1)
SUB.to_csv('submission.csv', index=False)