In [2]:
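# Change the current working directory to the project root (/toxic-comment-classification),
# which is identified by the "checkcwd" marker file. Run this cell at least once so that
# the relative paths used in the rest of the notebook resolve correctly.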
import os
from os import chdir, path, getcwd
# Climb towards the filesystem root, one parent directory at a time and at
# most ten times, stopping as soon as the "checkcwd" marker file is visible
# in the current working directory.
levels_climbed = 0
while levels_climbed < 10 and not path.isfile("checkcwd"):
    chdir(path.pardir)
    levels_climbed += 1

# The marker is checked once more after the walk: if it is still missing,
# the project root was not found within ten levels and we give up loudly.
if not path.isfile("checkcwd"):
    raise Exception("Something went wrong. cwd=" + getcwd())

# Remember the directory we ended up in (the one holding the marker file).
root_path = os.getcwd()

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from gc import collect
from joblib import dump

In [6]:
def choose_word_char_vec():
    """Create the two (unfitted) TF-IDF vectorizers used as features.

    Returns
    -------
    tuple
        ``(word_vectorizer, char_vectorizer)``: the first works on word
        1-2-grams (at most 20000 features, English stop words removed), the
        second on character 3-6-grams (at most 40000 features).
    """
    # Options common to both vectorizers.
    shared_options = dict(sublinear_tf=True, strip_accents='unicode')

    word_level = TfidfVectorizer(
        analyzer='word',
        token_pattern=r'\w{1,}',
        stop_words='english',
        ngram_range=(1, 2),
        max_features=20000,
        **shared_options
    )
    char_level = TfidfVectorizer(
        analyzer='char',
        ngram_range=(3, 6),
        max_features=40000,
        **shared_options
    )
    return word_level, char_level


# Module-level vectorizers that the later cells fit and reuse.
word_vectorizer, char_vectorizer = choose_word_char_vec()


In [7]:
# Directory prefixes (all relative to the current working directory).
# NOTE: `path` rebinds the name that the first cell imported from `os`;
# re-running that cell restores the module binding.
path = 'kaggle/input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
clean_data_path = 'clean_data/'

# Pre-trained embeddings.
EMBEDDING_FILE = path + 'glove_embeddings/glove.6B.300d.txt'

# Files located in the competition input directory.
TRAIN_DATA_FILE = path + comp + 'train.csv.zip'
TEST_DATA_FILE = path + comp + 'test.csv.zip'
SAMPLE_SUBMISSION = path + comp + 'sample_submission.csv.zip'

# Cleaned text files.
CLEAN_TRAIN_DATA_FILE = clean_data_path + 'data_train_cleaned_light2.txt'
CLEAN_TEST_DATA_FILE = clean_data_path + 'data_test_cleaned_light2.txt'

In [8]:
# Directory that receives the dumped vectorizers and per-label models.
save_path = 'model_checkpoint/nb_Svm/'
# Writing a file into a missing directory fails, so make sure the checkpoint
# directory exists up front (no-op when it is already there).
os.makedirs(save_path, exist_ok=True)

In [9]:
def read_from_file(filename):
    """Read a text file into a pandas Series with one element per line.

    Lines are split on newline characters only. ``str.splitlines()`` is
    deliberately avoided because it also breaks on form feeds, vertical tabs,
    the ASCII file/group/record separators and the Unicode line/paragraph
    separators; a document containing any of those would be split into
    several rows and the Series would no longer line up with per-line data
    kept elsewhere.

    Parameters
    ----------
    filename : str
        Path of the text file to read (opened in text mode with the default
        encoding).

    Returns
    -------
    pandas.Series
        The lines of the file in order, without their trailing newline.
    """
    with open(filename, 'r') as f:
        # Text-mode iteration yields one line per "\n" (universal newlines
        # already translate "\r\n" and "\r" to "\n").
        return pd.Series([line.rstrip('\n') for line in f])

In [10]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC


In [11]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Calibrated linear SVM trained on ratio-scaled sparse features.

    Every feature column is multiplied by ``log(pr(x, 1, y) / pr(x, 0, y))``
    (see :meth:`pr`) before a ``LinearSVC`` wrapped in
    ``CalibratedClassifierCV`` is fitted on the scaled matrix. Labels are
    expected to take the values 0 and 1, since those are the two classes the
    ratio is computed for.

    Parameters
    ----------
    C, penalty, loss
        Passed unchanged to ``LinearSVC``.

    Attributes
    ----------
    _r : scipy.sparse.csr_matrix
        Single-row matrix of per-feature log ratios, set by :meth:`pre_fit`.
    _clf : CalibratedClassifierCV
        The fitted calibrated classifier, set by :meth:`fit`.
    x_nb
        Scaled training matrix cached by :meth:`pre_fit`; reset to ``None``
        at the end of :meth:`fit`.
    """

    def __init__(self, C=1.0, penalty='l2', loss='squared_hinge'):
        self.C = C
        self.penalty = penalty
        self.loss = loss

    def pr(self, x, y_i, y):
        """Return the add-one smoothed per-feature mean over rows of class ``y_i``.

        The column sums of the rows where ``y == y_i`` are incremented by one
        and divided by (number of such rows + 1).
        """
        in_class = (y == y_i)
        p = x[in_class].sum(0)
        return (p + 1) / (in_class.sum() + 1)

    def predict(self, x):
        """Predict class labels for ``x`` after applying the learned scaling."""
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        """Predict calibrated class probabilities for ``x`` after scaling."""
        return self._clf.predict_proba(x.multiply(self._r))

    def pre_fit(self, x, y):
        """Compute the log-ratio row ``_r`` and cache the scaled matrix ``x_nb``.

        Calling this before :meth:`fit` is optional; :meth:`fit` runs it
        itself when no cached matrix is available.
        """
        self._r = sparse.csr_matrix(np.log(self.pr(x, 1, y) / self.pr(x, 0, y)))
        self.x_nb = x.multiply(self._r)

    def fit(self, x, y):
        """Fit the calibrated SVM and return ``self``.

        If :meth:`pre_fit` was called beforehand its cached matrix is used
        (as before); otherwise the scaling is computed from ``x`` and ``y``
        here, so ``fit`` also works on its own.
        """
        if getattr(self, 'x_nb', None) is None:
            self.pre_fit(x, y)

        svm = LinearSVC(penalty=self.penalty, C=self.C, loss=self.loss)
        self._clf = CalibratedClassifierCV(svm).fit(self.x_nb, y)

        # The scaled training matrix is only needed during fitting. Dropping
        # it keeps the estimator small in memory and prevents the whole
        # training set from being serialized with every dumped model.
        self.x_nb = None

        return self

In [13]:
# Load the cleaned texts, one document per line.
X_train = read_from_file(CLEAN_TRAIN_DATA_FILE)
X_test = read_from_file(CLEAN_TEST_DATA_FILE)

collect()

# Fit both vectorizers on the training texts only and persist them next to
# the models so the same vocabulary can be reloaded later.
train_word_features = word_vectorizer.fit_transform(X_train)
dump(word_vectorizer, save_path + 'word_vectorizer.bin', compress=True)
train_char_features = char_vectorizer.fit_transform(X_train)
dump(char_vectorizer, save_path + 'char_vectorizer.bin', compress=True)
del X_train
collect()

# Word and character features side by side. The result format is requested
# explicitly: without `format`, hstack may return a COO matrix, which does
# not support the boolean row indexing done in NbSvmClassifier.pr.
X_t = sparse.hstack([train_word_features, train_char_features], format='csr')

del train_word_features
del train_char_features
collect()

# Transform the test texts with the vectorizers fitted above.
test_word_features = word_vectorizer.transform(X_test)
test_char_features = char_vectorizer.transform(X_test)

del X_test
collect()

X_te = sparse.hstack([test_word_features, test_char_features], format='csr')
del test_word_features
del test_char_features
collect()

# Target columns, in the order used for the label matrix and the submission.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [14]:
# Raw competition tables; only ids, labels and row counts are taken from them.
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

# Label matrix with one column per entry of list_classes.
y = train.loc[:, list_classes].values

# Number of rows in each table.
train_shape, test_shape = len(train), len(test)

# Id-only frames used as the left-hand side of prediction tables.
oof = pd.DataFrame({'id': train['id']})
submid = pd.DataFrame.from_dict({'id': test['id']})

In [15]:
# The raw tables are no longer needed once ids, labels and row counts have
# been extracted; drop them and run a garbage-collection pass.
del train, test
collect()

0

In [16]:
collect()

# One prediction column per target label (derived from list_classes instead
# of a hard-coded count so it always matches the submission columns below).
predict = np.zeros((test_shape, len(list_classes)))

# Train one independent binary model per label, persist it, and store its
# positive-class probability for every test row.
for i in range(len(list_classes)):

    model = NbSvmClassifier(C=0.1, penalty='l2', loss='hinge')
    model.pre_fit(X_t, y[:, i])
    model.fit(X_t, y[:, i])
    dump(model, save_path + 'nb_Svm_' + str(i) + '.bin', compress=True)
    predict[:, i] = model.predict_proba(X_te)[:, 1]
    del model
    collect()


# Ids followed by one probability column per label.
submission = pd.concat([submid, pd.DataFrame(predict, columns=list_classes)], axis=1)

submission_file = 'kaggle/working/nb_Svm/1fold_nb_Svm.csv'
# to_csv does not create missing directories; make sure the target exists so
# the (expensive) training above is not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(submission_file), exist_ok=True)
submission.to_csv(submission_file, index=False)

