In [1]:
# Change the current working directory to the project root (/toxic-comment-classification).
# Run this cell at least once, before any other cell, so that the relative paths below resolve.
from os import chdir, path, getcwd

# When the kernel was started inside a directory whose name ends with "src",
# step up one level; then require that a file named "checkcwd" is visible from
# the resulting working directory.
if getcwd().endswith("src"):
    chdir(path.pardir)

if not path.isfile("checkcwd"):
    raise Exception("Something went wrong. cwd=" + getcwd())

print("Success")

Success


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from gc import collect

In [3]:
def choose_word_char_vec(i=1):
    """Build the (word-level, char-level) vectorizer pair for one feature scheme.

    Parameters
    ----------
    i : int, default 1
        Which feature scheme to build:
        1 - TF-IDF (sublinear tf); word 1-2 grams capped at 20000 features,
            char 3-6 grams capped at 40000 features.
        2 - binary bag-of-words (``CountVectorizer(binary=True)``).
        3 - raw-count bag-of-words (``CountVectorizer(binary=False)``).
        Schemes 2 and 3 use 1-2 grams, ``min_df=3``, ``max_df=0.9`` and at most
        20000 features for both the word and the char vectorizer.

    Returns
    -------
    tuple
        ``(word_vectorizer, char_vectorizer)``, both unfitted.

    Raises
    ------
    ValueError
        If ``i`` is not 1, 2 or 3.
    """
    # Validate first: previously an unknown option fell through every branch and
    # failed at the return statement with an UnboundLocalError.
    if i not in (1, 2, 3):
        raise ValueError(f"i must be 1, 2 or 3, got {i!r}")

    if i == 1:
        word_vectorizer = TfidfVectorizer(
            sublinear_tf=True, strip_accents='unicode', analyzer='word',
            token_pattern=r'\w{1,}', stop_words='english',
            ngram_range=(1, 2), max_features=20000,
        )
        char_vectorizer = TfidfVectorizer(
            sublinear_tf=True, strip_accents='unicode', analyzer='char',
            ngram_range=(3, 6), max_features=40000,
        )
    else:
        # Options 2 and 3 are identical except for the `binary` flag.
        shared = dict(
            ngram_range=(1, 2), min_df=3, max_df=0.9,
            strip_accents='unicode', max_features=20000, binary=(i == 2),
        )
        word_vectorizer = CountVectorizer(
            analyzer='word', token_pattern=r'\w{1,}', stop_words='english', **shared
        )
        # token_pattern / stop_words are only honoured when analyzer == 'word';
        # passing them to a char-level vectorizer has no effect beyond a warning,
        # so they are left out here.
        char_vectorizer = CountVectorizer(analyzer='char', **shared)

    return word_vectorizer, char_vectorizer
# Select the feature scheme used by the rest of the notebook. Change the
# argument to switch: i=1 -> TF-IDF, i=2 -> binary bag-of-words,
# i=3 -> raw-count bag-of-words.
word_vectorizer, char_vectorizer = choose_word_char_vec(i=1)       # choose TF-IDF feature

In [4]:
# Location of the input data, relative to the project root.
# NOTE: the prefix is deliberately not called `path`: the first cell imports
# `path` from `os`, and rebinding that name to a string makes the cwd check
# fail when it is re-run.
input_path = 'kaggle/input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
clean_data_path = 'clean_data/'

# Raw competition files and embeddings.
EMBEDDING_FILE = f'{input_path}glove_embeddings/glove.6B.300d.txt'
TRAIN_DATA_FILE = f'{input_path}{comp}train.csv.zip'
TEST_DATA_FILE = f'{input_path}{comp}test.csv.zip'
SAMPLE_SUBMISSION = f'{input_path}{comp}sample_submission.csv.zip'

# Pre-cleaned comment text (read line by line by read_from_file).
CLEAN_TRAIN_DATA_FILE = f'{clean_data_path}data_train_cleaned_light2.txt'
CLEAN_TEST_DATA_FILE = f'{clean_data_path}data_test_cleaned_light2.txt'

In [5]:
def read_from_file(filename, encoding='utf-8'):
    """Read a text file and return its lines as a pandas Series.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    encoding : str, default 'utf-8'
        Text encoding used to decode the file. It is passed explicitly so the
        result does not depend on the platform's default encoding.

    Returns
    -------
    pandas.Series
        One element per line as produced by ``str.splitlines()`` (line
        terminators removed).
    """
    with open(filename, 'r', encoding=encoding) as f:
        return pd.Series(f.read().splitlines())

In [6]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC


In [7]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: a calibrated LinearSVC trained on ratio-scaled features.

    Each feature column is multiplied by the log of the ratio between its
    smoothed average in the positive rows (label 1) and in the negative rows
    (label 0); a ``CalibratedClassifierCV(LinearSVC(...))`` is then fitted on
    the rescaled matrix.

    Parameters
    ----------
    C, penalty, loss
        Stored on the instance and forwarded unchanged to ``LinearSVC``.
    """

    def __init__(self, C=1.0, penalty='l2', loss='squared_hinge'):
        self.C = C
        self.penalty = penalty
        self.loss = loss

    def pr(self, x, y_i, y):
        """Return the add-one smoothed per-feature average over rows where ``y == y_i``.

        Computed as ``(column sums of the selected rows + 1) / (number of
        selected rows + 1)``.
        """
        in_class = (y == y_i)
        return (x[in_class].sum(0) + 1) / (in_class.sum() + 1)

    def predict(self, x):
        """Predict labels for ``x``; requires that ``fit`` has been called.

        ``x`` is rescaled with the ratio learned during fitting before being
        passed to the underlying classifier.
        # assumes x is a scipy sparse matrix (``.multiply`` is used) - TODO confirm
        """
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        """Return the underlying classifier's probabilities for ``x``.

        Requires that ``fit`` has been called; ``x`` is rescaled with the
        learned ratio first.
        """
        return self._clf.predict_proba(x.multiply(self._r))

    def pre_fit(self, x, y):
        """Compute the log-ratio vector ``_r`` and the rescaled matrix ``x_nb``.

        Kept as a separate public step for existing callers; ``fit`` performs
        it itself, so calling it beforehand is optional.
        """
        self._r = sparse.csr_matrix(np.log(self.pr(x, 1, y) / self.pr(x, 0, y)))
        self.x_nb = x.multiply(self._r)

    def fit(self, x, y):
        """Fit the model on ``(x, y)`` and return ``self``.

        The ratio and the rescaled training matrix are always rebuilt from the
        arguments given here. Previously ``fit`` ignored ``x`` and trained on
        whatever an earlier ``pre_fit`` call had cached, failing with
        AttributeError when ``pre_fit`` had not been called.
        """
        # Drop a matrix cached by an earlier pre_fit call before rebuilding it,
        # so two rescaled copies of the training data are not alive at once.
        self.x_nb = None
        self.pre_fit(x, y)

        svm = LinearSVC(penalty=self.penalty, C=self.C, loss=self.loss)
        self._clf = CalibratedClassifierCV(svm).fit(self.x_nb, y)

        return self

In [8]:
# Load the pre-cleaned comment text (one Series element per line of each file).
X_train = read_from_file(CLEAN_TRAIN_DATA_FILE)
X_test = read_from_file(CLEAN_TEST_DATA_FILE)

collect()

# Fit both vectorizers on the training text only.
train_word_features = word_vectorizer.fit_transform(X_train)
train_char_features = char_vectorizer.fit_transform(X_train)

del X_train
collect()

# Ask for CSR explicitly: the cross-validation loop slices rows with
# X_t[train_index], and without `format` the layout returned by hstack is
# left to scipy and is not guaranteed to support row indexing.
X_t = sparse.hstack([train_word_features, train_char_features], format='csr')

del train_word_features
del train_char_features
collect()

# Reuse the fitted vocabularies for the test text.
test_word_features = word_vectorizer.transform(X_test)
test_char_features = char_vectorizer.transform(X_test)

del X_test
collect()

X_te = sparse.hstack([test_word_features, test_char_features], format='csr')
del test_word_features
del test_char_features
collect()

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [9]:
# Read the competition CSVs; from here on only the label matrix, the row
# counts and the id columns are needed.
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

train_shape, test_shape = train.shape[0], test.shape[0]
y = train[list_classes].values

# Frames that will receive the test predictions (submid) and the
# out-of-fold predictions (oof), keyed by the original ids.
submid = pd.DataFrame({'id': test["id"]})
oof = pd.DataFrame.from_dict({'id': train['id']})

In [10]:
# The raw frames are no longer referenced below; drop them and run the
# garbage collector.
del train, test
collect()

0

In [11]:
from sklearn.model_selection import KFold
num_folds = 10
num_classes = len(list_classes)

collect()

scores = []

# The features come from the cleaned text files while the labels and ids come
# from the CSVs; fail early if the two sources do not have the same number of
# rows instead of training on misaligned data.
if X_t.shape[0] != train_shape or X_te.shape[0] != test_shape:
    raise ValueError(
        f"Row mismatch: features have {X_t.shape[0]} train / {X_te.shape[0]} test rows, "
        f"labels have {train_shape} train / {test_shape} test rows"
    )

oof_predict = np.zeros((train_shape, num_classes))
predict = np.zeros((test_shape, num_classes))
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Per-label hyper-parameters were obtained by running Complete_Grid_SVM and are
# selected manually here: these label indices (positions in list_classes) use
# the 'hinge' loss, every other label uses 'squared_hinge'.
hinge_label_indices = {0, 2, 4, 5}

for train_index, test_index in kf.split(X_t):

    kfold_y_train, kfold_y_test = y[train_index], y[test_index]
    kfold_X_train = X_t[train_index]
    kfold_X_valid = X_t[test_index]

    # One independent binary model per label.
    for i in range(num_classes):
        print('start')
        loss = 'hinge' if i in hinge_label_indices else 'squared_hinge'
        model = NbSvmClassifier(C=0.1, penalty='l2', loss=loss)

        model.pre_fit(kfold_X_train, kfold_y_train[:, i])
        model.fit(kfold_X_train, kfold_y_train[:, i])

        # Test predictions are averaged over the folds; out-of-fold predictions
        # fill only the rows held out in this fold.
        predict[:, i] += model.predict_proba(X_te)[:, 1] / num_folds
        oof_predict[test_index, i] = model.predict_proba(kfold_X_valid)[:, 1]
        del model
        collect()
        print('end')

    print('fold done')

print('Done')


submission = pd.concat([submid, pd.DataFrame(predict, columns=list_classes)], axis=1)
# Output files are named 6.x, where x is the option passed to
# choose_word_char_vec (use 6.2 / 6.3 for the bag-of-words variants).
submission.to_csv('kaggle/working/6.1_sub.csv', index=False)

# Attach the out-of-fold predictions column by column.
for col_idx, c in enumerate(list_classes):
    oof[c] = oof_predict[:, col_idx]

oof.to_csv('kaggle/working/6.1_oof.csv', index=False)

start




end
start




end
start




end
start




end
start




end
start




end
fold done
start




end
start




end
start




end
start




end
start




end
start




end
fold done
start




KeyboardInterrupt: 