## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
    # for filename in filenames:
    #     print(os.path.join(dirname, filename))

In [2]:
from os import chdir, path, getcwd
# When the kernel was started from a directory whose path ends in "src",
# step up one level before doing anything else.
if getcwd().endswith("src"):
    chdir(path.pardir)

# A file named "checkcwd" must exist in the working directory; abort early when
# it is missing, because the cells below open files through relative paths.
if not path.isfile("checkcwd"):
    raise Exception("Something went wrong. cwd=" + getcwd())
print("Success")

Success


In [3]:
def write_to_file(data, filename):
    """Write every item of ``data`` to ``filename``, one item per line.

    The file is opened in text mode with ``'w'``, so existing content is
    replaced. A newline is appended to each item by string concatenation,
    which means every item has to be a ``str``.
    """
    with open(filename, 'w') as out:
        out.writelines(entry + '\n' for entry in data)

def read_from_file(filename):
    """Read a text file into a pandas Series with one element per line.

    Lines are delimited by newlines only, mirroring how ``write_to_file``
    terminates each record, and the trailing newline is removed from every
    element. A final line without a trailing newline is still returned.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read (opened in text mode).

    Returns
    -------
    pandas.Series
        The file's lines, in file order.
    """
    with open(filename, 'r') as f:
        # Iterate the handle instead of calling str.splitlines(): splitlines()
        # also breaks on \x0b, \x0c, \x1c-\x1e, \x85, \u2028 and \u2029, which
        # would turn one record containing such a character into several rows
        # and shift every following row against its label.
        return pd.Series([line.rstrip('\n') for line in f])

In [4]:
# Sample submission shipped with the competition data; a later cell takes its
# "id" column to build the submission file.
sample_submission = pd.read_csv(
    'kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
)

# Label table; its column names double as the list of target classes.
y_train = pd.read_csv('clean_data/labels.csv')
classes = y_train.columns.values
classes

array(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'], dtype=object)

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# vec = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True)

In [6]:
# Options that both TF-IDF vectorizers have in common.
_tfidf_shared = dict(sublinear_tf=True, strip_accents='unicode')

# Word-level unigrams: a token is any run of one or more word characters,
# English stop words are dropped, and at most 10000 features are kept.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000,
    **_tfidf_shared)

# Character n-grams of length 2 to 6, at most 50000 features.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000,
    **_tfidf_shared)

## Define NB-SVM model

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
from sklearn.metrics import roc_auc_score

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: logistic regression on log-count-ratio scaled features.

    ``fit`` computes, per feature, the log of the ratio between the smoothed
    mean feature value of the rows labelled 1 and of the rows labelled 0.
    Every input row is multiplied element-wise by that ratio vector before it
    is handed to a ``LogisticRegression``.

    Parameters
    ----------
    C, solver, dual, n_jobs
        Forwarded unchanged to ``sklearn.linear_model.LogisticRegression``.

    Attributes (set by ``fit``)
    ---------------------------
    _r : scipy.sparse.csr_matrix, shape (1, n_features)
        The log-count ratio used to scale the features.
    _clf : LogisticRegression
        The fitted underlying model.
    """

    def __init__(self, C=1.0, solver = 'lbfgs', dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.solver = solver
        self.n_jobs = n_jobs

    def _scale(self, x):
        """Return ``x`` with every row multiplied element-wise by ``self._r``.

        ``x`` is converted to CSR first so that dense arrays and other sparse
        formats are handled the same way as CSR input.
        """
        return sparse.csr_matrix(x).multiply(self._r)

    def predict(self, x):
        """Predict class labels for the rows of ``x`` (requires a prior ``fit``)."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(self._scale(x))

    def predict_proba(self, x):
        """Return class probabilities for the rows of ``x`` (requires a prior ``fit``)."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(self._scale(x))

    def fit(self, x, y):
        """Compute the log-count ratio from ``(x, y)`` and fit the logistic model.

        Parameters
        ----------
        x : array-like or sparse matrix, shape (n_samples, n_features)
        y : array-like, shape (n_samples,)
            Binary labels; rows are split by ``y == 1`` and ``y == 0``.
            pandas Series, NumPy arrays and lists are all accepted.

        Returns
        -------
        self
        """
        # check_X_y validates shapes and converts y to a 1-D array itself, so no
        # pandas-only ``y.values`` is needed. Requesting CSR guarantees that the
        # boolean row selection below works (COO matrices are not subscriptable).
        x, y = check_X_y(x, y, accept_sparse='csr')
        x = sparse.csr_matrix(x)  # dense input passes check_X_y as an ndarray

        def pr(y_i):
            # Add-one smoothed per-feature sum over the rows of class y_i,
            # divided by the (smoothed) number of such rows.
            p = x[y == y_i].sum(0)
            return (p + 1) / ((y == y_i).sum() + 1)

        self._r = sparse.csr_matrix(np.log(pr(1) / pr(0)))
        x_nb = self._scale(x)
        self._clf = LogisticRegression(solver = self.solver, C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

D:\python\Toxic-comment-classification\kaggle\working


In [9]:
def submit_from_file(version, num_folds=10, output_prefix='5'):
    """Train one model per class with K-fold CV and write submission and OOF files.

    Steps:
      1. Read ``clean_data/data_train_cleaned_<version>.txt`` and
         ``clean_data/data_test_cleaned_<version>.txt``.
      2. Fit the module-level ``word_vectorizer`` and ``char_vectorizer`` on the
         train and test texts together (this refits those shared objects) and
         stack their features side by side.
      3. For every fold and every column in ``classes``, fit an
         ``NbSvmClassifier(C=4)``; average its test-set probabilities over the
         folds and store its probabilities for the held-out rows.
      4. Write ``kaggle/working/<output_prefix>_sub.csv`` and
         ``kaggle/working/<output_prefix>_oof.csv``.

    Parameters
    ----------
    version : str
        Suffix identifying which cleaned text files to load.
    num_folds : int, default 10
        Number of K-fold splits.
    output_prefix : str, default '5'
        Prefix of the two output file names. The output paths do not depend on
        ``version``, so a second call with the same prefix overwrites the
        files of the first.

    Raises
    ------
    ValueError
        If the number of training texts differs from the number of label rows,
        or the number of test texts differs from the sample submission.
    """
    from sklearn.model_selection import KFold

    data_dir = 'clean_data/'
    X_train = read_from_file(data_dir + 'data_train_cleaned_' + version + '.txt')
    X_test = read_from_file(data_dir + 'data_test_cleaned_' + version + '.txt')

    # Texts and labels are matched purely by position, so a length mismatch
    # would otherwise train on misaligned labels or emit NaN rows.
    if len(X_train) != len(y_train):
        raise ValueError('Row mismatch: %d training texts but %d label rows'
                         % (len(X_train), len(y_train)))
    if len(X_test) != len(sample_submission):
        raise ValueError('Row mismatch: %d test texts but %d sample submission rows'
                         % (len(X_test), len(sample_submission)))

    full_text = np.concatenate([X_train, X_test])
    word_vectorizer.fit(full_text)
    train_word_features = word_vectorizer.transform(X_train)
    test_word_features = word_vectorizer.transform(X_test)
    print(word_vectorizer.get_feature_names_out())
    char_vectorizer.fit(full_text)
    train_char_features = char_vectorizer.transform(X_train)
    test_char_features = char_vectorizer.transform(X_test)
    # Ask for CSR explicitly: depending on the SciPy version hstack may return
    # COO, which cannot be row-indexed by the fold indices below.
    train_doc = sparse.hstack([train_word_features, train_char_features], format='csr')
    test_doc = sparse.hstack([test_word_features, test_char_features], format='csr')
    print(char_vectorizer.get_feature_names_out())
    print(train_doc.shape)

    preds = np.zeros((len(X_test), len(classes)))
    oof_predict = np.zeros((len(X_train), len(classes)))
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    for train_index, valid_index in kf.split(train_doc):
        kfold_X_train = train_doc[train_index]
        kfold_X_valid = train_doc[valid_index]
        # y_train is a DataFrame: rows must be taken positionally with .iloc
        # (plain [] would look the integers up as column labels).
        kfold_y_train = y_train.iloc[train_index]

        for i, col in enumerate(classes):
            model = NbSvmClassifier(C=4, n_jobs=-1).fit(kfold_X_train, kfold_y_train[col])
            # Column 1 of predict_proba is taken as the positive-class probability.
            preds[:, i] += model.predict_proba(test_doc)[:, 1] / num_folds
            oof_predict[valid_index, i] = model.predict_proba(kfold_X_valid)[:, 1]

    print('Done')

    submid = pd.DataFrame({'id': sample_submission["id"]})
    submission = pd.concat([submid, pd.DataFrame(preds, columns = classes)], axis=1)
    submission.to_csv('kaggle/working/' + output_prefix + '_sub.csv', index=False)

    # The raw training file is only needed for its "id" column.
    train = pd.read_csv('kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
    oof = pd.DataFrame({'id': train['id']})
    del train

    for i, col in enumerate(classes):
        oof[col] = oof_predict[:, i]
    oof.to_csv('kaggle/working/' + output_prefix + '_oof.csv', index=False)



## Vanilla

In [10]:
# Run the whole pipeline on the "vanilla2" text files
# (clean_data/data_train_cleaned_vanilla2.txt and
# clean_data/data_test_cleaned_vanilla2.txt).
submit_from_file('vanilla2')

['aa' 'aaaaa' 'aaaaaaaaaaaaaaaa' ... 'zu' 'zuck' 'zuckerberg']
[' >' ' > ' ' a' ... 'zzzz' 'zzzzz' 'zzzzzz']
(159571, 60000)
toxic
severe_toxic
obscene
threat
insult
identity_hate


## Light

In [11]:
# submit_from_file('light')

['aa' 'aaaaa' 'aaaaaaaaaaaaaaaa' ... 'zuck' 'zuckerberg' 'zum']
[' <' ' >' ' > ' ... 'zzzz' 'zzzzz' 'zzzzzz']
(159571, 60000)
toxic
severe_toxic
obscene
threat
insult
identity_hate


## Spacy

In [12]:
# submit_from_file('spacy')