## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
%cd D:\\python\\Toxic-comment-classification\\clean_data

D:\python\Toxic-comment-classification\clean_data


In [3]:
def write_to_file(data, filename):
    with open(filename, 'w') as f:
        for line in data:
            f.write(line + '\n')

def read_from_file(filename):
    with open(filename, 'r') as f:
        return np.array(f.read().splitlines())

In [4]:
y_train = pd.read_csv('labels.csv')
sample_submission = pd.read_csv('D:\python\Toxic-comment-classification\kaggle\input\jigsaw-toxic-comment-classification-challenge\sample_submission.csv.zip')
classes = y_train.columns.values
classes

array(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'], dtype=object)

## TF_IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# vec = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True)

In [6]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000)

## Define NB-SVM model

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
from sklearn.metrics import roc_auc_score

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, solver = 'lbfgs', dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.solver = solver
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(solver = self.solver, C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

In [8]:
%cd D:\\python\\Toxic-comment-classification\\kaggle\\working

D:\python\Toxic-comment-classification\kaggle\working


In [9]:
def submit_from_file(version):
    dir = 'D:\\python\\Toxic-comment-classification\\clean_data\\'
    X_train = read_from_file(dir + 'data_train_cleaned_' + version + '.txt')
    X_test = read_from_file(dir + 'data_test_cleaned_' + version + '.txt')
    full_text = np.concatenate([X_train, X_test])
    word_vectorizer.fit(full_text)
    train_word_features = word_vectorizer.transform(X_train)
    test_word_features = word_vectorizer.transform(X_test)
    print(word_vectorizer.get_feature_names_out())
    char_vectorizer.fit(full_text)
    train_char_features = char_vectorizer.transform(X_train)
    test_char_features = char_vectorizer.transform(X_test)
    train_doc = sparse.hstack([train_word_features, train_char_features])
    test_doc = sparse.hstack([test_word_features, test_char_features])
    print(char_vectorizer.get_feature_names_out())
    print(train_doc.shape)
    preds = np.zeros((len(X_test), len(classes)))
    for i, col in enumerate(classes):
        print(col)
        model = NbSvmClassifier(C=4, n_jobs=-1).fit(train_doc, y_train[col])
        preds[:, i] = model.predict_proba(test_doc)[:, 1]

    submid = pd.DataFrame({'id': sample_submission["id"]})
    submission = pd.concat([submid, pd.DataFrame(preds, columns = classes)], axis=1)
    submission.to_csv(version + '_submission.csv', index=False)

## Vanilla

In [10]:
submit_from_file('vanilla')



(159571, 60000)
['aa' 'aaaaa' 'aaaaaaaaaaaaaaaa' ... 'zu' 'zuck' 'zuckerberg']
[' >' ' > ' ' a' ... 'zzzz' 'zzzzz' 'zzzzzz']
toxic
severe_toxic
obscene
threat
insult
identity_hate


In [11]:
print(char_vectorizer.__dict__['vocabulary_'])

{'ex': 18579, 'xp': 48960, 'pl': 36784, 'la': 26757, 'an': 8069, 'na': 30395, 'at': 9426, 'ti': 44897, 'io': 24539, 'on': 34040, 'n ': 29668, ' w': 6161, 'wh': 48538, 'hy': 22460, 'y ': 49023, ' t': 5475, 'th': 44437, 'he': 21210, 'e ': 13927, ' e': 1710, 'ed': 15807, 'di': 13395, 'it': 25290, 'ts': 45692, 's ': 40083, ' m': 3488, 'ma': 28678, 'ad': 7123, 'de': 13138, ' u': 5945, 'un': 46785, 'nd': 30639, 'er': 17314, 'r ': 37409, 'my': 29615, 'us': 47242, 'se': 41231, 'rn': 39301, 'am': 7947, 'me': 28892, ' h': 2332, 'ha': 20936, 'ar': 8720, 'rd': 38228, 'dc': 13094, 'co': 11823, 'or': 34803, 're': 38300, 'et': 18265, 'ta': 43844, 'al': 7601, 'll': 27675, 'li': 27412, 'ic': 22986, 'ca': 10878, 'a ': 6489, ' f': 1928, 'fa': 18958, 'we': 48430, ' r': 4681, 'ev': 18427, 've': 47729, 'rt': 39782, 'te': 44055, 'd ': 12305, 'ey': 18656, 'en': 16816, 't ': 42902, ' v': 6080, 'va': 47677, 'da': 13006, 'is': 24833, 'sm': 41933, 'ms': 29537, ' j': 3168, 'ju': 25911, 'st': 42418, ' c': 1127, 'cl

## Light

In [22]:
submit_from_file('light')

['aa' 'aa talk' 'aaa' ... 'zzz' 'zzzz' 'zzzzz']
toxic
severe_toxic
obscene
threat
insult
identity_hate


## Spacy

In [23]:
submit_from_file('spacy')

['aa' 'aa aa' 'aa talk' ... 'zzz' 'zzzz' 'zzzzz']
toxic
severe_toxic
obscene
threat
insult
identity_hate
