## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import os
# Echo the full path of every file found anywhere below the Kaggle input folder.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [2]:
# Make the cleaned-data folder the working directory so the relative file names
# read by the following cells (labels.csv, data_*_cleaned_*.txt) resolve there.
%cd D:\\python\\Toxic-comment-classification\\clean_data

D:\python\Toxic-comment-classification\clean_data


In [3]:
def write_to_file(data, filename, encoding='utf-8'):
    """Write every string in ``data`` to ``filename``, one record per line.

    Parameters
    ----------
    data : iterable of str
        Records to store; a newline is appended to each one.
    filename : str or path-like
        Destination file (overwritten if it exists).
    encoding : str, default 'utf-8'
        Text encoding. It is set explicitly so the file does not depend on the
        platform's locale encoding and matches what ``read_from_file`` expects.
    """
    with open(filename, 'w', encoding=encoding) as f:
        f.writelines(line + '\n' for line in data)


def read_from_file(filename, encoding='utf-8'):
    """Read a file produced by ``write_to_file`` back into a NumPy array.

    Parameters
    ----------
    filename : str or path-like
        File to read.
    encoding : str, default 'utf-8'
        Text encoding; must match the one used when the file was written.

    Returns
    -------
    numpy.ndarray
        One element per line of the file, without the trailing newline.
    """
    with open(filename, 'r', encoding=encoding) as f:
        # Iterate the file instead of using str.splitlines(): splitlines() also
        # breaks on form feeds, U+2028 and similar characters, which would split
        # one written record into several and misalign texts with their labels.
        return np.array([line.rstrip('\n') for line in f])

In [4]:
# Load the label table; its column headers are used as the class names below.
labels_path = 'labels.csv'
y_train = pd.read_csv(labels_path)
classes = y_train.columns.values
classes

array(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'], dtype=object)

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Word uni- and bi-gram TF-IDF features; terms seen in fewer than 3 documents
# or in more than 90% of them are dropped.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

## Data cleaned vanilla

In [6]:
# Read the "vanilla"-cleaned train and test texts (train first, then test).
vanilla_files = ('data_train_cleaned_vanilla.txt', 'data_test_cleaned_vanilla.txt')
X_train, X_test = (read_from_file(path) for path in vanilla_files)
X_train.shape, X_test.shape

((159571,), (153164,))

In [7]:
# Fit the vectorizer on the training texts only, then apply the same fitted
# vectorizer to the test texts.
train_doc = vec.fit_transform(raw_documents=X_train)
test_doc = vec.transform(raw_documents=X_test)
vec.get_feature_names_out()

array(['__', '__ once', '___', ..., '連絡 見学', '雲水', '雲水 http'],
      dtype=object)

In [8]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
from sklearn.metrics import roc_auc_score

# Mixins go on the left and BaseEstimator on the right, as recommended by the
# scikit-learn developer guide, so the classifier mixin is resolved first.
class NbSvmClassifier(ClassifierMixin, BaseEstimator):
    """Binary classifier: logistic regression on naive-Bayes-scaled features.

    ``fit`` computes one log-count ratio per feature (class 1 vs. class 0),
    multiplies every feature column by its ratio and trains a
    ``LogisticRegression`` on the scaled matrix. Despite the class name, the
    linear model used here is logistic regression.

    Parameters
    ----------
    C : float, default 1.0
        Passed to ``LogisticRegression`` as ``C``.
    solver : str, default 'lbfgs'
        Passed to ``LogisticRegression`` as ``solver``.
    dual : bool, default False
        Passed to ``LogisticRegression`` as ``dual``.
    n_jobs : int, default 1
        Passed to ``LogisticRegression`` as ``n_jobs``.
    """

    def __init__(self, C=1.0, solver = 'lbfgs', dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.solver = solver
        self.n_jobs = n_jobs

    def predict(self, x):
        """Return the predicted label for each row of the sparse matrix ``x``."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        """Return the per-class probabilities for each row of the sparse matrix ``x``."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        """Compute the log-count ratios and fit the logistic regression.

        Parameters
        ----------
        x : scipy sparse matrix
            Feature matrix; it must support ``.multiply`` and boolean row
            indexing.
        y : array-like
            Binary targets. The ratio is computed between the rows labelled 1
            and the rows labelled 0, so labels are expected to be 0/1.

        Returns
        -------
        self
        """
        # check_X_y validates shapes and converts pandas objects, lists and
        # arrays alike, so `y` does not need to expose a `.values` attribute.
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            # Smoothed per-feature sum over the rows of class y_i:
            # (column sums + 1) / (number of rows in the class + 1).
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        # One log ratio per feature, stored as a 1 x n_features sparse row so it
        # broadcasts over the rows of a sparse matrix in `multiply`.
        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(solver = self.solver, C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        # Expose the fitted labels under the attribute name scikit-learn expects.
        self.classes_ = self._clf.classes_
        return self


In [9]:
# One output column per class: an independent binary model is fitted for each
# label column and its class-1 probabilities are stored in that column.
preds = np.zeros(shape=(len(X_test), len(classes)))

for i, col in enumerate(classes):
    print(col)
    model = NbSvmClassifier(C=4, n_jobs=-1)
    model.fit(train_doc, y_train[col])
    # Second column of predict_proba (presumably the positive class for 0/1 labels).
    positive_proba = model.predict_proba(test_doc)[:, 1]
    preds[:, i] = positive_proba

toxic
severe_toxic
obscene
threat
insult
identity_hate


In [10]:
# Raw string literal: the Windows path is full of backslashes, and sequences
# such as "\p", "\T" or "\s" are invalid escapes in a normal string literal
# (they trigger a warning today and are slated to become an error).
sample_submission = pd.read_csv(r'D:\python\Toxic-comment-classification\kaggle\input\jigsaw-toxic-comment-classification-challenge\sample_submission.csv.zip')

In [11]:
# Switch to the working folder so the submission CSV written with a relative
# file name ends up there.
%cd D:\\python\\Toxic-comment-classification\\kaggle\\working

D:\python\Toxic-comment-classification\kaggle\working


In [12]:
# Submission layout: the id column from the sample file, followed by one
# probability column per class.
submid = pd.DataFrame({'id': sample_submission["id"]})
class_probs = pd.DataFrame(preds, columns=classes)
submission = pd.concat([submid, class_probs], axis=1)
submission.to_csv('vanilla_submission.csv', index=False)

In [13]:
# Go back to the cleaned-data folder so the text files read next via relative
# names resolve there.
%cd D:\\python\\Toxic-comment-classification\\clean_data

D:\python\Toxic-comment-classification\clean_data


In [15]:
# Replace the texts with the "light"-cleaned train and test files (train first, then test).
light_files = ('data_train_cleaned_light.txt', 'data_test_cleaned_light.txt')
X_train, X_test = (read_from_file(path) for path in light_files)

In [16]:
# Re-fit the same vectorizer object on the new training texts, then apply the
# refreshed fit to the new test texts.
train_doc = vec.fit_transform(raw_documents=X_train)
test_doc = vec.transform(raw_documents=X_test)
vec.get_feature_names_out()

array(['10', '10 amendment', '10 century', ..., '連絡 見学', '雲水', '雲水 http'],
      dtype=object)

In [None]:
# One output column per class: an independent binary model is fitted for each
# label column and its class-1 probabilities are stored in that column.
preds = np.zeros(shape=(len(X_test), len(classes)))

for i, col in enumerate(classes):
    print(col)
    model = NbSvmClassifier(C=4, n_jobs=-1)
    model.fit(train_doc, y_train[col])
    # Second column of predict_proba (presumably the positive class for 0/1 labels).
    positive_proba = model.predict_proba(test_doc)[:, 1]
    preds[:, i] = positive_proba