In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def write_to_file(data, filename):
    with open(filename, 'w') as f:
        for line in data:
            f.write(line + '\n')

def read_from_file(filename):
    with open(filename, 'r') as f:
        return np.array(f.read().splitlines())

### **Datafram Train**

In [3]:
train = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
test = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
submission = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')

In [4]:
y_train_origin = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip")
classes = y_train_origin.columns.values[1:]
classes[0] 

'toxic'

### **Transform Tool**

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
vec = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True)
# vec = CountVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, max_features=20000, binary=True)  # Binary BoW
# vec = CountVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, max_features=20000, binary=False)   # Normal BoW

In [6]:
X_train_origin = read_from_file("/kaggle/input/clean-data-2/data_train_cleaned_light2.txt")
X_test_origin = read_from_file("/kaggle/input/clean-data-2/data_test_cleaned_light2.txt")


In [7]:
train.loc[:, 'comment_text'] = X_train_origin
# train = train[:1000]
test.loc[:, 'comment_text'] = X_test_origin
# test = test[:1000]

In [8]:
from sklearn.model_selection import train_test_split
# from nltk import ngrams, bigrams, trigrams
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV


In [9]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### **Model**

In [10]:
best_C_dict = {label:0 for label in ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]}
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import chi2, SelectKBest

preds = np.zeros((len(test), len(classes)))

def trainGrid (train, num_col, label, vectorizer = vec):
    X_train, X_test, y_train, y_test = train_test_split(train, train[label], test_size=0.3, random_state= 50)
    
    ########### transform
    X_train_trans = vectorizer.fit_transform(X_train['comment_text'])
    X_test_trans = vectorizer.transform(X_test['comment_text'])

    # print("Shape:" + str(X_train_trans.shape[1]))

    # selector = SelectKBest(chi2, k=int(0.2*X_train_trans.shape[1]))  # Select top 100 features (adjust k as needed)
    # X_train_trans = selector.fit_transform(X_train_trans, y_train)

    # print("Shape:" + str(X_train_trans.shape))

    # X_test_trans = selector.transform(X_test_trans)

    # print(vectorizer.fit_transform(X_test['comment_text']))
    # print(vectorizer.transform(X_test['comment_text']).shape)


    def pr(y_i, y):
        p = X_train_trans[y==y_i].sum(0)
        return (p+1) / ((y==y_i).sum()+1)
    def get_mdl(y):
        y = y.values
        r = np.log(pr(1,y) / pr(0,y))
        # print(r)
        return y,r
    print(X_train[label].shape)
    y,r = get_mdl(X_train[label])
    x_nb = X_train_trans.multiply(r)
    x_nb = X_train_trans
    
    # param_grid = {'C':[1, 10, 50, 100, 250, 500, 750, 1000]}
    param_grid = {
        'C': [0.1, 1, 10, 100, 1000],'loss': ['hinge', 'squared_hinge']}
    grid = GridSearchCV(LinearSVC(),param_grid,refit = True, cv= 3, verbose=2)
    grid.fit(x_nb, y)
    # print(X_test_trans.shape)
    prediction = grid.predict(X_test_trans.multiply(r))
    # prediction = grid.predict(X_test_trans)
    print(f'Classification report for {label}:\n',classification_report(y_test,prediction))
    print(f'Confusion matrix for {label}:\n',confusion_matrix(y_test, prediction)) 
    
    best_C_dict[label] = grid.best_params_
    print(f'Best C for {label}:\n',grid.best_params_)

    model = CalibratedClassifierCV(LinearSVC('l2', loss = grid.best_params_["loss"],C = grid.best_params_["C"]), cv = 3)
    train_trans = vectorizer.transform(train.comment_text)

    # train_trans = selector.fit_transform(train_trans, train[label])

    print(train_trans.shape, train[label].shape)
    fitted_model = model.fit(train_trans.multiply(r), train[label])
    # fitted_model = model.fit(train_trans, train[label])
    test_trans = vectorizer.transform(test.comment_text)

    # test_trans = selector.transform(test_trans)

    # print(test_trans.shape)
    # print(preds.shape)
    preds[:,num_col] = fitted_model.predict_proba(test_trans.multiply(r))[:,1]
    # preds[:,num_col] = fitted_model.predict_proba(test_trans)[:,1]

# trainGrid(train, 0, 'toxic', vec)

In [11]:
for i, label in enumerate(classes):
    print(label)
    trainGrid(train, i, label, vec)

toxic
(111699,)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.9s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.9s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.8s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.7s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.6s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.6s




[CV] END ....................................C=1, loss=hinge; total time=   6.5s




[CV] END ....................................C=1, loss=hinge; total time=   6.2s
[CV] END ....................................C=1, loss=hinge; total time=   4.1s
[CV] END ............................C=1, loss=squared_hinge; total time=   1.4s
[CV] END ............................C=1, loss=squared_hinge; total time=   1.3s
[CV] END ............................C=1, loss=squared_hinge; total time=   1.3s




[CV] END ...................................C=10, loss=hinge; total time=  11.6s




[CV] END ...................................C=10, loss=hinge; total time=  10.6s




[CV] END ...................................C=10, loss=hinge; total time=  10.3s
[CV] END ...........................C=10, loss=squared_hinge; total time=   5.3s
[CV] END ...........................C=10, loss=squared_hinge; total time=   7.1s
[CV] END ...........................C=10, loss=squared_hinge; total time=   6.4s




[CV] END ..................................C=100, loss=hinge; total time=  12.4s




[CV] END ..................................C=100, loss=hinge; total time=  10.9s




[CV] END ..................................C=100, loss=hinge; total time=  12.0s




[CV] END ..........................C=100, loss=squared_hinge; total time=  14.1s




[CV] END ..........................C=100, loss=squared_hinge; total time=  13.5s




[CV] END ..........................C=100, loss=squared_hinge; total time=  16.1s




[CV] END .................................C=1000, loss=hinge; total time=  20.8s




[CV] END .................................C=1000, loss=hinge; total time=  20.6s




[CV] END .................................C=1000, loss=hinge; total time=  22.3s




[CV] END .........................C=1000, loss=squared_hinge; total time=  21.4s




[CV] END .........................C=1000, loss=squared_hinge; total time=  22.7s




[CV] END .........................C=1000, loss=squared_hinge; total time=  24.6s




Classification report for toxic:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     43200
           1       0.88      0.67      0.76      4672

    accuracy                           0.96     47872
   macro avg       0.93      0.83      0.87     47872
weighted avg       0.96      0.96      0.96     47872

Confusion matrix for toxic:
 [[42793   407]
 [ 1542  3130]]
Best C for toxic:
 {'C': 1, 'loss': 'hinge'}
(159571, 211316) (159571,)




severe_toxic
(111699,)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.6s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.7s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.6s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.5s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.5s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.4s




[CV] END ....................................C=1, loss=hinge; total time=   1.0s
[CV] END ....................................C=1, loss=hinge; total time=   1.2s
[CV] END ....................................C=1, loss=hinge; total time=   0.7s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.7s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.5s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.5s




[CV] END ...................................C=10, loss=hinge; total time=   1.3s




[CV] END ...................................C=10, loss=hinge; total time=   1.5s




[CV] END ...................................C=10, loss=hinge; total time=   1.7s
[CV] END ...........................C=10, loss=squared_hinge; total time=   1.6s
[CV] END ...........................C=10, loss=squared_hinge; total time=   1.9s
[CV] END ...........................C=10, loss=squared_hinge; total time=   2.0s




[CV] END ..................................C=100, loss=hinge; total time=   3.2s




[CV] END ..................................C=100, loss=hinge; total time=   3.6s




[CV] END ..................................C=100, loss=hinge; total time=   2.8s




[CV] END ..........................C=100, loss=squared_hinge; total time=   3.4s




[CV] END ..........................C=100, loss=squared_hinge; total time=   4.4s




[CV] END ..........................C=100, loss=squared_hinge; total time=   4.2s




[CV] END .................................C=1000, loss=hinge; total time=   6.6s




[CV] END .................................C=1000, loss=hinge; total time=   7.4s




[CV] END .................................C=1000, loss=hinge; total time=   7.7s




[CV] END .........................C=1000, loss=squared_hinge; total time=   4.8s




[CV] END .........................C=1000, loss=squared_hinge; total time=   7.3s




[CV] END .........................C=1000, loss=squared_hinge; total time=   7.1s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report for severe_toxic:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99     47391
           1       0.00      0.00      0.00       481

    accuracy                           0.99     47872
   macro avg       0.49      0.50      0.50     47872
weighted avg       0.98      0.99      0.98     47872

Confusion matrix for severe_toxic:
 [[47391     0]
 [  481     0]]
Best C for severe_toxic:
 {'C': 0.1, 'loss': 'hinge'}
(159571, 211316) (159571,)
obscene
(111699,)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.7s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.7s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.6s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.5s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.5s




[CV] END ....................................C=1, loss=hinge; total time=   2.5s
[CV] END ....................................C=1, loss=hinge; total time=   1.7s
[CV] END ............................C=1, loss=squared_hinge; total time=   1.1s
[CV] END ............................C=1, loss=squared_hinge; total time=   1.1s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.8s




[CV] END ...................................C=10, loss=hinge; total time=   6.0s




[CV] END ...................................C=10, loss=hinge; total time=   5.8s




[CV] END ...................................C=10, loss=hinge; total time=   7.3s
[CV] END ...........................C=10, loss=squared_hinge; total time=   4.2s
[CV] END ...........................C=10, loss=squared_hinge; total time=   3.8s
[CV] END ...........................C=10, loss=squared_hinge; total time=   5.6s




[CV] END ..................................C=100, loss=hinge; total time=   9.7s




[CV] END ..................................C=100, loss=hinge; total time=   9.3s




[CV] END ..................................C=100, loss=hinge; total time=   9.8s




[CV] END ..........................C=100, loss=squared_hinge; total time=  11.9s




[CV] END ..........................C=100, loss=squared_hinge; total time=  13.4s




[CV] END ..........................C=100, loss=squared_hinge; total time=  14.2s




[CV] END .................................C=1000, loss=hinge; total time=  21.9s




[CV] END .................................C=1000, loss=hinge; total time=  21.4s




[CV] END .................................C=1000, loss=hinge; total time=  21.3s




[CV] END .........................C=1000, loss=squared_hinge; total time=  19.6s




[CV] END .........................C=1000, loss=squared_hinge; total time=  20.1s




[CV] END .........................C=1000, loss=squared_hinge; total time=  19.7s




Classification report for obscene:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     45326
           1       0.87      0.74      0.80      2546

    accuracy                           0.98     47872
   macro avg       0.93      0.87      0.89     47872
weighted avg       0.98      0.98      0.98     47872

Confusion matrix for obscene:
 [[45046   280]
 [  662  1884]]
Best C for obscene:
 {'C': 1, 'loss': 'hinge'}
(159571, 211316) (159571,)




threat
(111699,)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.4s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.7s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.5s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.5s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.5s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.4s
[CV] END ....................................C=1, loss=hinge; total time=   0.6s
[CV] END ....................................C=1, loss=hinge; total time=   0.8s
[CV] END ....................................C=1, loss=hinge; total time=   0.7s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.4s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.4s
[CV] END ......................



[CV] END ...................................C=10, loss=hinge; total time=   1.0s




[CV] END ...................................C=10, loss=hinge; total time=   1.0s
[CV] END ...........................C=10, loss=squared_hinge; total time=   1.3s
[CV] END ...........................C=10, loss=squared_hinge; total time=   1.3s
[CV] END ...........................C=10, loss=squared_hinge; total time=   1.3s




[CV] END ..................................C=100, loss=hinge; total time=   2.7s




[CV] END ..................................C=100, loss=hinge; total time=   2.0s




[CV] END ..................................C=100, loss=hinge; total time=   2.2s




[CV] END ..........................C=100, loss=squared_hinge; total time=   1.8s




[CV] END ..........................C=100, loss=squared_hinge; total time=   1.6s




[CV] END ..........................C=100, loss=squared_hinge; total time=   1.7s




[CV] END .................................C=1000, loss=hinge; total time=   2.3s




[CV] END .................................C=1000, loss=hinge; total time=   2.3s




[CV] END .................................C=1000, loss=hinge; total time=   2.5s




[CV] END .........................C=1000, loss=squared_hinge; total time=   2.2s




[CV] END .........................C=1000, loss=squared_hinge; total time=   1.9s




[CV] END .........................C=1000, loss=squared_hinge; total time=   2.5s
Classification report for threat:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     47722
           1       0.62      0.23      0.34       150

    accuracy                           1.00     47872
   macro avg       0.81      0.62      0.67     47872
weighted avg       1.00      1.00      1.00     47872

Confusion matrix for threat:
 [[47701    21]
 [  115    35]]
Best C for threat:
 {'C': 1, 'loss': 'squared_hinge'}
(159571, 211316) (159571,)
insult
(111699,)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.5s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.5s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.4s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.6s
[CV] END ....



[CV] END ....................................C=1, loss=hinge; total time=   3.7s




[CV] END ....................................C=1, loss=hinge; total time=   3.1s
[CV] END ....................................C=1, loss=hinge; total time=   1.4s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.8s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.8s
[CV] END ............................C=1, loss=squared_hinge; total time=   1.1s




[CV] END ...................................C=10, loss=hinge; total time=   7.1s




[CV] END ...................................C=10, loss=hinge; total time=   7.0s




[CV] END ...................................C=10, loss=hinge; total time=   6.5s
[CV] END ...........................C=10, loss=squared_hinge; total time=   4.7s
[CV] END ...........................C=10, loss=squared_hinge; total time=   4.3s
[CV] END ...........................C=10, loss=squared_hinge; total time=   4.2s




[CV] END ..................................C=100, loss=hinge; total time=   9.6s




[CV] END ..................................C=100, loss=hinge; total time=   9.5s




[CV] END ..................................C=100, loss=hinge; total time=   9.8s




[CV] END ..........................C=100, loss=squared_hinge; total time=  11.4s




[CV] END ..........................C=100, loss=squared_hinge; total time=  12.7s




[CV] END ..........................C=100, loss=squared_hinge; total time=  10.8s




[CV] END .................................C=1000, loss=hinge; total time=  16.4s




[CV] END .................................C=1000, loss=hinge; total time=  16.9s




[CV] END .................................C=1000, loss=hinge; total time=  16.9s




[CV] END .........................C=1000, loss=squared_hinge; total time=  14.4s




[CV] END .........................C=1000, loss=squared_hinge; total time=  18.0s




[CV] END .........................C=1000, loss=squared_hinge; total time=  16.6s




Classification report for insult:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98     45474
           1       0.78      0.60      0.68      2398

    accuracy                           0.97     47872
   macro avg       0.88      0.80      0.83     47872
weighted avg       0.97      0.97      0.97     47872

Confusion matrix for insult:
 [[45056   418]
 [  955  1443]]
Best C for insult:
 {'C': 1, 'loss': 'hinge'}
(159571, 211316) (159571,)




identity_hate
(111699,)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.9s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.7s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.9s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.6s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.6s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.5s
[CV] END ....................................C=1, loss=hinge; total time=   1.0s




[CV] END ....................................C=1, loss=hinge; total time=   1.3s
[CV] END ....................................C=1, loss=hinge; total time=   1.5s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.6s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.6s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.8s




[CV] END ...................................C=10, loss=hinge; total time=   2.4s




[CV] END ...................................C=10, loss=hinge; total time=   1.8s
[CV] END ...................................C=10, loss=hinge; total time=   2.5s
[CV] END ...........................C=10, loss=squared_hinge; total time=   1.8s
[CV] END ...........................C=10, loss=squared_hinge; total time=   2.0s
[CV] END ...........................C=10, loss=squared_hinge; total time=   2.3s




[CV] END ..................................C=100, loss=hinge; total time=   4.4s




[CV] END ..................................C=100, loss=hinge; total time=   4.5s




[CV] END ..................................C=100, loss=hinge; total time=   4.6s




[CV] END ..........................C=100, loss=squared_hinge; total time=   6.2s




[CV] END ..........................C=100, loss=squared_hinge; total time=   7.8s




[CV] END ..........................C=100, loss=squared_hinge; total time=   5.3s




[CV] END .................................C=1000, loss=hinge; total time=  12.8s




[CV] END .................................C=1000, loss=hinge; total time=  14.6s




[CV] END .................................C=1000, loss=hinge; total time=  11.4s




[CV] END .........................C=1000, loss=squared_hinge; total time=  13.9s




[CV] END .........................C=1000, loss=squared_hinge; total time=  15.0s




[CV] END .........................C=1000, loss=squared_hinge; total time=  13.8s




Classification report for identity_hate:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     47442
           1       0.76      0.22      0.34       430

    accuracy                           0.99     47872
   macro avg       0.88      0.61      0.67     47872
weighted avg       0.99      0.99      0.99     47872

Confusion matrix for identity_hate:
 [[47413    29]
 [  337    93]]
Best C for identity_hate:
 {'C': 1, 'loss': 'hinge'}
(159571, 211316) (159571,)




In [12]:
# best_C_dict = {'toxic': {'C': 0.1, 'loss': 'squared_hinge'},
#  'severe_toxic': {'C': 0.1, 'loss': 'hinge'},
#  'obscene': {'C': 0.1, 'loss': 'hinge'},
#  'threat': {'C': 0.1, 'loss': 'squared_hinge'},
#  'insult': {'C': 0.1, 'loss': 'hinge'},
#  'identity_hate': {'C': 0.1, 'loss': 'squared_hinge'}}   # Binary BoW

# best_C_dict = {'toxic': {'C': 0.1, 'loss': 'squared_hinge'},
#  'severe_toxic': {'C': 0.1, 'loss': 'squared_hinge'},
#  'obscene': {'C': 0.1, 'loss': 'hinge'},
#  'threat': {'C': 0.1, 'loss': 'hinge'},
#  'insult': {'C': 0.1, 'loss': 'hinge'},
#  'identity_hate': {'C': 0.1, 'loss': 'hinge'}}  # Normal BoW

# best_C_dict = {'toxic': {'C': 1, 'loss': 'hinge'},
#  'severe_toxic': {'C': 0.1, 'loss': 'hinge'},
#  'obscene': {'C': 1, 'loss': 'hinge'},
#  'threat': {'C': 1, 'loss': 'squared_hinge'},
#  'insult': {'C': 1, 'loss': 'hinge'},
#  'identity_hate': {'C': 1, 'loss': 'hinge'}}  # TfIDF

In [13]:
submid = pd.DataFrame({'id': submission["id"]})
SUB = pd.concat([submid, pd.DataFrame(preds, columns = classes)], axis=1)
SUB.to_csv('submission.csv', index=False)