In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def write_to_file(data, filename):
    """Write every item of ``data`` to ``filename``, one item per line.

    The file is opened in text mode ``'w'``, so existing content is replaced.
    Each item is concatenated with a newline character, so items must be
    ``str`` objects.
    """
    with open(filename, 'w') as out:
        # Stream the newline-terminated items straight into the file.
        out.writelines(entry + '\n' for entry in data)

def read_from_file(filename):
    """Read a text file and return its lines as a NumPy array of strings.

    This is the inverse of ``write_to_file``: every newline-terminated record
    becomes exactly one array element, with the trailing newline removed.

    Parameters
    ----------
    filename : str or path-like
        File to read (opened in text mode with the default encoding).

    Returns
    -------
    numpy.ndarray
        One element per line of the file; an empty array for an empty file.
    """
    with open(filename, 'r') as f:
        # Iterate the file rather than calling str.splitlines(): splitlines()
        # also breaks on form feeds, \x1c-\x1e, \x85, \u2028 and \u2029, which
        # would split a single record into several entries and break the
        # one-line-per-row alignment that callers rely on.
        return np.array([line.rstrip('\n') for line in f])

# Load the train, test and sample-submission dataframes

In [3]:
# Load the train and test CSVs; missing cells are replaced by empty strings.
train, test = (
    pd.read_csv(csv_path).fillna('')
    for csv_path in (
        "D:\\ky4\\MachineLearning\\Project\\train.csv",
        "D:\\ky4\\MachineLearning\\Project\\test.csv",
    )
)
# The sample submission is loaded as-is (no NaN filling).
submission = pd.read_csv("D:\\ky4\\MachineLearning\\Project\\sample_submission.csv")

In [4]:
# NOTE(review): despite its name, this frame is read from test_labels.csv.
# Only its header is used below, to obtain the label column names.
y_train_origin = pd.read_csv("D:\\ky4\\MachineLearning\\Project\\test_labels.csv")
# Every column after the first one is treated as a class label.
label_columns = y_train_origin.columns.values
classes = label_columns[1:]
classes[0]

'toxic'

# Text-to-feature transform (TF-IDF vectorizer)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Shared TF-IDF configuration: unigrams and bigrams, accent stripping,
# document-frequency cut-offs (min_df / max_df) and sublinear tf scaling.
tfidf_options = dict(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
vec = TfidfVectorizer(**tfidf_options)

In [6]:
# Pre-cleaned training comments (presumably one comment per line, in the same
# row order as train.csv -- it later replaces train['comment_text']).
train_cleaned_path = "D:\\ky4\\MachineLearning\\MLP\\Toxic-comment-classification\\clean_data\\data_train_cleaned_vanilla.txt"
X_train_origin = read_from_file(train_cleaned_path)

In [7]:
# Pre-cleaned test comments (presumably one comment per line, in the same
# row order as test.csv -- it later replaces test['comment_text']).
test_cleaned_path = "D:\\ky4\\MachineLearning\\MLP\\Toxic-comment-classification\\clean_data\\data_test_cleaned_vanilla.txt"
X_test_origin = read_from_file(test_cleaned_path)

In [8]:
# Replace the raw comment text with the pre-cleaned versions read from disk.
for frame, cleaned_text in ((train, X_train_origin), (test, X_test_origin)):
    frame.loc[:, 'comment_text'] = cleaned_text
# Optional subsampling for a quick experiment:
# train = train[:1000]
# test = test[:1000]

In [9]:
from sklearn.model_selection import train_test_split
# from nltk import ngrams, bigrams, trigrams
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV


In [10]:
import warnings
# Silence every FutureWarning for the rest of the session.
warnings.filterwarnings('ignore', category=FutureWarning)

# Model

In [16]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import chi2, SelectKBest

# Probability matrix filled by trainGrid: one row per test comment, one column per class.
preds = np.zeros((len(test), len(classes)))

def trainGrid (train, num_col, label, vectorizer = vec):
    """Tune, evaluate and fit a log-count-ratio weighted LinearSVC for one label.

    Steps:
      1. Hold out 30% of ``train`` (fixed ``random_state``) for validation.
      2. Fit ``vectorizer`` on the training split's ``comment_text`` and keep
         the 80% of features with the highest chi-squared score for ``label``.
      3. Compute the smoothed log ratio ``r`` of per-class feature sums on the
         training split and scale all feature matrices by it.
      4. Grid-search ``LinearSVC`` over ``C`` and ``loss`` (3-fold CV), then
         print a classification report and confusion matrix for the hold-out.
      5. Fit a calibrated ``LinearSVC`` with the best parameters on every row
         of ``train`` and store the positive-class probabilities for the
         module-level ``test`` frame in ``preds[:, num_col]``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a ``comment_text`` column and the ``label`` column.
    num_col : int
        Column of the module-level ``preds`` array to fill.
    label : str
        Name of the binary (0/1) target column in ``train``.
    vectorizer : text vectorizer, default ``vec``
        Refitted in place on the training split on every call.

    Returns
    -------
    None
        Results are printed and written into the module-level ``preds``.
    """
    X_train, X_test, y_train, y_test = train_test_split(train, train[label], test_size=0.3, random_state= 50)

    # Vectorise: vocabulary and idf weights come from the training split only.
    X_train_trans = vectorizer.fit_transform(X_train['comment_text'])
    X_test_trans = vectorizer.transform(X_test['comment_text'])

    # Keep the 80% of features with the highest chi2 score (adjust the ratio as needed).
    selector = SelectKBest(chi2, k=int(0.8*X_train_trans.shape[1]))
    X_train_trans = selector.fit_transform(X_train_trans, y_train)
    X_test_trans = selector.transform(X_test_trans)

    y = y_train.values

    def pr(y_i):
        # Add-one smoothed per-feature sum over the rows whose label equals y_i.
        in_class = y == y_i
        return (X_train_trans[in_class].sum(0) + 1) / (in_class.sum() + 1)

    # Log ratio between positive-class and negative-class feature rates.
    r = np.log(pr(1) / pr(0))
    x_nb = X_train_trans.multiply(r)

    param_grid = {
        'C': [0.1, 1, 10, 100, 1000],'loss': ['hinge', 'squared_hinge']}
    grid = GridSearchCV(LinearSVC(),param_grid,refit = True, cv= 3, verbose=2)
    grid.fit(x_nb, y)

    # Hold-out evaluation with the refitted best estimator.
    prediction = grid.predict(X_test_trans.multiply(r))
    print(classification_report(y_test,prediction))
    print(confusion_matrix(y_test, prediction))

    # LinearSVC parameters are keyword-only in current scikit-learn, so
    # `penalty` must be passed by name.
    model = CalibratedClassifierCV(
        LinearSVC(penalty='l2', loss=grid.best_params_["loss"], C=grid.best_params_["C"]),
        cv = 3,
    )

    # Reuse the already-fitted selector (transform, not fit_transform):
    # refitting it here could select a different column subset, and `r` would
    # then be multiplied into features it was not computed for.
    train_trans = selector.transform(vectorizer.transform(train.comment_text))
    fitted_model = model.fit(train_trans.multiply(r), train[label])

    # `test` and `preds` are module-level objects defined in earlier cells.
    test_trans = selector.transform(vectorizer.transform(test.comment_text))
    preds[:,num_col] = fitted_model.predict_proba(test_trans.multiply(r))[:,1]


# Run the pipeline once for the first label; fills column 0 of `preds`.
trainGrid(train, num_col=0, label='toxic', vectorizer=vec)

    

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1, loss=s



[CV] END ..................................C=100, loss=hinge; total time=   0.0s
[CV] END ..................................C=100, loss=hinge; total time=   0.0s
[CV] END ..........................C=100, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=100, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=100, loss=squared_hinge; total time=   0.0s
[CV] END .................................C=1000, loss=hinge; total time=   0.0s
[CV] END .................................C=1000, loss=hinge; total time=   0.0s
[CV] END .................................C=1000, loss=hinge; total time=   0.0s
[CV] END .........................C=1000, loss=squared_hinge; total time=   0.0s




[CV] END .........................C=1000, loss=squared_hinge; total time=   0.0s
[CV] END .........................C=1000, loss=squared_hinge; total time=   0.0s
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       276
           1       0.79      0.46      0.58        24

    accuracy                           0.95       300
   macro avg       0.87      0.72      0.78       300
weighted avg       0.94      0.95      0.94       300

[[273   3]
 [ 13  11]]




ValueError: inconsistent shapes

In [None]:
# Train one model per class; each call fills the matching column of `preds`.
for col_idx, class_name in enumerate(classes):
    print(class_name)
    trainGrid(train, col_idx, class_name, vec)


    

toxic
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1, 



[CV] END ...................................C=10, loss=hinge; total time=   0.0s
[CV] END ...................................C=10, loss=hinge; total time=   0.0s
[CV] END ...................................C=10, loss=hinge; total time=   0.0s
[CV] END ...........................C=10, loss=squared_hinge; total time=   0.0s
[CV] END ...........................C=10, loss=squared_hinge; total time=   0.0s
[CV] END ...........................C=10, loss=squared_hinge; total time=   0.0s
[CV] END ..................................C=100, loss=hinge; total time=   0.0s
[CV] END ..................................C=100, loss=hinge; total time=   0.0s




[CV] END ..................................C=100, loss=hinge; total time=   0.0s
[CV] END ..........................C=100, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=100, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=100, loss=squared_hinge; total time=   0.0s
[CV] END .................................C=1000, loss=hinge; total time=   0.0s
[CV] END .................................C=1000, loss=hinge; total time=   0.0s
[CV] END .................................C=1000, loss=hinge; total time=   0.0s




[CV] END .........................C=1000, loss=squared_hinge; total time=   0.0s
[CV] END .........................C=1000, loss=squared_hinge; total time=   0.0s
[CV] END .........................C=1000, loss=squared_hinge; total time=   0.0s
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       276
           1       0.79      0.46      0.58        24

    accuracy                           0.95       300
   macro avg       0.87      0.72      0.78       300
weighted avg       0.94      0.95      0.94       300

[[273   3]
 [ 13  11]]




severe_toxic
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................



obscene
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1



[CV] END .........................C=1000, loss=squared_hinge; total time=   0.0s
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       290
           1       0.83      0.50      0.62        10

    accuracy                           0.98       300
   macro avg       0.91      0.75      0.81       300
weighted avg       0.98      0.98      0.98       300

[[289   1]
 [  5   5]]




threat
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1,



insult
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1, loss=squared_hinge; total time=   0.0s
[CV] END ............................C=1,



              precision    recall  f1-score   support

           0       0.99      1.00      0.99       288
           1       0.89      0.67      0.76        12

    accuracy                           0.98       300
   macro avg       0.94      0.83      0.88       300
weighted avg       0.98      0.98      0.98       300

[[287   1]
 [  4   8]]
identity_hate
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ..........................C=0.1, loss=squared_hinge; total time=   0.0s
[CV] END ....................................C=1, loss=hinge; total time=   0.0s
[CV] END

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Assemble the submission: ids from the sample submission next to the
# per-class probabilities, then write it without the index column.
submid = pd.DataFrame({'id': submission["id"]})
class_probabilities = pd.DataFrame(preds, columns=classes)
SUB = pd.concat([submid, class_probabilities], axis=1)
SUB.to_csv('submission.csv', index=False)