In [1]:
import re
import nltk
import pandas as pd
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split

In [2]:
stop_words = nltk.corpus.stopwords.words('english')

In [3]:
df = pd.read_csv('../train_data.csv')

In [4]:
clean_words = re.compile(r'[^\w\s]+')
clean_space = re.compile(r'\s+')

def clean(x):
    x = clean_words.sub(' ', x.lower())
    return clean_space.sub(' ', x)

# Sklearn

In [5]:
def preprocess(df, vectorizer=None, fit_vectorizer=True):
    X = df['comment_text'].apply(clean)
    y = df['toxic'].values
    if vectorizer is None:
        vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=100000)
    if fit_vectorizer:
        X = vectorizer.fit_transform(X)
        return X, y, vectorizer
    else:
        X = vectorizer.transform(X)
        return X, y

# Tfidf vectorizer

In [6]:
vec = TfidfVectorizer(stop_words=stop_words, max_features=100000)
x_train, y_train, vec = preprocess(df, vec)
df_test = pd.read_csv('../test_data.csv')
x_test, y_test = preprocess(df_test, vectorizer=vec, fit_vectorizer=False)

In [7]:
def train_model(model, X, y, **kwargs):
    model = model(**kwargs)
    model.fit(X, y)
    return model

def evaluate_model(model, X, y, verbose=True):
    y_pred = model.predict(X)
    print(classification_report(y, y_pred))
    return y_pred    

In [32]:
def train_and_evaluate_models(models, x_train, y_train, x_test, y_test, **kwargs):
    all_models = []
    for model in models:
        model = train_model(model, x_train, y_train, **kwargs)
        print('*'*80)
        print(model.__class__.__name__)
        evaluate_model(model, x_test, y_test)
        all_models.append(model)
    return all_models

In [33]:
MODELS = [LinearSVC, Perceptron, SGDClassifier, PassiveAggressiveClassifier, LogisticRegression]
train_and_evaluate_models(MODELS, x_train, y_train, x_test, y_test)

********************************************************************************
LinearSVC
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      5040
           1       0.84      0.62      0.72       549

    accuracy                           0.95      5589
   macro avg       0.90      0.81      0.85      5589
weighted avg       0.95      0.95      0.95      5589

********************************************************************************
Perceptron
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      5040
           1       0.74      0.63      0.68       549

    accuracy                           0.94      5589
   macro avg       0.85      0.80      0.82      5589
weighted avg       0.94      0.94      0.94      5589

********************************************************************************
SGDClassifier
              precision    recall  f1-score   support

           0 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
           verbose=0),
 Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
            fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
            penalty=None, random_state=0, shuffle=True, tol=0.001,
            validation_fraction=0.1, verbose=0, warm_start=False),
 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
               early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
               l1_ratio=0.15, learning_rate='optimal', loss='hinge',
               max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
               power_t=0.5, random_state=None, shuffle=True, tol=0.001,
               validation_fraction=0.1, verbose=0, warm_start=False),
 PassiveAggressiveClassifier(C=1.0, average

# Hashing Vectorizer

In [34]:
vec = HashingVectorizer(stop_words=stop_words)
x_train, y_train, vec = preprocess(df, vec)
df_test = pd.read_csv('../test_data.csv')
x_test, y_test = preprocess(df_test, vectorizer=vec, fit_vectorizer=False)

In [35]:
MODELS = [LinearSVC, Perceptron, SGDClassifier, PassiveAggressiveClassifier, LogisticRegression]
train_and_evaluate_models(MODELS, x_train, y_train, x_test, y_test)

********************************************************************************
LinearSVC
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      5040
           1       0.84      0.62      0.72       549

    accuracy                           0.95      5589
   macro avg       0.90      0.81      0.85      5589
weighted avg       0.95      0.95      0.95      5589

********************************************************************************
Perceptron
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      5040
           1       0.74      0.63      0.68       549

    accuracy                           0.94      5589
   macro avg       0.85      0.80      0.82      5589
weighted avg       0.94      0.94      0.94      5589

********************************************************************************
SGDClassifier
              precision    recall  f1-score   support

           0 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
           verbose=0),
 Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
            fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
            penalty=None, random_state=0, shuffle=True, tol=0.001,
            validation_fraction=0.1, verbose=0, warm_start=False),
 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
               early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
               l1_ratio=0.15, learning_rate='optimal', loss='hinge',
               max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
               power_t=0.5, random_state=None, shuffle=True, tol=0.001,
               validation_fraction=0.1, verbose=0, warm_start=False),
 PassiveAggressiveClassifier(C=1.0, average