In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
import matplotlib.pyplot as plt 
import re

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

from sklearn import metrics 

In [2]:
# Load data

train = pd.read_csv(r'train.csv')
test = pd.read_csv(r'test.csv')

In [3]:
# Combine comments for processing

train_comments = train['comment_text']
test_comments = train['comment_text']
allComments = pd.concat([train_comments, test_comments])

In [4]:
# Vectorize words

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000
)

word_vectorizer.fit(allComments)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents='unicode',
                sublinear_tf=True, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [5]:
# Vectorized characters

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=10000
)

char_vectorizer.fit(allComments)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10000,
                min_df=1, ngram_range=(2, 6), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents='unicode',
                sublinear_tf=True, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [6]:
# Transform fitted data
train_word_features = word_vectorizer.transform(train_comments)
test_word_features = word_vectorizer.transform(test_comments)

train_char_features = char_vectorizer.transform(train_comments)
test_char_features = char_vectorizer.transform(test_comments)

In [7]:
# Combine both word_vectorized and char_vectorized data

train_features = hstack([train_word_features, train_char_features])
test_features = hstack([test_char_features, test_char_features])

In [9]:
# Method 1: Using cross_val_score 

# cross_val_score is a helper function on the estimator and the dataset 
# (where cv will output estimate scores)

def cvs(classifier, trainingSet, testingSet, split=3):
    cv = cross_val_score(classifier, trainingSet, testingSet, cv=split, scoring='roc_auc')
    
    return cv, "Score: {:.2%}".format(cv['test_score'])


train_target = train.toxic
cvs(LogisticRegression(C=3, solver='liblinear', random_state=42), train_features, train_target)
# cvs(LogisticRegression(random_state=42), train_word_features, train_target)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# Apply to all classifications

TARGET_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for target in TARGET_COLUMNS:
    train_target = train[target]
    s = cvs(LogisticRegression(C=3, solver='liblinear', random_state=42), train_features, train_target)
    print('Label: {}\nScore: {:.3%}'.format(target, s[1]))

In [None]:
# Method 2: Using Cross Validate 

# cross_validate allows specifying multiple metrics for evaluation
# and returns a dict containing the training scores, fit-times, 
# and score-times in addition to test score

def cv(classifier, trainingSet, testingSet, split=3):
    cv2 = cross_validate(classifier, train_word_features, train_target, cv=3, scoring=('roc_auc','neg_log_loss', 'accuracy'))

#     m = np.mean(cv2['test_roc_auc'])


    return cv2


train_target = train.toxic
cv(LogisticRegression(C=3, solver='liblinear', random_state=42), train_word_features, train_target)

In [None]:
# Method 3: Using standard train_test_split

def tts(classifier, trainingSet, testingSet, size=0.33):
    X_train, X_test, y_train, y_test = train_test_split(trainingSet, testingSet, test_size=0.33, random_state=42)
#     X_train, X_test, y_train, y_test = train_test_split(train_word_features, train_target, test_size=0.33, random_state=42)

    clf = classifier
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # Accuracy
    acc = metrics.accuracy_score(y_test, y_pred)
    
    # Return probability estimates, where a matrix of (N,2) is returned.
    # The first value is the probability of toxic = 0, second is toxic = 1. 
    # Their sum will equal 1.

    # We want the 2nd column (toxic = 1)
    y_pred_proba = clf.predict_proba(X_test)[::, 1]

    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)

    plt.plot(fpr, tpr, label="AUC = {:.2%}".format(auc))
    plt.title('ROC')
    plt.legend()
    plt.show()
    
    n = metrics.log_loss(y_test, y_pred_proba)
    print('Log Loss: {:.3%}'.format(n))
    
    return acc
    
    
train_target = train.toxic
tts(LogisticRegression(C=3, solver='liblinear', random_state=42), train_word_features, train_target)

In [None]:
# Method #4 (Pipeline)

from sklearn.pipeline import make_pipeline

# Note that we are not using sparse matrices (must be iterable data for train_test_split)
train_target = train.toxic
X_train, X_test, y_train, y_test = train_test_split(train_comments, train_target, test_size=0.33, random_state=42)


model = make_pipeline(TfidfVectorizer(), LogisticRegression(C=3, solver='liblinear', random_state=42))

param_grid = [{'tfidf__sublinear_tf' : True,
              'tfidf__strip_accents' : 'unicode',
              'tfidf__analyzer' : 'word',
              'tfidf__token_pattern' : r'\w{1,}',
              'tfidf__stop_words' : 'english',
               'tfidf__ngram_range' : (1, 1),
               'tfidf__max_features' : 10000,
               'logit__C' : 0.1
              }]

# Fit
model.fit(X_train, y_train)

# Predict
pred = model.predict_proba(X_test)

# Metrics
ll = metrics.log_loss(y_test, pred)
print('Log Loss: {:.3%}'.format(ll))

# y_pred_proba = model.predict_proba(X_test)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, pred[::, 1])
auc = metrics.roc_auc_score(y_test, pred[::, 1])
print('Area Under Curve: {:.3%}'.format(auc))

In [None]:
# https://www.kaggle.com/metadist/work-like-a-pro-with-pipelines-and-feature-unions
# https://scikit-learn.org/0.18/auto_examples/hetero_feature_union.html
# https://datascience.stackexchange.com/questions/22813/using-tf-idf-with-other-features-in-sklearn