In [20]:
import pandas as pd
import numpy as np
import os
import re
import string
import gensim
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import xgboost
from xgboost import XGBClassifier

In [2]:
# Width of every embedding vector in the pretrained file.
N_DIMS = 300
# Built with os.path.join so the notebook also runs where "\\" is not a path separator.
pretrained = os.path.join('data', 'glove.840B.300d.txt')

In [3]:
def get_coefs(row, n_dims=None):
    """Parse one line of a GloVe text file into ``(word, vector)``.

    Parameters
    ----------
    row : str
        A raw line: the word followed by whitespace-separated floats.
    n_dims : int, optional
        Number of trailing fields that form the vector. Defaults to the
        notebook-level ``N_DIMS``, looked up at call time.

    Returns
    -------
    tuple
        ``(word, numpy float32 array of length n_dims)``.
    """
    if n_dims is None:
        n_dims = N_DIMS
    row = row.strip().split()
    # can't use row[0], row[1:] split because 840B contains multi-part words
    word, arr = " ".join(row[:-n_dims]), row[-n_dims:]
    return word, np.asarray(arr, dtype='float32')

In [4]:
def get_glove():
    """Read the pretrained embedding file into a ``{word: vector}`` dict.

    Each line of the file at ``pretrained`` is parsed with ``get_coefs``.
    The file is opened in a ``with`` block so the handle is closed as soon
    as the dict has been built, including when parsing raises.
    """
    with open(pretrained, encoding="utf-8") as handle:
        return dict(get_coefs(row) for row in handle)

In [5]:
# Transforms

# Punctuation characters that become stand-alone tokens. The ASCII set goes
# through re.escape so every character is literal inside the character class;
# interpolated raw, the backslash only escapes the "]" that follows it and a
# literal backslash is never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(row):
    """Split a string into lower-cased tokens, isolating punctuation.

    Every character matched by ``re_tok`` is surrounded with spaces, the text
    is lower-cased, and the result is split on whitespace.

    Parameters
    ----------
    row : str
        Raw text.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list for empty input.
    """
    return re_tok.sub(r' \1 ', row).lower().split()

In [6]:
# Feature engineering

def lengths(series):
    """Return the length of every entry as an ``(n, 1)`` float column."""
    per_entry = series.apply(len)
    column = np.array(per_entry).reshape(-1, 1)
    return column.astype(float)

def asterixes(series):
    """Return the number of '!' characters per entry as an ``(n, 1)`` float column.

    Despite the name, the character counted is the exclamation mark.
    """
    def exclamation_marks(text):
        return text.count('!')

    per_entry = series.apply(exclamation_marks)
    return np.array(per_entry).reshape(-1, 1).astype(float)

def uppercase_count(series):
    """Return the number of ASCII capitals (A-Z) per entry as an ``(n, 1)`` float column."""
    def capitals(text):
        # One match per capital letter; tallying the matches equals len(findall).
        return sum(1 for _ in re.finditer(r'[A-Z]', text))

    per_entry = series.apply(capitals)
    return np.array(per_entry).reshape(-1, 1).astype(float)

In [7]:
def get_average_wordvector(tokens_list, vector, generate_missing=False, k=N_DIMS):
    """Average the word vectors of a token list.

    Parameters
    ----------
    tokens_list : sequence of str
        Tokens of one document.
    vector : mapping
        Word -> vector lookup.
    generate_missing : bool
        If true, a token absent from ``vector`` contributes ``np.random.rand(k)``;
        otherwise it contributes a zero vector.
    k : int
        Length of the substitute vectors and of the all-zero result
        returned for an empty token list.

    Returns
    -------
    numpy.ndarray
        Element-wise mean over all tokens, substitutes included.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    # Factory for the stand-in vector used when a token has no embedding.
    make_substitute = np.random.rand if generate_missing else np.zeros

    collected = []
    for token in tokens_list:
        if token in vector:
            collected.append(vector[token])
        else:
            collected.append(make_substitute(k))

    total = np.sum(collected, axis=0)
    return np.divide(total, len(collected))

def get_embeddings(vectors, clean_comments, generate_missing=False):
    """Return one averaged word vector per entry of ``clean_comments`` as a list.

    ``clean_comments`` is a pandas Series of token lists; each entry is passed
    to ``get_average_wordvector`` together with ``vectors`` and
    ``generate_missing``.
    """
    def average(tokens):
        return get_average_wordvector(tokens, vectors, generate_missing=generate_missing)

    per_comment = clean_comments.apply(average)
    return list(per_comment)

def embed(series):
    """Embed a Series of token lists with the notebook-level ``glove`` table.

    Thin wrapper over ``get_embeddings`` that leaves ``generate_missing`` at
    its default.
    """
    return get_embeddings(vectors=glove, clean_comments=series)

In [8]:
from nlp_pipeline import NlpPipeline

In [9]:
# Paths are built with os.path.join so the notebook also runs where "\\" is not a path separator.
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
# Missing comments are replaced with a placeholder so later string operations
# (len, count, regex) never receive NaN.
train["comment_text"] = train["comment_text"].fillna("_na_")
test["comment_text"] = test["comment_text"].fillna("_na_")

In [10]:
# Load the whole pretrained table into memory as a word -> vector dict (see get_glove).
# The recorded run of this cell took close to four minutes.
%time glove = get_glove()

Wall time: 3min 50s


In [34]:
# Target columns: positions 2 through 7 of the training frame.
class_labels = train.columns[2:8].tolist()

# Hand-crafted feature functions and text transforms handed to the pipeline.
transforms = [tokenize]
feature_funcs = [lengths, asterixes, uppercase_count]

# Previously used model, kept for reference:
# logreg = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg')
# logreg.name = "Logistic regression newton"
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.15,
    colsample_bytree=0.6,
)
# The name attribute is attached by hand; later cells use the same string as a
# key in pipe.predictions and pipe.cv_scores.
xgb.name = "XGBoost"
models = [xgb]

In [12]:
# Build the project pipeline from the loaded frames, the "comment_text" column,
# the label columns, the feature functions, the transforms, the model list and
# the GloVe table.
# NOTE(review): NlpPipeline lives in nlp_pipeline (not shown here); the meaning of
# each positional argument is inferred from the variable names — confirm.
pipe = NlpPipeline(train, test, "comment_text", class_labels, feature_funcs, transforms, models, word_vectors=glove)

In [13]:
# Presumably runs every function in feature_funcs over the text column — confirm in nlp_pipeline.
pipe.engineer_features()

Engineering features


In [14]:
# Presumably applies the functions in transforms (here: tokenize) to the text column — confirm in nlp_pipeline.
pipe.apply_transforms()

Applying transforms


In [15]:
# Hands the notebook-level embed() helper to the pipeline; the recorded run took about 41 s.
# NOTE(review): how the pipeline calls embed (per frame, per column) is not visible here — confirm.
%time pipe.create_embeddings(embed)

Creating embeddings
Wall time: 41.1 s


In [16]:
# NOTE(review): the recorded output of this cell reports a LogisticRegression model,
# while the model-setup cell currently registers only the XGBoost classifier. The
# output appears to predate that edit; restart the kernel and run all cells to refresh it.
pipe.cross_val()

Cross-validating
LogisticRegression(C=30.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)
Cross-validating toxic
roc_auc: 0.9663566879765998
Cross-validating severe_toxic
roc_auc: 0.9824378952854586
Cross-validating obscene
roc_auc: 0.975907595417055
Cross-validating threat
roc_auc: 0.9669333937159189
Cross-validating insult
roc_auc: 0.9721503138305995
Cross-validating identity_hate
roc_auc: 0.966883488850247


In [17]:
# Fits one classifier per label (per the recorded log) and presumably stores test-set
# predictions in pipe.predictions keyed by model name — confirm in nlp_pipeline.
pipe.fit_predict()

Fitting and predicting
Fitting submission classifier for toxic
Fitting submission classifier for severe_toxic
Fitting submission classifier for obscene
Fitting submission classifier for threat
Fitting submission classifier for insult
Fitting submission classifier for identity_hate


In [18]:
# Presumably writes a submission file from pipe.predictions — confirm in nlp_pipeline.
pipe.create_submission()

Creating submissions


In [22]:
# Replace the pipeline's model list so that only the XGBoost classifier is evaluated from here on.
# NOTE(review): this mutates pipe after construction; a fresh pipeline built with the
# intended model list would keep the notebook reproducible top to bottom.
pipe.models = [xgb]

In [23]:
# NOTE(review): the recorded output shows an XGBClassifier with n_estimators=20,
# max_depth=3 and learning_rate=0.25, which differs from the hyper-parameters in the
# model-setup cell; the scores below therefore belong to an earlier configuration.
pipe.cross_val()

Cross-validating
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.25, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=20, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
Cross-validating toxic
roc_auc: 0.9370200898161059
Cross-validating severe_toxic
roc_auc: 0.9710925528863775
Cross-validating obscene
roc_auc: 0.9501719761273542
Cross-validating threat
roc_auc: 0.9528030564046871
Cross-validating insult
roc_auc: 0.9514289069299136
Cross-validating identity_hate
roc_auc: 0.9435259598984205


In [27]:
# Seed the "XGBoost" slot from the logistic-regression predictions using a shallow
# copy, so later per-label writes to the "XGBoost" entry cannot overwrite the
# logistic-regression results (a plain assignment would share one object).
# NOTE(review): assumes the stored value is a label -> array mapping, as the
# submission cell indexes it that way — confirm in nlp_pipeline.
pipe.predictions['XGBoost'] = dict(pipe.predictions['Logistic regression newton'])

In [28]:
# NOTE(review): at this point the "XGBoost" predictions were taken from the
# logistic-regression entry in the preceding cell, so whatever is written here is
# presumably not produced by the XGBoost model — confirm before submitting.
pipe.create_submission()

Creating submissions


In [37]:
# Hold-out check on a single label (severe_toxic): fit on 80 % of the rows and
# predict the remaining 20 %; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(pipe.train_features, list(pipe.train["severe_toxic"]), test_size=0.2, random_state=40)
# eval_metric is not passed to fit(): without an eval_set nothing is evaluated
# during training, and XGBoost 2.0+ no longer accepts it as a fit() argument.
xgb.fit(X_train, y_train)
# predict_proba returns one column per class.
y_pred = xgb.predict_proba(X_test)

NameError: name 'self' is not defined

In [40]:
# ROC AUC on the hold-out rows; the second column of the predict_proba output is used as the score.
roc_auc_score(y_test, y_pred[:,1])

0.9877136940558803

In [44]:
# Five-fold cross-validation of the XGBoost model, one label at a time;
# scorelist collects the mean fold score of every label.
scorelist = []
for label in pipe.class_labels:
    pipe.log("Cross-validating " + label)
    scores = cross_val_score(
        xgb,
        pipe.train_features,
        list(pipe.train[label]),
        scoring=pipe.metric,
        cv=5,
    )
    mean_score = np.mean(scores)
    pipe.log(pipe.metric + ": " + str(mean_score))
    scorelist.append(mean_score)

Cross-validating toxic
roc_auc: 0.967015805216979
Cross-validating severe_toxic
roc_auc: 0.9853333628435932
Cross-validating obscene
roc_auc: 0.9778539295697459
Cross-validating threat
roc_auc: 0.9820443915227862
Cross-validating insult
roc_auc: 0.9739457957004263
Cross-validating identity_hate
roc_auc: 0.9730230143579437


NameError: name 'model' is not defined

In [45]:
# Record the average of the per-label mean scores from the manual cross-validation loop.
pipe.cv_scores["XGBoost"] = np.mean(scorelist)

In [46]:
# Display the stored overall cross-validation score.
pipe.cv_scores["XGBoost"]

0.976536049868579

In [None]:
# Fit the XGBoost model on the full training set once per label and collect the
# test-set probabilities. Results go into a fresh dict that is published to
# pipe.predictions only after every label has finished, so this cell neither
# needs a pre-existing "XGBoost" entry nor mutates an object shared with
# another model's predictions, and an interrupted run leaves pipe.predictions untouched.
xgb_predictions = {}
for label in pipe.class_labels:
    pipe.log("Fitting submission classifier for " + label)
    # Separate name so the hold-out split's y_train is not overwritten.
    label_targets = np.array(pipe.train[label])
    xgb.fit(pipe.train_features, label_targets)
    xgb_predictions[label] = xgb.predict_proba(pipe.test_features)
pipe.predictions["XGBoost"] = xgb_predictions

Fitting submission classifier for toxic


In [None]:
pipe.log("Creating submissions")
# Start from the id column and add one probability column per label.
submission = pipe.test[pipe.id_column].to_frame()
for label in pipe.class_labels:
    # Second column of the stored predict_proba output.
    submission[label] = pipe.predictions["XGBoost"][label][:,1]

# Number the file one past the highest earlier submission, or 1 if there is none.
submission_num = 1
past_submissions = pipe.get_past_submissions()
if past_submissions is not None and past_submissions != []:
    # presumably each entry is a sequence whose first element is the submission number — confirm
    submission_num = max(past_submissions)[0] + 1

# Portable path, and make sure the target directory exists before writing.
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)
filename = os.path.join(submissions_dir, 'submission' + str(submission_num) + '.csv')
submission.to_csv(filename, index=False)
pipe.store_submission_metadata(filename, submission_num, xgb)