In [1]:
import pandas as pd
import numpy as np
import os
import re
import string
import gensim
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler



In [2]:
# Dimensionality of the pretrained word vectors; must match the vector file below.
N_DIMS = 100
# Location of the pretrained GloVe text file. Built with os.path.join so the
# path resolves on any OS, not only where "\" is the path separator.
pretrained = os.path.join('data', 'glove.6B.100d.txt')

In [3]:
def get_coefs(row):
    """Parse one line of a GloVe-style text file into a ``(word, vector)`` pair.

    The line is whitespace-split; the last ``N_DIMS`` fields become the vector
    (as a float32 array) and everything before them is re-joined with single
    spaces to form the word.
    """
    fields = row.strip().split()
    # Slice from the right instead of taking fields[0] / fields[1:]: the 840B
    # vectors contain multi-part words, so the word may span several fields.
    token = " ".join(fields[:-N_DIMS])
    values = fields[-N_DIMS:]
    return token, np.asarray(values, dtype='float32')

In [4]:
def get_glove():
    """Read the pretrained vector file into a ``{word: vector}`` dict.

    Every row of the file at ``pretrained`` is parsed with ``get_coefs``.

    Returns:
        dict mapping each word to the array returned by ``get_coefs``.
    """
    # The context manager guarantees the file handle is closed once the dict
    # has been built (or if parsing raises), instead of leaking it.
    with open(pretrained, encoding="utf-8") as vector_file:
        return dict(get_coefs(row) for row in vector_file)

In [5]:
# Matches any single punctuation character (ASCII punctuation plus a handful of
# typographic symbols) so it can be split off as its own token.
# re.escape is needed because string.punctuation contains `\`, `]`, `[`, `^`
# and `-`, which are special inside a character class: unescaped, the `\`
# merely escapes the following `]` and is itself never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(row):
    """Split a string into lowercase tokens, isolating punctuation.

    Each punctuation character matched by ``re_tok`` is surrounded by spaces,
    the text is lowercased, and the result is split on whitespace.

    Args:
        row: the text to tokenize.

    Returns:
        list of lowercase string tokens (empty list for empty/blank input).
    """
    return re_tok.sub(r' \1 ', row).lower().split()

In [6]:
def lengths(series):
    """Return ``len()`` of every element of ``series`` as a float column.

    The result is a 2-D array with one row per element and a single column.
    """
    per_item = series.apply(len)
    column = np.array(per_item).reshape(-1, 1)
    return column.astype(float)

def asterixes(series):
    """Count the ``'!'`` characters in every element of ``series``.

    The result is a 2-D float array with one row per element and one column.

    NOTE(review): despite the function name, the character counted is the
    exclamation mark, not ``'*'`` — confirm this is intended.
    """
    def _count_bangs(text):
        return text.count('!')

    per_item = series.apply(_count_bangs)
    column = np.array(per_item).reshape(-1, 1)
    return column.astype(float)

def uppercase_count(series):
    """Count the ASCII uppercase letters (A-Z) in every element of ``series``.

    The result is a 2-D float array with one row per element and one column.
    """
    def _count_caps(text):
        # One regex match per uppercase letter; tally the matches.
        return sum(1 for _ in re.finditer(r'[A-Z]', text))

    per_item = series.apply(_count_caps)
    column = np.array(per_item).reshape(-1, 1)
    return column.astype(float)

In [7]:
def get_average_wordvector(tokens_list, vector, generate_missing=False, k=N_DIMS):
    """Average the word vectors of ``tokens_list`` into one ``k``-length vector.

    Args:
        tokens_list: sequence of tokens to look up.
        vector: mapping supporting ``in`` and ``[]`` from token to vector.
        generate_missing: when true, a token absent from ``vector`` gets a
            fresh ``np.random.rand(k)`` vector; otherwise it gets zeros.
        k: length of the vectors (defaults to ``N_DIMS``).

    Returns:
        The element-wise mean over all tokens; tokens missing from ``vector``
        still count towards the denominator. An empty ``tokens_list`` yields
        ``np.zeros(k)``.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    # Pick the factory for out-of-vocabulary tokens once, outside the loop.
    make_missing = np.random.rand if generate_missing else np.zeros
    vectorized = [
        vector[token] if token in vector else make_missing(k)
        for token in tokens_list
    ]
    return np.sum(vectorized, axis=0) / len(vectorized)

def get_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged word vector per entry of ``clean_comments``.

    Each entry is passed to ``get_average_wordvector`` together with
    ``vectors`` and ``generate_missing``; the per-entry results are returned
    as a plain list.
    """
    def _embed_one(tokens):
        return get_average_wordvector(tokens, vectors, generate_missing=generate_missing)

    return list(clean_comments.apply(_embed_one))

def embed(series):
    """Embed ``series`` with the module-level ``glove`` vectors.

    Thin wrapper around ``get_embeddings`` that uses its default
    ``generate_missing`` setting.
    """
    return get_embeddings(vectors=glove, clean_comments=series)

In [8]:
from nlp_pipeline import NlpPipeline

In [9]:
# Load the train and test sets. Paths are built with os.path.join so they
# resolve on any OS, not only where "\" is the path separator.
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
# Replace missing comments with a placeholder string so the text column
# contains no NaN values.
train["comment_text"] = train["comment_text"].fillna("_na_")
test["comment_text"] = test["comment_text"].fillna("_na_")

In [10]:
# Load the pretrained word vectors into memory once: get_glove builds a dict
# of (word, vector) pairs from every row of the `pretrained` file.
glove = get_glove()

In [11]:
# The columns at positions 2-7 of the training frame are used as class labels.
class_labels = list(train.columns[2:8])
# Functions that each turn the text column into one numeric feature column.
feature_funcs = [lengths, asterixes, uppercase_count]
# Text transforms handed to the pipeline.
transforms = [tokenize]

# Single candidate model; `name` is a custom attribute used as the key when
# cross-validation scores are stored.
logreg = LogisticRegression(solver='newton-cg', C=30.0, class_weight='balanced')
logreg.name = "Logistic regression newton"
models = [logreg]

In [22]:
# Assemble the pipeline object. NlpPipeline comes from nlp_pipeline.py, which
# is not shown here; the argument roles (frames, text column, label columns,
# feature functions, transforms, models) are inferred from the variable names
# — confirm against NlpPipeline.__init__.
pipe = NlpPipeline(train, test, "comment_text", class_labels, feature_funcs, transforms, models)

In [23]:
# Run the pipeline's feature-engineering step; presumably this applies the
# `feature_funcs` given to the constructor — confirm in nlp_pipeline.py.
pipe.engineer_features()

Engineering features


In [24]:
# Build embeddings by handing the `embed` callback (averaged GloVe vectors)
# to the pipeline.
# NOTE(review): the recorded traceback shows create_embeddings reading
# self.train_transformed / self.test_transformed; presumably those are set by
# apply_transforms, which sits in a later cell — confirm the cell order works
# on a fresh kernel.
# The recorded run of this cell was stopped by a KeyboardInterrupt inside the
# per-comment apply.
pipe.create_embeddings(embed)

Creating embeddings


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-24-4d0b38be630b>", line 1, in <module>
    pipe.create_embeddings(embed)
  File "E:\Code\Kaggle\toxic-comments\nlp_pipeline.py", line 122, in create_embeddings
    embeddings = func(pd.concat([self.train_transformed, self.test_transformed]))
  File "<ipython-input-7-92487f3c00cc>", line 18, in embed
    return get_embeddings(glove, series)
  File "<ipython-input-7-92487f3c00cc>", line 14, in get_embeddings
    embeddings = clean_comments.apply(lambda x: get_average_wordvector(x, vectors, generate_missing=generate_missing))
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py", line 2355, in apply
    mapped = lib.map_infer(values, f, convert=convert_dtype)
  File "pandas/_libs/src\inference.pyx", line 1574, in pandas._libs.lib.map_infer
  File "<ipy

KeyboardInterrupt: 

In [14]:
# Run the pipeline's transform step.
# NOTE(review): this cell carries execution count 14 while the pipeline cells
# before it carry 22-24, so it was last run earlier than they were —
# confirm the notebook still works top-to-bottom on a fresh kernel.
pipe.apply_transforms()

Applying transforms


In [None]:
# Cross-validate every model on every class label (5 folds each), log the
# mean score per label, and store the mean over all labels under the
# model's `name` in pipe.cv_scores.
pipe.log("Cross-validating")
for model in pipe.models:
    pipe.log(str(model))
    scorelist = []
    for label in pipe.class_labels:
        pipe.log("Cross-validating " + label)
        targets = list(pipe.train[label])
        scores = cross_val_score(
            model,
            pipe.train_features,
            targets,
            scoring=pipe.metric,
            cv=5,
        )
        mean_score = np.mean(scores)
        pipe.log(pipe.metric + ": " + str(mean_score))
        scorelist.append(mean_score)
    pipe.cv_scores[model.name] = np.mean(scorelist)