In [176]:
import pandas as pd
import numpy as np
import os
import re
import string
import gensim
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [43]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the embedding vectors of the tokens in one comment.

    Parameters
    ----------
    tokens_list : sequence of str
        Tokens of a single comment.
    vector : mapping-like
        Must support ``word in vector`` and ``vector[word]``.
    generate_missing : bool, default False
        If true, a token absent from ``vector`` contributes
        ``np.random.rand(k)``; otherwise it contributes ``np.zeros(k)``.
    k : int, default 300
        Length of the placeholder vectors (and of the result for empty input).

    Returns
    -------
    numpy.ndarray
        Element-wise mean over all tokens, or ``np.zeros(k)`` when
        ``tokens_list`` is empty.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    # One placeholder factory for out-of-vocabulary tokens, chosen up front.
    placeholder = np.random.rand if generate_missing else np.zeros
    token_vectors = [
        vector[token] if token in vector else placeholder(k)
        for token in tokens_list
    ]
    total = np.sum(token_vectors, axis=0)
    return np.divide(total, len(token_vectors))

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged embedding per comment.

    Parameters
    ----------
    vectors : mapping-like
        Word-to-vector lookup handed to ``get_average_word2vec``.
    clean_comments : pandas.Series
        Each entry is the token list of one comment.
    generate_missing : bool, default False
        Forwarded to ``get_average_word2vec``.

    Returns
    -------
    list
        The per-comment results of ``get_average_word2vec``, in the order of
        ``clean_comments``.
    """
    def embed(tokens):
        return get_average_word2vec(tokens, vectors, generate_missing=generate_missing)

    per_comment = clean_comments.apply(embed)
    return list(per_comment)

# Location of the pre-trained GoogleNews word2vec binary, relative to the
# working directory. Built with os.path.join so the separator matches the host
# OS; a hard-coded backslash only resolves to data/<file> on Windows.
pretrained = os.path.join("data", "GoogleNews-vectors-negative300.bin")

def w2v(series, limit=None):
    """Embed every tokenised comment in ``series`` with the pre-trained vectors.

    The word2vec file at ``pretrained`` is loaded on every call.

    Parameters
    ----------
    series : pandas.Series
        Each entry is the token list of one comment.
    limit : int or None, default None
        Forwarded to ``KeyedVectors.load_word2vec_format``: read only the
        first ``limit`` vectors from the file. ``None`` loads the whole file,
        which is the previous behaviour. A smaller value lowers the memory
        needed for the lookup table, at the cost of more tokens being treated
        as unknown.

    Returns
    -------
    list
        One averaged embedding per entry of ``series``.
    """
    word_vectors = gensim.models.KeyedVectors.load_word2vec_format(pretrained, binary=True, limit=limit)
    return get_word2vec_embeddings(word_vectors, series)

# Matches one punctuation character (ASCII punctuation plus a few typographic
# symbols) and captures it so it can be padded with spaces.
# string.punctuation is passed through re.escape because it is interpolated
# into a character class: unescaped, its `\]` pair is read as an escaped `]`,
# so the backslash itself is never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(row):
    """Split a comment string into lower-case tokens.

    Every character matched by ``re_tok`` is surrounded with spaces first, so
    it becomes a token of its own; the text is then lower-cased and split on
    whitespace.

    Parameters
    ----------
    row : str
        Text of one comment.

    Returns
    -------
    list of str
    """
    spaced = re_tok.sub(r' \1 ', row)
    lowered = spaced.lower()
    return lowered.split()

In [477]:
class NlpPipeline():
    """Feature engineering, embedding, model evaluation and submission writing
    for a multi-label text classification task.

    The pipeline keeps two feature matrices (``train_features`` and
    ``test_features``) that grow column-wise as feature functions and
    embeddings are added, fits one classifier per class label, and writes one
    numbered CSV per model into the ``submissions`` directory.

    All data access goes through ``self.train`` / ``self.test`` /
    ``self.input_column``; the class does not read notebook globals apart
    from ``w2v``, the default embedding function used by ``run``.
    """

    # Directory (relative to the working directory) that receives submissions.
    SUBMISSION_DIR = 'submissions'

    def __init__(self, train=None, test=None, input_column='text_comment', class_labels=None, feature_functions=None, transforms=None, models=None, metric='roc_auc', id_column='id', verbosity=1):
        """Store the configuration and initialise empty feature matrices.

        Parameters
        ----------
        train, test : pandas.DataFrame or None
            Frames containing ``input_column`` (and, for ``train``, the label
            columns; for ``test``, ``id_column``).
        input_column : str
            Name of the text column.
        class_labels : iterable of str or None
            Label columns of ``train``; one classifier is fitted per label.
        feature_functions : iterable of callables or None
            Each takes a Series of texts and returns an array with one row
            per text.
        transforms : iterable of callables or None
            Applied element-wise to the text column, in order.
        models : iterable or None
            Estimators exposing ``fit``, ``predict_proba`` and ``get_params``.
            A ``name`` attribute is used as the key for scores and
            predictions; the class name is used when it is absent.
        metric : str
            Scoring name passed to ``cross_val_score`` and used in log lines.
        id_column : str
            Identifier column of ``test`` copied into each submission.
        verbosity : int
            Log messages are printed when this is greater than zero.
        """
        self.train = train
        self.test = test
        self.input_column = input_column
        # Copy the incoming lists: add_feature() appends to feature_functions,
        # and sharing the caller's list would leak that change into every
        # other pipeline built from the same list.
        self.class_labels = list(class_labels) if class_labels is not None else []
        self.feature_functions = list(feature_functions) if feature_functions is not None else []
        self.transforms = list(transforms) if transforms is not None else []
        self.models = list(models) if models is not None else []
        self.metric = metric
        self.id_column = id_column
        self.verbosity = verbosity
        self.train_features = np.array([])
        self.test_features = np.array([])
        # -1 marks "not evaluated yet".
        self.cv_scores = {}
        for model in self.models:
            self.cv_scores[self._model_name(model)] = -1
        self.scaler = StandardScaler()
        # Until apply_transforms() runs, the "transformed" text is the raw text.
        self.train_transformed = self.train[input_column] if self.train is not None else None
        self.test_transformed = self.test[input_column] if self.test is not None else None

    def run(self, embedding_func=None):
        """Execute the whole pipeline, from features to submission files.

        Parameters
        ----------
        embedding_func : callable or None
            Passed to ``create_embeddings``. ``None`` selects the
            module-level ``w2v`` function.
        """
        if embedding_func is None:
            embedding_func = w2v
        self.engineer_features()
        self.apply_transforms()
        self.create_embeddings(embedding_func)
        self.train_test()
        # self.cross_val()
        self.fit_predict()
        self.create_submission()

    def log(self, s):
        """Print ``s`` when verbosity is above zero."""
        if self.verbosity > 0:
            print(s)

    @staticmethod
    def _model_name(model):
        """Return ``model.name``, or the class name when no name was set."""
        return getattr(model, 'name', type(model).__name__)

    @staticmethod
    def _callable_name(func):
        """Return a readable name for a feature function or transform.

        For plain functions this equals the second word of ``str(func)``;
        unlike that string split it also works for built-ins, methods and
        callable objects.
        """
        name = getattr(func, '__qualname__', None) or getattr(func, '__name__', None)
        return name if name else type(func).__name__

    @staticmethod
    def _append_columns(existing, block):
        """Return ``existing`` with ``block`` appended as extra columns.

        A 1-D ``block`` is treated as a single column. When ``existing`` is
        still empty the block is returned on its own, because an empty 1-D
        array cannot be stacked with a 2-D one.
        """
        block = np.asarray(block)
        if block.ndim == 1:
            block = block.reshape(-1, 1)
        if existing is None or np.size(existing) == 0:
            return block
        return np.hstack((existing, block))

    def _select_inputs(self, use_transform):
        """Return the (train, test) text Series: transformed or raw."""
        if use_transform:
            return self.train_transformed, self.test_transformed
        return self.train[self.input_column], self.test[self.input_column]

    def _compute_feature(self, func, train_data, test_data, normalize):
        """Evaluate one feature function on both splits, as 2-D arrays."""
        train_feature = np.array(func(train_data))
        test_feature = np.array(func(test_data))
        if train_feature.ndim == 1:
            train_feature = train_feature.reshape(-1, 1)
        if test_feature.ndim == 1:
            test_feature = test_feature.reshape(-1, 1)
        if normalize:
            # Fit on train only and reuse those statistics for test, so the
            # same raw value maps to the same scaled value in both splits.
            train_feature = self.normalize(train_feature)
            test_feature = self.normalize(test_feature, fit=False)
        return train_feature, test_feature

    def create_embeddings(self, func):
        """Append embedding columns for the transformed train and test text.

        ``func`` is called once on the train text followed by the test text
        (so an expensive lookup table is loaded a single time) and must
        return one vector per entry; the result is split back at the train
        length.

        Raises
        ------
        ValueError
            If ``func`` does not return one row per input entry.
        """
        self.log("Creating embeddings")
        n_train = len(self.train_transformed)
        n_test = len(self.test_transformed)
        combined = pd.concat([self.train_transformed, self.test_transformed], ignore_index=True)
        embeddings = np.array(func(combined))
        if len(embeddings) != n_train + n_test:
            raise ValueError("embedding function returned %d rows for %d inputs" % (len(embeddings), n_train + n_test))

        self.train_features = self._append_columns(self.train_features, embeddings[:n_train])
        self.test_features = self._append_columns(self.test_features, embeddings[n_train:])

    def engineer_features(self, use_transform=False, normalize=True):
        """Rebuild both feature matrices from ``feature_functions``.

        Existing feature columns are replaced. With no feature functions the
        matrices are reset to empty arrays.

        Parameters
        ----------
        use_transform : bool
            Feed the transformed text instead of the raw text column.
        normalize : bool
            Standardise each feature with statistics fitted on the train
            split.
        """
        self.log("Engineering features")
        train_data, test_data = self._select_inputs(use_transform)
        train_feats = []
        test_feats = []
        for func in self.feature_functions:
            train_feature, test_feature = self._compute_feature(func, train_data, test_data, normalize)
            train_feats.append(train_feature)
            test_feats.append(test_feature)

        # np.hstack needs a real sequence (not a generator) and at least one array.
        self.train_features = np.hstack(train_feats) if train_feats else np.array([])
        self.test_features = np.hstack(test_feats) if test_feats else np.array([])

    def add_feature(self, func, use_transform=False, normalize=True):
        """Compute one more feature, append its columns and register ``func``.

        ``func`` is added to ``feature_functions`` only after it has been
        evaluated successfully, so callers must not append it themselves.
        """
        self.log("Adding feature")
        train_data, test_data = self._select_inputs(use_transform)
        train_feature, test_feature = self._compute_feature(func, train_data, test_data, normalize)

        self.train_features = self._append_columns(self.train_features, train_feature)
        self.test_features = self._append_columns(self.test_features, test_feature)
        self.feature_functions.append(func)

    def normalize(self, data, fit=True):
        """Standardise ``data`` with ``self.scaler``.

        Parameters
        ----------
        data : array-like, 2-D
        fit : bool, default True
            Re-fit the scaler on ``data`` before transforming. Pass ``False``
            to reuse the statistics of the previous fit.
        """
        if fit:
            self.scaler.fit(data)
        return self.scaler.transform(data)

    def apply_transforms(self):
        """Apply every transform, in order, to the raw text of both splits.

        Each transform receives the output of the previous one. The chain
        always starts from the raw column, so calling this method again gives
        the same result.
        """
        self.log("Applying transforms")
        train_data = self.train[self.input_column]
        test_data = self.test[self.input_column]
        for transform in self.transforms:
            train_data = train_data.apply(transform)
            test_data = test_data.apply(transform)
        self.train_transformed = train_data
        self.test_transformed = test_data

    def train_test(self):
        """Score every model on a fixed 80/20 split of the training data.

        One classifier is fitted per class label; the ROC AUC values on the
        held-out 20% are averaged over labels and stored in ``cv_scores``.
        """
        self.log("Training and testing")
        for model in self.models:
            self.log(str(model))
            scorelist = []
            for label in self.class_labels:
                self.log("Fitting classifier for " + label)
                X_train, X_test, y_train, y_test = train_test_split(self.train_features, list(self.train[label]), test_size=0.2, random_state=40)
                model.fit(X_train, y_train)
                y_pred = model.predict_proba(X_test)
                score = roc_auc_score(y_test, y_pred[:, 1])
                self.log(self.metric + str(score))
                scorelist.append(score)
            self.cv_scores[self._model_name(model)] = np.mean(scorelist)

    def cross_val(self):
        """Score every model with 5-fold cross-validation on ``self.metric``.

        The per-label mean scores are averaged and stored in ``cv_scores``.
        """
        self.log("Cross-validating")
        for model in self.models:
            self.log(str(model))
            scorelist = []
            for label in self.class_labels:
                self.log("Cross-validating " + label)
                scores = cross_val_score(model, self.train_features, list(self.train[label]), scoring=self.metric, cv=5)
                mean_score = np.mean(scores)
                self.log(self.metric + str(mean_score))
                scorelist.append(mean_score)
            self.cv_scores[self._model_name(model)] = np.mean(scorelist)

    def fit_predict(self):
        """Fit each model on all training rows and predict the test set.

        Results are stored as
        ``self.predictions[model name][label] = predict_proba output``.
        """
        self.log("Fitting and predicting")
        self.predictions = {}
        for model in self.models:
            name = self._model_name(model)
            self.predictions[name] = {}
            for label in self.class_labels:
                self.log("Fitting submission classifier for " + label)
                y_train = np.array(self.train[label])
                model.fit(self.train_features, y_train)
                self.predictions[name][label] = model.predict_proba(self.test_features)

    def create_submission(self):
        """Write one numbered submission CSV per model and log its metadata.

        The number is one more than the highest leading number found among
        the file names already present in the submissions directory (1 when
        there is none). The directory is created if it does not exist.
        """
        for model in self.models:
            self.log("Creating submissions")
            name = self._model_name(model)
            submission = self.test[self.id_column].to_frame()
            for label in self.class_labels:
                submission[label] = self.predictions[name][label][:, 1]

            # Files without digits (e.g. the metadata CSV) yield empty lists
            # and are skipped.
            past_submissions = self.get_past_submissions() or []
            used_numbers = [numbers[0] for numbers in past_submissions if numbers]
            submission_num = max(used_numbers) + 1 if used_numbers else 1

            os.makedirs(self.SUBMISSION_DIR, exist_ok=True)
            filename = os.path.join(self.SUBMISSION_DIR, 'submission' + str(submission_num) + '.csv')
            submission.to_csv(filename, index=False)
            self.store_submission_metadata(filename, submission_num, model)

    def get_past_submissions(self):
        """List the integers embedded in each file name of the submissions dir.

        Returns
        -------
        list of list of int, or None
            One inner list per directory entry (empty when the name contains
            no digits); ``None`` when the directory cannot be listed.
        """
        path = os.path.join(os.getcwd(), self.SUBMISSION_DIR)
        try:
            entries = os.listdir(path)
        except OSError:
            return None
        return [[int(s) for s in re.findall(r'\d+', f)] for f in entries]

    def store_submission_metadata(self, filename, submission_num, model):
        """Append one row describing a submission to ``submeta.csv``.

        The header row is written only when the metadata file does not exist
        yet or is empty.
        """
        feature_funcs = "".join(self._callable_name(func) + " " for func in self.feature_functions)
        transforms = "".join(self._callable_name(trf) + " " for trf in self.transforms)
        cols = ["submission", "filename", "model", "feature_funcs", "transforms", "cv_score"]
        cv_score = self.cv_scores.get(self._model_name(model), -1)
        metadata = pd.DataFrame([[submission_num, filename, self.model_info(model), feature_funcs, transforms, cv_score]], columns=cols)

        os.makedirs(self.SUBMISSION_DIR, exist_ok=True)
        meta_path = os.path.join(self.SUBMISSION_DIR, 'submeta.csv')
        write_header = not os.path.exists(meta_path) or os.path.getsize(meta_path) == 0
        metadata.to_csv(meta_path, mode='a', header=write_header, index=False)

    def model_info(self, model):
        """Return ``"<name>: v1 v2 ..."`` built from ``model.get_params()``."""
        params = model.get_params()
        return self._model_name(model) + ":" + "".join(" " + str(value) for value in params.values())

    def __repr__(self):
        """Multi-line summary of data shapes, configuration and scores."""
        labels = "".join(" " + label for label in self.class_labels)
        models = "".join(self.model_info(model) + " | " for model in self.models)
        transforms = "".join(" " + self._callable_name(transform) for transform in self.transforms)
        funcs = "".join(" " + self._callable_name(func) for func in self.feature_functions)
        lines = [
            "Train: " + str(getattr(self.train, 'shape', None)),
            "Test: " + str(getattr(self.test, 'shape', None)),
            "Train features: " + str(self.train_features.shape),
            "Test features: " + str(self.test_features.shape),
            "Input column: " + str(self.input_column),
            "Class labels:" + labels,
            "Models: " + models,
            "Transforms: " + transforms,
            "Feature functions: " + funcs,
            "Metric: " + str(self.metric),
            "CV scores: " + str(self.cv_scores),
        ]
        return "\n".join(lines)

In [13]:
# Load the train and test CSV files from the data directory. The paths are
# built with os.path.join so they resolve on any OS; a literal backslash
# separator only works on Windows.
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [443]:
# Text column fed to the pipeline.
input_column = 'comment_text'

# Label columns: positions 2 through 7 of the training frame.
class_labels = list(train.columns[2:8])

# Feature extractors and text transforms. These names must already be defined
# in the kernel when this cell runs.
feature_funcs = [lengths, asterixes, uppercase_count]
transforms = [tokenize]

# Candidate classifiers. The `name` attribute is what NlpPipeline uses as the
# key for scores and predictions.
logreg = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg')
logreg.name = "Logistic regression newton"
logreg2 = LogisticRegression()
logreg2.name = "Logistic regression linear"

# Only the newton-cg model is handed to the pipeline.
models = [logreg]

In [118]:
# Build a pipeline from the loaded frames and the configuration variables.
pipeline = NlpPipeline(train, test, input_column, class_labels, feature_funcs, transforms, models)

In [119]:
# Run the configured transforms over the text column of train and test.
pipeline.apply_transforms()

Applying transforms


In [120]:
# Build the feature matrices from the configured feature functions
# (defaults: raw text as input, normalisation on).
pipeline.engineer_features()

Engineering features


In [133]:
def lengths(series):
    """Return ``len`` of every entry of ``series`` as a float column.

    Parameters
    ----------
    series : pandas.Series
        Entries must support ``len``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(series), 1)``.
    """
    per_entry = series.apply(len)
    column = np.array(per_entry).reshape(-1, 1)
    return column.astype(float)

In [134]:
# Append the length feature, computed on the raw (untransformed) text column.
pipeline.add_feature(lengths, use_transform=False)

Adding feature


In [294]:
# Display the pipeline summary.
# NOTE(review): `pipeline4` is not created in any cell shown here; confirm
# where it is defined before relying on a fresh-kernel run.
pipeline4

Train: (159571, 8)
Test: (153164, 2)
Train features: (159571, 301)
Test features: (153164, 301)
Input column: comment_text
Class labels: toxic severe_toxic obscene threat insult identity_hate
Models: Logistic regression newton: 30.0 balanced False True 1 100 ovr 1 l2 None newton-cg 0.0001 0 False | 
Transforms:  tokenize
Feature functions:  w2v
Metric: roc_auc
CV scores: {'Logistic regression C=30 balanced newton-cg': 0.96642205385485747, 'Logistic regression C=1': 0.96450135056771169, 'Logistic regression newton': 0.96642205385485747, 'Logistic regression linear': 0.96450135056771169}

In [291]:
# Inspect the models attached to pipeline4.
pipeline4.models

[LogisticRegression(C=30.0, class_weight='balanced', dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)]

In [303]:
# Cross-validate every model on the current feature matrix, one label at a time.
pipeline4.cross_val()

Cross-validating
LogisticRegression(C=30.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)
Cross-validating toxic
roc_auc0.957181539826
Cross-validating severe_toxic
roc_auc0.979454310046
Cross-validating obscene
roc_auc0.966984833341
Cross-validating threat
roc_auc0.967254666722
Cross-validating insult
roc_auc0.964695071303
Cross-validating identity_hate
roc_auc0.959227665704


In [322]:
# Display the pipeline summary again to check feature shapes and scores.
pipeline4

Train: (159571, 8)
Test: (153164, 2)
Train features: (159571, 302)
Test features: (153164, 302)
Input column: comment_text
Class labels: toxic severe_toxic obscene threat insult identity_hate
Models: Logistic regression newton: 30.0 balanced False True 1 100 ovr 1 l2 None newton-cg 0.0001 0 False | 
Transforms:  tokenize
Feature functions:  w2v lengths asterixes
Metric: roc_auc
CV scores: {'Logistic regression C=30 balanced newton-cg': 0.96642205385485747, 'Logistic regression C=1': 0.96450135056771169, 'Logistic regression newton': 0.96579968115716197, 'Logistic regression linear': 0.96450135056771169}

In [None]:
def lengths(series):
    """Return ``len`` of every entry of ``series`` as a float column.

    Note: this re-defines a function of the same name from an earlier cell
    with the same behaviour.

    Parameters
    ----------
    series : pandas.Series
        Entries must support ``len``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(series), 1)``.
    """
    per_entry = series.apply(len)
    column = np.array(per_entry).reshape(-1, 1)
    return column.astype(float)

In [323]:
def asterixes(series):
    """Count the ``'!'`` characters in every entry of ``series``.

    Despite the function name, the character counted is the exclamation mark,
    not the asterisk.

    Parameters
    ----------
    series : pandas.Series of str

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(series), 1)``.
    """
    def count_marks(text):
        return text.count('!')

    counts = np.array(series.apply(count_marks))
    return counts.reshape(-1, 1).astype(float)

In [331]:
def uppercase_count(series):
    """Count the ASCII upper-case letters (A-Z) in every entry of ``series``.

    Parameters
    ----------
    series : pandas.Series of str

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(series), 1)``.
    """
    def count_upper(text):
        return len(re.findall(r'[A-Z]', text))

    counts = np.array(series.apply(count_upper))
    return counts.reshape(-1, 1).astype(float)

In [334]:
# add_feature() appends the function to pipeline4.feature_functions itself,
# so a second explicit append would list uppercase_count twice in the
# summary and in the submission metadata.
pipeline4.add_feature(uppercase_count)

Adding feature


In [342]:
# Cross-validate again on the current feature matrix.
pipeline4.cross_val()

Cross-validating
LogisticRegression(C=30.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)
Cross-validating toxic
roc_auc0.959831859074
Cross-validating severe_toxic
roc_auc0.98111790585
Cross-validating obscene
roc_auc0.968355363638
Cross-validating threat
roc_auc0.967321720965
Cross-validating insult
roc_auc0.965616720284
Cross-validating identity_hate
roc_auc0.959071625642


In [330]:
# Write a submission CSV per model from pipeline4.predictions.
# NOTE(review): predictions are only set by fit_predict(); no call to it on
# pipeline4 is visible in these cells, so confirm it ran first.
pipeline4.create_submission()

Creating submissions


In [343]:
# Show the stored score per model name.
pipeline4.cv_scores

{'Logistic regression C=1': 0.96450135056771169,
 'Logistic regression C=30 balanced newton-cg': 0.96642205385485747,
 'Logistic regression linear': 0.96450135056771169,
 'Logistic regression newton': 0.96688586590880032}

In [419]:
# Create a new pipeline object from the same inputs and configuration.
p5 = NlpPipeline(train, test, input_column, class_labels, feature_funcs, transforms, models)

In [420]:
# Point p5 at the state already computed on pipeline4. The attributes are
# assigned by reference, so both pipelines share the same objects afterwards.
for _attr in ("models", "predictions", "train_features", "test_features", "feature_functions", "cv_scores"):
    setattr(p5, _attr, getattr(pipeline4, _attr))

In [421]:
# Display the summary of p5 to confirm the copied state.
p5

Train: (159571, 8)
Test: (153164, 2)
Train features: (159571, 303)
Test features: (153164, 303)
Input column: comment_text
Class labels: toxic severe_toxic obscene threat insult identity_hate
Models: Logistic regression newton: 30.0 balanced False True 1 100 ovr 1 l2 None newton-cg 0.0001 0 False | 
Transforms:  tokenize
Feature functions:  w2v lengths asterixes uppercase_count
Metric: roc_auc
CV scores: {'Logistic regression C=30 balanced newton-cg': 0.96642205385485747, 'Logistic regression C=1': 0.96450135056771169, 'Logistic regression newton': 0.96688586590880032, 'Logistic regression linear': 0.96450135056771169}

In [422]:
# Write submission files from the predictions now attached to p5.
p5.create_submission()

Creating submissions


In [480]:
# Start another pipeline from scratch with the same inputs and configuration.
p6 = NlpPipeline(train, test, input_column, class_labels, feature_funcs, transforms, models)

In [481]:
# Build the feature matrices for p6 from its feature functions.
p6.engineer_features()


Engineering features


In [482]:
# Apply the configured transforms to the text column of both frames.
p6.apply_transforms()

Applying transforms


In [483]:
# Append word2vec embedding columns to p6's feature matrices.
# NOTE(review): the recorded output of this cell is a MemoryError.
p6.create_embeddings(w2v)

Creating embeddings


MemoryError: 

In [487]:
# Display the w2v function object to check that the name is bound in the kernel.
w2v

<function __main__.w2v>