In [2]:
import os
import numpy as np
import pandas as pd
from nlp_pipeline import *

In [3]:
# Read both CSV splits; fillna(' ') replaces every missing value (in every
# column) with a single space. Paths are assembled with os.path.join so the
# notebook is not tied to the Windows path separator.
train = pd.read_csv(os.path.join('data', 'train.csv')).fillna(' ')
test = pd.read_csv(os.path.join('data', 'test.csv')).fillna(' ')

In [4]:
# Location of the pretrained embedding file, built with the platform's own
# separator (still 'data\glove.840B.300d.txt' on Windows).
pretrained = os.path.join('data', 'glove.840B.300d.txt')

In [5]:
# Report which embedding file is about to be read, then load it.
# get_pretrained is not defined in this notebook; presumably it comes from the
# nlp_pipeline star-import -- confirm.
print("Getting pretrained model from", pretrained)
vector_model = get_pretrained(pretrained)

Getting pretrained model from data\glove.840B.300d.txt


In [6]:
# Keep only the first 10000 rows (by position) of each frame.
train, test = train.iloc[:10000], test.iloc[:10000]

In [9]:
# Target labels: the training-frame columns at positions 2 through 7.
class_labels = list(train.columns[2:8])
# Feature functions and text transforms handed to the pipeline.
# NOTE(review): the recorded pipeline summary also lists avg_word_length among
# the feature functions, but it is not part of this list -- confirm which is intended.
feature_funcs = [
    len,
    asterix_freq,
    uppercase_freq,
    line_change_freq,
    rep_freq,
    question_freq,
]
transforms = [tokenize]
# Single candidate model; the `name` attribute is attached to the estimator by hand.
logreg = LogisticRegression(
    C=30.0,
    class_weight='balanced',
    solver='newton-cg',
)
logreg.name = "Logistic regression newton"
models = [logreg]

In [8]:
def avg_word_length(x):
    """Return the mean length of the whitespace-separated words in ``x``.

    Words are split on any run of whitespace, so repeated spaces do not
    contribute zero-length "words" and newlines/tabs act as separators.

    Args:
        x: Text to measure (a ``str``).

    Returns:
        The mean word length, or ``0.0`` when ``x`` holds no words
        (empty or whitespace-only text).
    """
    word_lengths = [len(word) for word in x.split()]
    if not word_lengths:
        # np.mean of an empty list would warn and return nan.
        return 0.0
    return np.mean(word_lengths)

In [10]:
# Assemble the pipeline from the frames, the text column name, the labels,
# feature functions, transforms, models and the loaded word vectors, then
# print its summary.
pipe = NlpPipeline(
    train,
    test,
    "comment_text",
    class_labels,
    feature_funcs,
    transforms,
    models,
    word_vectors=vector_model,
)
print(pipe)

Train: (10000, 8)
Test: (10000, 2)
Train features: (0,)
Test features: (0,)
Input column: comment_text
Class labels: toxic severe_toxic obscene threat insult identity_hate
Models: Logistic regression newton: 30.0 balanced False True 1 100 ovr 1 l2 None newton-cg 0.0001 0 False | 
Transforms:  tokenize
Feature functions:  function asterix_freq uppercase_freq line_change_freq rep_freq question_freq avg_word_length
Metric: roc_auc
CV scores: {'Logistic regression newton': -1}


In [10]:
# Execute the pipeline; the stages it goes through are reported in the output below.
pipe.run()

Engineering features
Applying transforms
Creating embeddings
Cross-validating
LogisticRegression(C=30.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)
Cross-validating toxic
roc_auc: 0.9506399333511346
Cross-validating severe_toxic
roc_auc: 0.9118874083358671
Cross-validating obscene
roc_auc: 0.9578279043009091
Cross-validating threat
roc_auc: 0.9317573176845425
Cross-validating insult
roc_auc: 0.9538170169144499
Cross-validating identity_hate
roc_auc: 0.9371347571530688
Fitting and predicting
Fitting submission classifier for toxic
Fitting submission classifier for severe_toxic
Fitting submission classifier for obscene
Fitting submission classifier for threat
Fitting submission classifier for insult
Fitting submission classifier for identity_hate
Creating submissions


In [11]:
# Second execution of the same pipeline object.
# NOTE(review): the recorded scores differ slightly from the previous run --
# presumably some step is not seeded; confirm before comparing runs.
pipe.run()

Engineering features
Applying transforms
Creating embeddings
Cross-validating
LogisticRegression(C=30.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)
Cross-validating toxic
roc_auc: 0.9504455588890061
Cross-validating severe_toxic
roc_auc: 0.9070486249195193
Cross-validating obscene
roc_auc: 0.9578249857951221
Cross-validating threat
roc_auc: 0.9357188306850641
Cross-validating insult
roc_auc: 0.9537063551922313
Cross-validating identity_hate
roc_auc: 0.9371930174270675
Fitting and predicting
Fitting submission classifier for toxic
Fitting submission classifier for severe_toxic
Fitting submission classifier for obscene
Fitting submission classifier for threat
Fitting submission classifier for insult
Fitting submission classifier for identity_hate
Creating submissions


In [13]:
# Read the submissions metadata CSV; the path is built with os.path.join so it
# is not tied to the Windows path separator.
df = pd.read_csv(os.path.join('submissions', 'submeta.csv'))

In [14]:
# Display the submissions metadata frame loaded from submeta.csv.
df

Unnamed: 0,submission,filename,model,pretrained,feature_funcs,transforms,cv_score
0,1,submissions\submission1.csv,Logistic regression C=30 balanced newton-cg,w2v,lengths,tokenize,-1.0
1,2,submissions\submission2.csv,Logistic regression C=1,w2v,lengths,tokenize,-1.0
2,3,submissions\submission3.csv,Logistic regression C=30 balanced newton-cg,w2v,lengths,tokenize,0.966422
3,4,submissions\submission4.csv,Logistic regression C=1,w2v,lengths,tokenize,0.964501
4,9,submissions\submission9.csv,Logistic regression newton: 30.0 balanced Fals...,w2v,lengths,tokenize,0.966422
5,10,submissions\submission10.csv,Logistic regression linear: 1.0 None False Tru...,w2v,lengths,tokenize,0.964501
6,11,submissions\submission10.csv,Logistic regression newton: 30.0 balanced Fals...,w2v,,tokenize,0.9658
7,12,submissions\submission10.csv,Logistic regression newton: 30.0 balanced Fals...,w2v,lengths asterixes,tokenize,0.966697
8,13,submissions\submission11.csv,Logistic regression newton: 30.0 balanced Fals...,w2v,lengths asterixes uppercase_count,tokenize,0.966886
9,7,submissions\submission7.csv,Logistic regression newton: 30.0 balanced Fals...,w2v,,tokenize,0.970816
