In [1]:
import pandas as pd
import numpy as np
import os
import re
import string
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the competition train/test CSVs and replace missing values with a single space.
# Paths are built with os.path.join so the notebook is not tied to the Windows
# backslash separator (on Windows the resulting strings are unchanged).
train = pd.read_csv(os.path.join('data', 'train.csv')).fillna(' ')
test = pd.read_csv(os.path.join('data', 'test.csv')).fillna(' ')

In [3]:
from nlp_pipeline import *

In [4]:
# Presumably the embedding dimensionality (every listed file is 300-d);
# N_DIM is not referenced by any cell visible in this notebook.
N_DIM = 300
# Pretrained embedding file handed to get_pretrained / NlpPipeline below.
# Built with os.path.join so the path also resolves on non-Windows systems.
# Alternatives that were tried:
# pretrained = os.path.join("data", "GoogleNews-vectors-negative300.bin.gz")
# pretrained = os.path.join("data", "crawl-300d-2M.vec")
pretrained = os.path.join("data", "glove.840B.300d.txt")

In [5]:
# Load the pretrained embeddings from the configured path via get_pretrained.
# get_pretrained is not defined in this notebook — presumably it comes from
# `from nlp_pipeline import *`; its return type is not visible here.
# Alternative loader for the word2vec binary, kept for reference:
# w2v = gensim.models.KeyedVectors.load_word2vec_format(pretrained, binary=True)
glove = get_pretrained(pretrained)

In [6]:
# Text column fed to the pipeline and the label columns (columns 2..7 of train).
input_column = 'comment_text'
class_labels = list(train.columns[2:8])

# Per-comment feature functions and text transforms handed to NlpPipeline.
# Apart from the builtin `len`, these helpers are not defined in this notebook
# (presumably provided by the star import from nlp_pipeline).
feature_funcs = [
    len,
    asterix_freq,
    uppercase_freq,
    line_change_freq,
    rep_freq,
    question_freq,
]
transforms = [tokenize]

# Base model: a LightGBM classifier. `lgb` is not imported explicitly in this
# notebook — presumably it is also pulled in by the star import.
gbm = lgb.LGBMClassifier(
    max_depth=3,
    metric="auc",
    n_estimators=125,
    num_leaves=10,
    boosting_type="gbdt",
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    reg_lambda=0.2,
)
gbm.name = "LightGBM"  # display name attached to the model object
models = [gbm]

In [7]:
# Assemble the pipeline from the data frames, configuration and pretrained
# embeddings prepared in the cells above (NlpPipeline comes from nlp_pipeline).
pipeline = NlpPipeline(
    train,
    test,
    input_column,
    class_labels,
    feature_funcs,
    transforms,
    models,
    word_index=glove,
    pretrained=pretrained,
)

In [8]:
# Run the pipeline's feature-engineering stage (it logs "Engineering features").
# Presumably applies the feature_funcs passed to NlpPipeline — implementation not shown here.
pipeline.engineer_features()

Engineering features


In [9]:
# Run the pipeline's transform stage (it logs "Applying transforms").
# Presumably applies the `transforms` list (tokenize) to the input column — TODO confirm.
pipeline.apply_transforms()

Applying transforms


In [10]:
# Run the pipeline's embedding stage (it logs "Creating embeddings").
# Presumably uses the word_index / pretrained arguments given at construction — TODO confirm.
pipeline.create_embeddings()

Creating embeddings


In [14]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [28]:
from sklearn.svm import SVC

In [29]:
# Linear-kernel support vector classifier with C=0.025.
# NOTE(review): `probability` is left at its default (the repr printed by the
# fit cell shows probability=False), so this estimator has no predict_proba —
# confirm the pipeline does not rely on probability outputs.
sv = SVC(kernel="linear", C=0.025)

In [30]:
# Attach a display name to the estimator and make it the pipeline's only model.
# Presumably the pipeline reads `.name` to key its results (cv_scores is keyed
# by model names) — verify in nlp_pipeline.
sv.name = "SVM"
pipeline.models = [sv]

In [None]:
# Per its log output, builds the out-of-fold meta training set for a stacker
# using the models currently set on the pipeline.
# NOTE(review): this cell has no execution count and its recorded output ends
# after the first label ("toxic"), so the run presumably never finished.
pipeline.fit_predict_oof()

Creating out-of-fold meta training set for stacker
SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
toxic


In [27]:
# Write the submission output (the pipeline logs "Creating submissions").
# NOTE(review): the execution count here (27) is lower than the SVM cells above
# (28-30), so the recorded output comes from an out-of-order run.
pipeline.create_submission()

Creating submissions


In [26]:
# Display the per-model cross-validation scores stored on the pipeline.
# NOTE(review): the recorded output lists "LightGBM" and "Quadratic Discriminant
# Analysis", not the SVM set in the cells above — it is stale output from an
# earlier, out-of-order execution.
pipeline.cv_scores

{'LightGBM': 0.9715808951286572,
 'Quadratic Discriminant Analysis': 0.84085077791084495}

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Re-read the train/test CSVs, replacing missing values with a single space.
# Paths are built with os.path.join so the notebook is not tied to the Windows
# backslash separator (on Windows the resulting strings are unchanged).
train = pd.read_csv(os.path.join('data', 'train.csv')).fillna(' ')
test = pd.read_csv(os.path.join('data', 'test.csv')).fillna(' ')

In [None]:
# Pull the raw comment column from both frames, then build the combined corpus
# (train comments followed by test comments).
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [None]:
# Character-level TF-IDF: 1- to 5-character n-grams, accents stripped,
# sublinear term-frequency scaling, vocabulary capped at 5000 features.
char_tfidf_options = dict(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 5),
    max_features=5000,
)
char_vectorizer = TfidfVectorizer(**char_tfidf_options)

In [None]:
# Fit the character TF-IDF vectorizer on the combined corpus, i.e. on both
# train and test comments (all_text is their concatenation).
char_vectorizer.fit(all_text)

In [None]:
# Project each split onto the fitted character n-gram vocabulary.
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [None]:
# Re-run the pipeline's feature-engineering stage.
# NOTE(review): this stage was already run earlier in the notebook; whether a
# second call is idempotent depends on NlpPipeline — confirm.
pipeline.engineer_features()

In [None]:
# Normalize the engineered features of each split and store them back in place.
# NOTE(review): train and test go through separate normalize() calls; if
# normalize derives its statistics from the data it is given, the two splits
# end up on different scales — confirm against nlp_pipeline.
# Re-running this cell normalizes the already-normalized features again.
pipeline.train_features = pipeline.normalize(pipeline.train_features)
pipeline.test_features = pipeline.normalize(pipeline.test_features)

In [None]:
# Inspect the dimensions of the pipeline's current training feature matrix.
pipeline.train_features.shape

In [None]:
# Peek at the first 10 rows of the character TF-IDF training features.
train_char_features[:10]

In [None]:
# Replace the pipeline's training features with the character TF-IDF matrix;
# whatever features the pipeline held before are discarded.
pipeline.train_features = train_char_features

In [None]:
# Replace the pipeline's test features with the character TF-IDF matrix,
# mirroring what was done for the training features.
pipeline.test_features = test_char_features

In [None]:
# Check which container type the pipeline's test features currently use.
type(pipeline.test_features)

In [None]:
from scipy import sparse

In [None]:
# `sparse` is the scipy.sparse module, not a constructor, so calling it directly
# raises "TypeError: 'module' object is not callable". Build the sparse matrix
# through an explicit constructor instead; a 1-D array becomes a 1 x 3 CSR matrix.
sparse_example = sparse.csr_matrix(np.array([1, 2, 3]))
sparse_example

In [None]:
# hstack must come from scipy.sparse: the top-level `scipy.hstack` was only a
# deprecated alias of numpy.hstack (no longer exported by recent SciPy releases)
# and does not know how to concatenate sparse matrices.
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix

In [None]:
# Column-wise stack of the pipeline's training features with the character
# TF-IDF features. Uses scipy.sparse.hstack explicitly, because a NumPy-style
# hstack does not understand scipy.sparse matrices. The result gets its own
# name so the `test` DataFrame loaded earlier is not silently overwritten.
stacked_train_features = sparse.hstack(
    [sparse.csr_matrix(pipeline.train_features), train_char_features]
).tocsr()

In [None]:
# Check the type returned by converting the pipeline's training features to CSR.
# Requires pipeline.train_features to expose .tocsr(), i.e. to already be a
# scipy.sparse matrix at this point.
type(pipeline.train_features.tocsr())

In [None]:
# Convert the pipeline's training features to CSR, then append the character
# TF-IDF columns to the right and keep the result in CSR format.
# NOTE(review): the combined matrix is stored only in `train_features`; no
# later cell visible here reads it or assigns it to the pipeline.
pipeline_features_sparse = sparse.csr_matrix(pipeline.train_features)
train_features = sparse.hstack([pipeline_features_sparse, train_char_features]).tocsr()

In [None]:
# Point the pipeline's training features at the character TF-IDF matrix
# (repeats an identical assignment made in an earlier cell).
pipeline.train_features = train_char_features

In [None]:
# Point the pipeline's test features at the character TF-IDF matrix
# (repeats an identical assignment made in an earlier cell).
pipeline.test_features = test_char_features

In [None]:
# Display the pipeline object's repr as a quick sanity check.
pipeline

In [None]:
# Cross-validate the pipeline's current models on its current training features.
# Presumably this fills pipeline.cv_scores — TODO confirm in nlp_pipeline.
pipeline.cross_val()

In [None]:
# Second cross-validation run, identical to the call in the previous cell.
# NOTE(review): looks like a leftover duplicate — confirm whether it is needed.
pipeline.cross_val()

In [None]:
# Display the cross-validation scores stored on the pipeline.
pipeline.cv_scores

In [None]:
# Presumably fits the pipeline's models on the training features and predicts
# on the test features — implementation lives in nlp_pipeline, not shown here.
pipeline.fit_predict()

In [None]:
# Write the submission output from the pipeline's current predictions
# (an earlier run of this method logged "Creating submissions").
pipeline.create_submission()

In [None]:
# Completion marker: prints once every preceding cell in the run has finished.
print("OK")

In [None]:
# Display the list of models currently attached to the pipeline.
pipeline.models