In [None]:
import pandas as pd
import numpy as np
from nlp_pipeline import *
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the train and test CSVs; every missing value is replaced by a single
# space. Forward slashes are used so the relative paths resolve on Windows,
# Linux and macOS alike (a backslash is only a separator on Windows).
train = pd.read_csv('data/train.csv').fillna(' ')
test = pd.read_csv('data/test.csv').fillna(' ')

In [None]:
# Path of the pretrained vector file that is handed to get_pretrained() and
# NlpPipeline further down. The commented-out line is an alternative file kept
# for quick switching.
# NOTE(review): both paths use Windows-style backslash separators — confirm
# before running on another OS.
# pretrained = 'data\\glove.6B.300d.txt'
pretrained = "data\\crawl-300d-2M.vec"

In [None]:
# Announce which file is about to be loaded, then load it.
# get_pretrained is not defined in this notebook — presumably it comes from
# the star import of nlp_pipeline; verify there for the returned type.
print("Getting pretrained model from", pretrained)
vector_model = get_pretrained(pretrained)

In [None]:
# --- Pipeline configuration -------------------------------------------------
# Target columns: the six columns at positions 2..7 of the training frame.
class_labels = list(train.columns[2:8])
# Callables handed to NlpPipeline as feature functions / text transforms.
# Apart from the builtin len they are not defined in this notebook
# (presumably provided by nlp_pipeline).
feature_funcs = [len, asterix_freq, uppercase_freq, line_change_freq, rep_freq, question_freq]
transforms = [tokenize]

# --- Candidate estimators ---------------------------------------------------
# Each estimator gets a human-readable ``name`` attribute attached.
# The randomized ensembles are seeded (random_state=0, matching the seed
# already used for the extra-trees model) so repeated runs give the same result.
logreg = LogisticRegression(C=0.2, class_weight='balanced', solver='newton-cg', max_iter=10)
logreg.name = "Logistic regression newton"
ada = AdaBoostClassifier(random_state=0)
ada.name = "Ada"
nb = GaussianNB()
nb.name = "Naive Bayes"
rf = RandomForestClassifier(max_depth=5, n_estimators=50, random_state=0)
rf.name = "Random Forest"
ef = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
ef.name = "Extra Trees"
knn = KNeighborsClassifier(3)
knn.name = "KNN"

# Only the logistic regression is passed to the pipeline; the other
# estimators stay available for later cells.
models = [logreg]

In [None]:
# Assemble the NLP pipeline from the data frames, the text column name, the
# label list, the feature/transform callables and the candidate models, then
# print its string representation.
pipe = NlpPipeline(
    train,
    test,
    "comment_text",
    class_labels,
    feature_funcs,
    transforms,
    models,
    word_vectors=vector_model,
    pretrained=pretrained,
)
print(pipe)

In [None]:
# Display the cv_scores attribute of the pipeline built above.
pipe.cv_scores

In [None]:
# Hyper-parameter grid for the grid search: three values of C crossed with
# two values of max_iter (six combinations in total).
param_grid = dict(
    C=[0.05, 0.1, 0.2],
    max_iter=[10, 15],
)

In [None]:
# Exhaustive search over param_grid for the logistic regression, scored by
# ROC AUC with 3-fold cross-validation on all available cores; the best
# configuration is refit on the full data afterwards.
# The former ``iid`` keyword is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
model = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1,
    refit=True,
    cv=3,
)

In [None]:
# Run the grid search on the pipeline's training features, using the
# "severe_toxic" column of the pipeline's training frame as the target.
model.fit(pipe.train_features, pipe.train["severe_toxic"])

In [None]:
# Best cross-validated score found by the search. The trailing note is the
# author's recorded result — presumably from an earlier run with those settings.
model.best_score_ # C=10, max_iter=50, 0.9674429

In [None]:
# Best cross-validated score found by the search. The trailing note is the
# author's recorded result — presumably from an earlier run with those settings.
model.best_score_ # C=1, max_iter=20, 0.98428259062386858

In [None]:
# Parameter combination that achieved the best score in the grid search.
model.best_params_

In [None]:
# Best cross-validated score found by the search. The trailing note is the
# author's recorded result — presumably from a run with those settings.
model.best_score_ # C=0.2, max_iter=15, 0.98506603966572748

In [None]:
# Best cross-validated score found by the search. The trailing note is the
# author's recorded result — presumably from a run with those settings
# (they match the values used to construct logreg).
model.best_score_ # C=0.2, max_iter=10, 0.98524977008581871

In [None]:
# Set the pipeline's model list to just the logistic regression.
pipe.models = [logreg]

In [None]:
# Display the pipeline's cv_scores attribute again after resetting its models.
pipe.cv_scores

In [None]:
# Call the pipeline's fit_predict(); its behaviour is defined in NlpPipeline
# (not visible here) — presumably it fits the models and predicts on the test set.
pipe.fit_predict()

In [None]:
# Call the pipeline's create_submission(); presumably writes a submission
# file — verify the output location in NlpPipeline.
pipe.create_submission()

In [None]:
from sklearn.model_selection import KFold

In [None]:
class Ensemble2(object):
    """Two-level stacking ensemble built from out-of-fold predictions.

    Every base model is fit once per cross-validation fold. Its predictions on
    the held-out part of each fold form one column of the level-two training
    matrix, and its predictions on the test set, averaged over the folds, form
    the matching column of the level-two test matrix. The stacker is then fit
    on the former and predicts on the latter.

    Parameters
    ----------
    n_folds : int
        Number of K-fold splits used to produce out-of-fold predictions.
    stacker : estimator
        Level-two model; must provide ``fit`` and ``predict_proba``.
    base_models : sequence of estimators
        Level-one models; each must provide ``fit`` and ``predict_proba``.
    """

    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    @staticmethod
    def _take_rows(data, positions):
        """Return the rows of ``data`` at the given integer positions.

        pandas objects are indexed through ``.iloc`` so the selection is
        positional regardless of their index labels; anything else (NumPy
        arrays, sparse matrices, ...) is indexed directly.
        """
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return data[positions]

    def fit_predict(self, X, y, T):
        """Fit the base models and the stacker, then predict for ``T``.

        Parameters
        ----------
        X : array-like with ``shape`` and row indexing
            Training features, one row per sample.
        y : array-like
            Training targets aligned row-by-row with ``X``. A pandas Series
            with any index is accepted; it is read positionally.
        T : array-like with ``shape``
            Test features, passed unchanged to each model's ``predict_proba``.

        Returns
        -------
        numpy.ndarray
            Column 1 of the stacker's ``predict_proba`` output for the test
            set (for a binary classifier, presumably the positive class).

        Notes
        -----
        The base models and the stacker are fit in place. The ROC AUC of every
        fold is printed as a progress indicator.
        """
        folds = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        # KFold yields integer positions, so the targets are converted to a
        # plain array: indexing a pandas Series with ``y[idx]`` would look the
        # positions up as index labels instead.
        y_values = np.asarray(y)

        n_models = len(self.base_models)
        S_train = np.zeros((X.shape[0], n_models))
        S_test = np.zeros((T.shape[0], n_models))

        for i, clf in enumerate(self.base_models):
            print(clf)
            # One column of test-set predictions per fold, averaged below.
            S_test_i = np.zeros((T.shape[0], self.n_folds))

            for j, (train_idx, test_idx) in enumerate(folds.split(X)):
                X_train = self._take_rows(X, train_idx)
                y_train = y_values[train_idx]
                X_holdout = self._take_rows(X, test_idx)
                y_holdout = y_values[test_idx]

                clf.fit(X_train, y_train)
                y_pred = clf.predict_proba(X_holdout)[:, 1]
                # Out-of-fold predictions: each training row is predicted by a
                # model that never saw it during fitting.
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:, 1]
                print(roc_auc_score(y_holdout, y_pred))

            S_test[:, i] = S_test_i.mean(1)

        self.stacker.fit(S_train, y_values)
        y_pred = self.stacker.predict_proba(S_test)[:, 1]
        return y_pred

In [None]:
# 5-fold stacking ensemble with logreg, rf and ada as base models.
# NOTE(review): the very same logreg object is passed both as the stacker and
# as a base model, so fitting the stacker overwrites that base model's fit —
# confirm this sharing is intended.
stacker = Ensemble2(5, logreg, [logreg, rf, ada])

In [None]:
# Stacked test-set predictions for the single "severe_toxic" target.
preds = stacker.fit_predict(pipe.train_features, pipe.train["severe_toxic"], pipe.test_features)

In [None]:
# Container for per-label results; filled by the loop in the next cell,
# keyed by class label.
predictions = {}

In [None]:
# Re-run the full stacking procedure once per class label and keep the
# resulting test-set predictions under that label.
for label in pipe.class_labels:
    predictions[label] = stacker.fit_predict(
        pipe.train_features,
        pipe.train[label],
        pipe.test_features,
    )