# Project 2

## Process
* create a small data set (only 12 elements) to look at text processing and see how BoW vectorization works
* create vectors for the data
    * default
    * exclude common words
    * exclude rare words
    * count the number of times a word is used in a single review, or just record whether it is present at all?
    * look at word pairs? word triples? expensive to generate, could produce better results
* train a simple logistic regression on the data
    * look at all different vectors, see which set is best
* tweak logistic hyperparams
* train MLP
* train decision tree

In [1]:
# imports

import re
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import minmax_scale

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier


## Data processing
* Removing whitespace and punctuation
* Converting to all lowercase
* Generating vectors

### Default BoW

In [None]:
# # count vectorizer
# raw_reviews = pd.read_csv('data/data_reviews/x_train.csv')['text'].values.tolist()
# reviews = list()

# pattern = re.compile('[^a-z ]')
# for review in raw_reviews:
#     review = review.lower()
#     review = pattern.sub('', review)
#     reviews.append(review)

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(reviews)
# y = pd.read_csv('data/data_reviews/y_train.csv').to_numpy().ravel()

In [None]:
# # TF-IDF vectorizer
# raw_reviews = pd.read_csv('data/data_reviews/x_train.csv')['text'].values.tolist()
# reviews = list()

# pattern = re.compile('[^a-z ]')
# for review in raw_reviews:
#     review = review.lower()
#     review = pattern.sub('', review)
#     reviews.append(review)

# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(reviews)
# y = pd.read_csv('data/data_reviews/y_train.csv').to_numpy().ravel()

In [None]:
# # n-grams
# raw_train_reviews = pd.read_csv('data/data_reviews/x_train.csv')['text'].values.tolist()
# train_reviews = list()
# raw_test_reviews = pd.read_csv('data/data_reviews/x_test.csv')['text'].values.tolist()
# test_reviews = list()

# pattern = re.compile('[^a-z ]')
# for review in raw_train_reviews:
#     review = review.lower()
#     review = pattern.sub('', review)
#     train_reviews.append(review)

# for review in raw_test_reviews:
#     review = review.lower()
#     review = pattern.sub('', review)
#     test_reviews.append(review)

# vectorizer = CountVectorizer(ngram_range=(1,3))
# fitter = vectorizer.fit(train_reviews)
# X = fitter.transform(train_reviews)
# y = pd.read_csv('data/data_reviews/y_train.csv').to_numpy().ravel()

# vectorizer = CountVectorizer(ngram_range=(1,3))
# X_test = fitter.transform(test_reviews)

In [2]:
# n-grams
# Read the raw review text for both splits.
raw_train_reviews = pd.read_csv('data/data_reviews/x_train.csv')['text'].values.tolist()
raw_test_reviews = pd.read_csv('data/data_reviews/x_test.csv')['text'].values.tolist()

# Clean-up rule: after lowercasing, delete every character that is not a
# lowercase ASCII letter or a space.
pattern = re.compile('[^a-z ]')

train_reviews = list()
for review in raw_train_reviews:
    review = pattern.sub('', review.lower())
    train_reviews.append(review)

test_reviews = list()
for review in raw_test_reviews:
    review = pattern.sub('', review.lower())
    test_reviews.append(review)

all_reviews = train_reviews + test_reviews

# The vocabulary (unigrams through trigrams) is built from the train AND test
# text together; each split is then encoded with that shared vocabulary.
vectorizer = CountVectorizer(ngram_range=(1,3))
fitter = vectorizer.fit(all_reviews)
# all_X = fitter.transform(all_reviews)
X_train, X_test = fitter.transform(train_reviews), fitter.transform(test_reviews)

# print(all_X.shape)
# print(X_train.shape)
# print(X_test.shape)
# Labels: a plain list for the training split, a numpy array for the test split.
train_labels = pd.read_csv('data/data_reviews/y_train.csv')
test_labels = pd.read_csv('data/data_reviews/y_test.csv')
y_train = list(train_labels['is_positive_sentiment'])
y_test = np.array(test_labels['is_positive_sentiment'])
# y = y_train + y_test

# final_model = linear_model.LogisticRegression(solver='liblinear', penalty='l1', C=18.4, tol=0.008)


# all_indices = list(range(2400))
# valid_indices = random.sample(all_indices, 360)
# train_indices = list()
# for i in all_indices:
#     if i not in valid_indices:
#         train_indices.append(i)

# valid_x = list()
# valid_y = list()
# r = list()
# train_x = list()
# train_y = list()
# for i in valid_indices:
#     valid_x.append(X_train[i].toarray())
#     valid_y.append(y_train[i])
#     r.append(raw_train_reviews[i])
# for i in train_indices:
#     train_x.append(X_train[i].toarray())
#     train_y.append(y_train[i])
# train_x = np.array(train_x).reshape(2040, -1)
# valid_x = np.array(valid_x).reshape(360, -1)
# print(train_x.shape)

In [9]:
# Fit the final classifier: L1-penalised logistic regression (liblinear solver)
# with hard-coded C and tol values, trained on the full training matrix.
final_model = linear_model.LogisticRegression(penalty='l1', solver='liblinear', tol=0.008, C=18.4)
final_model.fit(X_train, y_train)

# Decision-function values on the training rows, min-max rescaled as a single
# column. The result is not used again in this cell.
train_margins = final_model.decision_function(X_train)
train_probas = minmax_scale(train_margins.reshape(-1,1))
# test_probas = minmax_scale(final_model.decision_function(X_test).reshape(-1,1))

# Second predict_proba column for every test review (presumably the positive
# class -- confirm against final_model.classes_).
test_proba_matrix = final_model.predict_proba(X_test)
test_probas = test_proba_matrix[:,1]
print(test_probas)
test_loss = log_loss(y_test, test_probas)
print(test_loss)
# preds = final_model.predict(valid_x)
# for i in range(len(valid_y)):
#     if preds[i] != valid_y[i]:
#         print(valid_y[i], r[i])

# # test_probas = minmax_scale(final_model.decision_function(X_test).reshape(-1,1))
# # print(test_probas)

# kf = KFold(shuffle=True, n_splits=5)

# for train_index, val_index in kf.split(X_train):
#     r = list()
#     x_train, x_val = X_train[train_index], X_train[val_index]
#     y_train, y_val = y_train[train_index], y_train[val_index]
#     r.append(raw_train_reviews[val_index])
#     final_model.fit(x_train, y_train)


#     y_pred = final_model.predict(x_val)
#     if y_pred[i] != y_val[i]:
#         print(y_val[i], r[i])

# Write the test-set probabilities to a text file.
np.savetxt('yproba1_test.txt', test_probas)

[9.99475481e-01 4.19447592e-02 8.14477295e-03 4.45804295e-01
 1.23038357e-01 4.22172103e-01 1.93729197e-03 9.67064286e-01
 9.91832510e-01 9.43922459e-01 8.66917310e-01 3.37643952e-03
 2.44601382e-02 9.17405034e-01 9.00710978e-01 3.66463896e-02
 2.68171909e-02 5.79932340e-02 9.68462596e-03 9.76383367e-02
 9.57538822e-01 3.24809282e-01 3.41777618e-01 2.86526244e-01
 3.98973634e-02 2.32164514e-01 1.04760349e-01 4.96542588e-01
 9.72775315e-01 2.02099129e-02 9.34368053e-01 1.14027501e-01
 1.64645068e-02 7.13353465e-03 4.47572681e-04 9.99346753e-01
 2.69400768e-01 9.74313798e-02 3.97056410e-02 9.98336388e-01
 6.47165998e-03 8.04099379e-01 4.20643155e-02 8.14674162e-01
 3.89633404e-01 3.50674536e-01 9.65079940e-01 2.72659724e-01
 6.76419202e-01 4.94219825e-02 6.10363429e-04 9.65780134e-01
 9.99960332e-01 3.52972645e-02 6.38245164e-01 5.08255589e-01
 9.53605007e-01 2.97913826e-01 7.72207294e-02 9.20870067e-01
 8.54808007e-01 9.60663627e-01 5.26458225e-01 9.11102981e-01
 9.91740848e-01 1.521217

### BOW w 

## Logistic regression

### Default

In [None]:
# Baseline: shuffled K-fold cross-validation of LogisticRegression with all
# hyperparameters left at their defaults.
log_model = linear_model.LogisticRegression()
kf = KFold(shuffle=True)

# per-fold score on the fitted rows / on the held-out rows
k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    # slice this fold's rows out of the feature matrix and label vector
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    log_model.fit(x_train, y_train)

    fold_train_score = log_model.score(x_train, y_train)
    fold_val_score = log_model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))

### LBFGS L2
best C: 8.68511373751352

In [None]:
# find best C
# Sweep C over a log-spaced grid for the lbfgs / L2 logistic regression and
# remember the value with the highest mean validation score.
c_grid = np.logspace(-2, 2, 50)

train_scores, val_scores = list(), list()
best_c, best_score = 10**-2, 0

kf = KFold(shuffle=True)
for c in c_grid:
    log_model = linear_model.LogisticRegression(solver='lbfgs', penalty='l2', max_iter=500, C=c)

    k_train_scores, k_val_scores = list(), list()

    for train_index, val_index in kf.split(X):
        x_train, y_train = X[train_index], y[train_index]
        x_val, y_val = X[val_index], y[val_index]
        log_model.fit(x_train, y_train)

        k_train_scores.append(log_model.score(x_train, y_train))
        k_val_scores.append(log_model.score(x_val, y_val))

    # fold-averaged scores for this C
    mean_train_score = np.mean(k_train_scores)
    mean_val_score = np.mean(k_val_scores)

    # strict '>' keeps the earliest C when several tie
    if mean_val_score > best_score:
        best_score = mean_val_score
        best_c = c

    train_scores.append(mean_train_score)
    val_scores.append(mean_val_score)

    print('C: {:2f}   score: {:5f}'.format(c, mean_val_score))

print('best C:', best_c)

# plot accuracy
plt.plot(c_grid, train_scores, c='r', label='train')
plt.plot(c_grid, val_scores, label='validation')
plt.xscale('log')
plt.title('Model Accuracy vs Inverse Regularization Penalty, C')
plt.xlabel('inverse regularization penalty, C')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# find best tol
# Sweep the solver stopping tolerance for the lbfgs / L2 logistic regression,
# holding C at best_c, and remember the tolerance with the highest mean
# validation score.
tols = np.logspace(-8, 4, 50)

train_scores = list()
val_scores = list()

best_tol = 10**-8
best_score = 0

kf = KFold(shuffle=True)
for t in tols:
    # tol=t is what makes this a sweep: without it every iteration would fit
    # the same configuration and the scores would differ only through the
    # random fold shuffling.
    log_model = linear_model.LogisticRegression(solver='lbfgs', penalty='l2', max_iter=500, C=best_c, tol=t)

    k_train_scores = list()
    k_val_scores = list()

    for train_index, val_index in kf.split(X):
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        log_model.fit(x_train, y_train)

        k_train_scores.append(log_model.score(x_train, y_train))
        k_val_scores.append(log_model.score(x_val, y_val))

    # fold-averaged scores for this tolerance
    mean_train_score = np.mean(k_train_scores)
    mean_val_score = np.mean(k_val_scores)

    # strict '>' keeps the earliest tolerance when several tie
    if mean_val_score > best_score:
        best_tol = t
        best_score = mean_val_score

    train_scores.append(mean_train_score)
    val_scores.append(mean_val_score)

    print('tol: {:8f}   score: {:5f}'.format(t, mean_val_score))

print('best tol:', best_tol)

# plot tolerance (x axis is the same grid the loop iterated over)
plt.title('Model Accuracy vs Tolerance')
plt.xlabel('tolerance')
plt.ylabel('accuracy')
plt.plot(tols, train_scores, c='r', label='train')
plt.plot(tols, val_scores, label='validation')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# best model details
# 10-fold shuffled cross-validation of the lbfgs / L2 model at C=best_c.

log_model = linear_model.LogisticRegression(solver='lbfgs', penalty='l2', max_iter=500, C=best_c)
kf = KFold(shuffle=True, n_splits=10)

k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    log_model.fit(x_train, y_train)

    fold_train_score = log_model.score(x_train, y_train)
    fold_val_score = log_model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))


### LBFGS none

In [None]:
# Shuffled K-fold cross-validation of an unregularised lbfgs logistic regression.
# NOTE(review): newer scikit-learn releases expect penalty=None rather than
# the string 'none' -- confirm against the installed version.
log_model = linear_model.LogisticRegression(solver='lbfgs', penalty='none', max_iter=500)
kf = KFold(shuffle=True)

k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    log_model.fit(x_train, y_train)

    fold_train_score = log_model.score(x_train, y_train)
    fold_val_score = log_model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))

### liblinear L2
best C: 

In [None]:
# find best C
# Sweep C over a log-spaced grid for the liblinear / L2 logistic regression
# and remember the value with the highest mean validation score.
c_grid = np.logspace(-2, 2, 50)

train_scores, val_scores = list(), list()
best_c, best_score = 10**-2, 0

kf = KFold(shuffle=True)
for c in c_grid:
    log_model = linear_model.LogisticRegression(solver='liblinear', penalty='l2', max_iter=500, C=c)

    k_train_scores, k_val_scores = list(), list()

    for train_index, val_index in kf.split(X):
        x_train, y_train = X[train_index], y[train_index]
        x_val, y_val = X[val_index], y[val_index]
        log_model.fit(x_train, y_train)

        k_train_scores.append(log_model.score(x_train, y_train))
        k_val_scores.append(log_model.score(x_val, y_val))

    # fold-averaged scores for this C
    mean_train_score = np.mean(k_train_scores)
    mean_val_score = np.mean(k_val_scores)

    # strict '>' keeps the earliest C when several tie
    if mean_val_score > best_score:
        best_score = mean_val_score
        best_c = c

    train_scores.append(mean_train_score)
    val_scores.append(mean_val_score)

    print('C: {:2f}   score: {:5f}'.format(c, mean_val_score))

print('best C:', best_c)

# plot accuracy
plt.plot(c_grid, train_scores, c='r', label='train')
plt.plot(c_grid, val_scores, label='validation')
plt.xscale('log')
plt.title('Model Accuracy vs Inverse Regularization Penalty, C')
plt.xlabel('inverse regularization penalty, C')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# find best tol
# Sweep the solver stopping tolerance for the liblinear / L2 logistic
# regression, holding C at best_c, and remember the tolerance with the highest
# mean validation score.
tols = np.logspace(-8, 4, 50)

train_scores = list()
val_scores = list()

best_tol = 10**-8
best_score = 0

kf = KFold(shuffle=True)
for t in tols:
    # tol=t is what makes this a sweep: without it every iteration would fit
    # the same configuration and the scores would differ only through the
    # random fold shuffling.
    log_model = linear_model.LogisticRegression(solver='liblinear', penalty='l2', max_iter=500, C=best_c, tol=t)

    k_train_scores = list()
    k_val_scores = list()

    for train_index, val_index in kf.split(X):
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        log_model.fit(x_train, y_train)

        k_train_scores.append(log_model.score(x_train, y_train))
        k_val_scores.append(log_model.score(x_val, y_val))

    # fold-averaged scores for this tolerance
    mean_train_score = np.mean(k_train_scores)
    mean_val_score = np.mean(k_val_scores)

    # strict '>' keeps the earliest tolerance when several tie
    if mean_val_score > best_score:
        best_tol = t
        best_score = mean_val_score

    train_scores.append(mean_train_score)
    val_scores.append(mean_val_score)

    print('tol: {:8f}   score: {:5f}'.format(t, mean_val_score))

print('best tol:', best_tol)

# plot tolerance (x axis is the same grid the loop iterated over)
plt.title('Model Accuracy vs Tolerance')
plt.xlabel('tolerance')
plt.ylabel('accuracy')
plt.plot(tols, train_scores, c='r', label='train')
plt.plot(tols, val_scores, label='validation')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# best model details
# 10-fold shuffled cross-validation of the liblinear / L2 model with
# hard-coded C and tol values.

log_model = linear_model.LogisticRegression(solver='liblinear', penalty='l2', max_iter=500, C=15, tol=1.6)
kf = KFold(shuffle=True, n_splits=10)

k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    log_model.fit(x_train, y_train)

    fold_train_score = log_model.score(x_train, y_train)
    fold_val_score = log_model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))


### liblinear L1
best C: 8.68511373751352

In [None]:
# find best C
# Sweep C over a log-spaced grid for the liblinear / L1 logistic regression
# and remember the value with the highest mean validation score.
c_grid = np.logspace(-2, 2, 50)

train_scores, val_scores = list(), list()
best_c, best_score = 10**-2, 0

kf = KFold(shuffle=True)
for c in c_grid:
    log_model = linear_model.LogisticRegression(solver='liblinear', penalty='l1', max_iter=500, C=c)

    k_train_scores, k_val_scores = list(), list()

    for train_index, val_index in kf.split(X):
        x_train, y_train = X[train_index], y[train_index]
        x_val, y_val = X[val_index], y[val_index]
        log_model.fit(x_train, y_train)

        k_train_scores.append(log_model.score(x_train, y_train))
        k_val_scores.append(log_model.score(x_val, y_val))

    # fold-averaged scores for this C
    mean_train_score = np.mean(k_train_scores)
    mean_val_score = np.mean(k_val_scores)

    # strict '>' keeps the earliest C when several tie
    if mean_val_score > best_score:
        best_score = mean_val_score
        best_c = c

    train_scores.append(mean_train_score)
    val_scores.append(mean_val_score)

    print('C: {:2f}   score: {:5f}'.format(c, mean_val_score))

print('best C:', best_c)

# plot accuracy
plt.plot(c_grid, train_scores, c='r', label='train')
plt.plot(c_grid, val_scores, label='validation')
plt.xscale('log')
plt.title('Model Accuracy vs Inverse Regularization Penalty, C')
plt.xlabel('inverse regularization penalty, C')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# find best tol
# Sweep the solver stopping tolerance for the liblinear / L1 logistic
# regression, holding C at best_c, and remember the tolerance with the highest
# mean validation score.
tol_grid = np.logspace(-8, 4, 50)

train_scores, val_scores = list(), list()
best_tol, best_score = 10**-8, 0

kf = KFold(shuffle=True)
for t in tol_grid:
    log_model = linear_model.LogisticRegression(solver='liblinear', penalty='l1', max_iter=500, C=best_c, tol=t)

    k_train_scores, k_val_scores = list(), list()

    for train_index, val_index in kf.split(X):
        x_train, y_train = X[train_index], y[train_index]
        x_val, y_val = X[val_index], y[val_index]
        log_model.fit(x_train, y_train)

        k_train_scores.append(log_model.score(x_train, y_train))
        k_val_scores.append(log_model.score(x_val, y_val))

    # fold-averaged scores for this tolerance
    mean_train_score = np.mean(k_train_scores)
    mean_val_score = np.mean(k_val_scores)

    # strict '>' keeps the earliest tolerance when several tie
    if mean_val_score > best_score:
        best_score = mean_val_score
        best_tol = t

    train_scores.append(mean_train_score)
    val_scores.append(mean_val_score)

    print('tol: {:8f}   score: {:5f}'.format(t, mean_val_score))

print('best tol:', best_tol)

# plot tolerance
plt.plot(tol_grid, train_scores, c='r', label='train')
plt.plot(tol_grid, val_scores, label='validation')
plt.xscale('log')
plt.title('Model Accuracy vs Tolerance')
plt.xlabel('tolerance')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# best model details
# 10-fold shuffled cross-validation of the liblinear / L1 model at
# C=best_c and tol=best_tol.

log_model = linear_model.LogisticRegression(solver='liblinear', penalty='l1', max_iter=500, C=best_c, tol=best_tol)
kf = KFold(shuffle=True, n_splits=10)

k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    log_model.fit(x_train, y_train)

    fold_train_score = log_model.score(x_train, y_train)
    fold_val_score = log_model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))


## MLP

In [None]:
# Randomized hyperparameter search for the MLP: 16 sampled settings drawn from
# the candidate values below.
alpha_candidates = list(np.logspace(-3, 2, 25))
tol_candidates = [10**exponent for exponent in range(-8, 3)]

parameters = dict(
    solver=['adam', 'lbfgs'],
    activation=['logistic', 'relu'],
    alpha=alpha_candidates,
    tol=tol_candidates,
)

mlp = RandomizedSearchCV(estimator=MLPClassifier(), param_distributions=parameters, n_iter=16)
mlp.fit(X, y)

# Left as the cell's final expression so the notebook renders the results table.
pd.DataFrame(mlp.cv_results_)

### Default logistic
Performs poorly -- roughly 15% worse than the ReLU or identity activations

In [None]:
# 10-fold shuffled cross-validation of an MLP with the logistic activation
# (every other hyperparameter left at its default).
model = MLPClassifier(activation='logistic')
kf = KFold(shuffle=True, n_splits=10)

k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    model.fit(x_train, y_train)

    fold_train_score = model.score(x_train, y_train)
    fold_val_score = model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))


### Default ReLU

In [None]:
# 10-fold shuffled cross-validation of an MLP with the ReLU activation
# (every other hyperparameter left at its default).
model = MLPClassifier(activation='relu')
kf = KFold(shuffle=True, n_splits=10)

k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    model.fit(x_train, y_train)

    fold_train_score = model.score(x_train, y_train)
    fold_val_score = model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))


### Default identity

In [None]:
# 10-fold shuffled cross-validation of an MLP with the identity activation
# (every other hyperparameter left at its default).
model = MLPClassifier(activation='identity')
kf = KFold(shuffle=True, n_splits=10)

k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    model.fit(x_train, y_train)

    fold_train_score = model.score(x_train, y_train)
    fold_val_score = model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))


### LBFGS Logistic

In [None]:
# 10-fold shuffled cross-validation of an MLP trained with the lbfgs solver
# and the logistic activation.
model = MLPClassifier(solver='lbfgs', activation='logistic')
kf = KFold(shuffle=True, n_splits=10)

k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    model.fit(x_train, y_train)

    fold_train_score = model.score(x_train, y_train)
    fold_val_score = model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))


### LBFGS ReLU

In [None]:
# 10-fold shuffled cross-validation of an MLP trained with the lbfgs solver
# and the ReLU activation.
model = MLPClassifier(solver='lbfgs', activation='relu')
kf = KFold(shuffle=True, n_splits=10)

k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    model.fit(x_train, y_train)

    fold_train_score = model.score(x_train, y_train)
    fold_val_score = model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))


### LBFGS identity

In [None]:
# 10-fold shuffled cross-validation of an MLP trained with the lbfgs solver
# and the identity activation.
model = MLPClassifier(solver='lbfgs', activation='identity')
kf = KFold(shuffle=True, n_splits=10)

k_train_scores, k_val_scores = list(), list()

for train_index, val_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]
    model.fit(x_train, y_train)

    fold_train_score = model.score(x_train, y_train)
    fold_val_score = model.score(x_val, y_val)
    k_train_scores.append(fold_train_score)
    k_val_scores.append(fold_val_score)

    print('   TRAIN -- score: {:5f}'.format(fold_train_score), end='')
    print('      VALID -- score: {:5f}'.format(fold_val_score))

# summary over all folds
mean_train_score, mean_val_score = np.mean(k_train_scores), np.mean(k_val_scores)
print('TRAIN -- score: {:5f}'.format(mean_train_score), end='')
print('      VALID -- score: {:5f}'.format(mean_val_score))
print('std:', np.std(k_val_scores))


In [None]:
# find best alpha
# Sweep the MLP regularisation strength alpha (lbfgs solver, ReLU activation)
# over a log-spaced grid and remember the value with the highest mean
# validation score.
alphas = np.logspace(-8, 2, 20)

train_scores = list()
val_scores = list()

best_alpha = 10**-8
best_score = 0

kf = KFold(shuffle=True)
for a in alphas:
    model = MLPClassifier(solver='lbfgs', activation='relu', alpha=a)

    k_train_scores = list()
    k_val_scores = list()

    for train_index, val_index in kf.split(X):
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        model.fit(x_train, y_train)

        k_train_scores.append(model.score(x_train, y_train))
        k_val_scores.append(model.score(x_val, y_val))

    # fold-averaged scores for this alpha
    mean_train_score = np.mean(k_train_scores)
    mean_val_score = np.mean(k_val_scores)

    # strict '>' keeps the earliest alpha when several tie
    if mean_val_score > best_score:
        best_alpha = a
        best_score = mean_val_score

    train_scores.append(mean_train_score)
    val_scores.append(mean_val_score)

    # report the loop variable `a`; no name `alpha` is defined in this cell
    print('alpha: {:2f}   score: {:5f}'.format(a, mean_val_score))

print('best alpha:', best_alpha)

# plot accuracy (x axis is the same grid the loop iterated over)
plt.title('Model Accuracy vs Regularization Penalty, alpha')
plt.xlabel('regularization penalty alpha')
plt.ylabel('accuracy')
plt.plot(alphas, train_scores, c='r', label='train')
plt.plot(alphas, val_scores, label='validation')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# find best tol
# Sweep the MLP stopping tolerance (lbfgs solver, ReLU activation, alpha fixed
# at best_alpha) and remember the value with the highest mean validation score.
# One grid is shared by the loop and the plot so the x values and the score
# lists always have the same length.
tols = np.logspace(-8, 4, 20)

train_scores = list()
val_scores = list()

best_tol = 10**-8
best_score = 0

kf = KFold(shuffle=True)
for t in tols:
    model = MLPClassifier(solver='lbfgs', activation='relu', alpha=best_alpha, tol=t)

    k_train_scores = list()
    k_val_scores = list()

    for train_index, val_index in kf.split(X):
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        model.fit(x_train, y_train)

        k_train_scores.append(model.score(x_train, y_train))
        k_val_scores.append(model.score(x_val, y_val))

    # fold-averaged scores for this tolerance
    mean_train_score = np.mean(k_train_scores)
    mean_val_score = np.mean(k_val_scores)

    # strict '>' keeps the earliest tolerance when several tie
    if mean_val_score > best_score:
        best_tol = t
        best_score = mean_val_score

    train_scores.append(mean_train_score)
    val_scores.append(mean_val_score)

    print('tol: {:8f}   score: {:5f}'.format(t, mean_val_score))

print('best tol:', best_tol)

# plot tolerance -- plotting against a 50-point grid would not match the
# 20 scores collected above
plt.title('Model Accuracy vs Tolerance')
plt.xlabel('tolerance')
plt.ylabel('accuracy')
plt.plot(tols, train_scores, c='r', label='train')
plt.plot(tols, val_scores, label='validation')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# plot tolerance
# Draw the train / validation score lists currently in train_scores and
# val_scores against a 20-point log-spaced tolerance axis.
tol_axis = np.logspace(-8, 4, 20)
plt.plot(tol_axis, train_scores, c='r', label='train')
plt.plot(tol_axis, val_scores, label='validation')
plt.xscale('log')
plt.title('Model Accuracy vs Tolerance')
plt.xlabel('tolerance')
plt.ylabel('accuracy')
plt.legend()
plt.show()

### ADAM ReLU

In [None]:
# find best alpha
# Sweep the MLP regularisation strength alpha (adam solver, ReLU activation)
# and remember the value with the highest mean validation score.
train_scores, val_scores = list(), list()
best_alpha, best_score = 10**-3, 0

kf = KFold(shuffle=True)

for a in np.logspace(-3, 3, 15):
    model = MLPClassifier(solver='adam', activation='relu', alpha=a)

    k_train_scores, k_val_scores = list(), list()

    for train_index, val_index in kf.split(X):
        x_train, y_train = X[train_index], y[train_index]
        x_val, y_val = X[val_index], y[val_index]
        model.fit(x_train, y_train)

        k_train_scores.append(model.score(x_train, y_train))
        k_val_scores.append(model.score(x_val, y_val))

    # fold-averaged scores for this alpha
    mean_train_score = np.mean(k_train_scores)
    mean_val_score = np.mean(k_val_scores)

    print('   alpha: {:4f}   TRAIN -- score: {:5f}'.format(a, mean_train_score), end='')
    print('      VALID -- score: {:5f}'.format(mean_val_score))
    print('   std:', np.std(k_val_scores))

    # strict '>' keeps the earliest alpha when several tie
    if mean_val_score > best_score:
        best_score = mean_val_score
        best_alpha = a

    train_scores.append(mean_train_score)
    val_scores.append(mean_val_score)

# NOTE(review): the scores and std printed below are taken over every alpha
# tried, not just the best one.
print('best alpha: {:4f}   TRAIN -- score: {:5f}'.format(best_alpha, np.mean(train_scores)), end='')
print('      VALID -- score: {:5f}'.format(np.mean(val_scores)))
print('std:', np.std(val_scores))


In [None]:
# find best tol
# Sweep the MLP stopping tolerance (adam solver, ReLU activation, alpha fixed
# at best_alpha) and remember the value with the highest mean validation score.
train_scores, val_scores = list(), list()
best_tol, best_score = 10**-8, 0

kf = KFold(shuffle=True)

for t in np.logspace(-12, 4, 15):
    model = MLPClassifier(solver='adam', activation='relu', alpha=best_alpha, tol=t)

    k_train_scores, k_val_scores = list(), list()

    for train_index, val_index in kf.split(X):
        x_train, y_train = X[train_index], y[train_index]
        x_val, y_val = X[val_index], y[val_index]
        model.fit(x_train, y_train)

        k_train_scores.append(model.score(x_train, y_train))
        k_val_scores.append(model.score(x_val, y_val))

    # fold-averaged scores for this tolerance
    mean_train_score = np.mean(k_train_scores)
    mean_val_score = np.mean(k_val_scores)

    print('   tol: {:4f}   TRAIN -- score: {:5f}'.format(t, mean_train_score), end='')
    print('      VALID -- score: {:5f}'.format(mean_val_score))
    print('   std:', np.std(k_val_scores))

    # strict '>' keeps the earliest tolerance when several tie
    if mean_val_score > best_score:
        best_score = mean_val_score
        best_tol = t

    train_scores.append(mean_train_score)
    val_scores.append(mean_val_score)

# NOTE(review): the scores and std printed below are taken over every
# tolerance tried, not just the best one.
print('best tol: {:4f}   TRAIN -- score: {:5f}'.format(best_tol, np.mean(train_scores)), end='')
print('      VALID -- score: {:5f}'.format(np.mean(val_scores)))
print('std:', np.std(val_scores))


In [None]:
# # find best tolerance
# train_scores = list()
# train_aucs = list()
# train_losses = list()

# val_scores = list()
# val_aucs = list()
# val_losses = list()

# best_tol = 10**-8
# best_loss = 1000

# tols = list()
# for i in range(-12, 4, 1):
#     tols.append(10**i)


# kf = KFold(shuffle=True)
# for t in tols:
#     model = MLPClassifier(activation='relu', solver='adam', alpha=best_alpha, tol=t)
    
#     k_train_scores = list()
#     k_train_aucs = list()
#     k_train_losses = list()
    
#     k_val_scores = list()
#     k_val_aucs = list()
#     k_val_losses = list()
    
#     for train_index, val_index in kf.split(X):
#         x_train, x_val = X[train_index], X[val_index]
#         y_train, y_val = y[train_index], y[val_index]
#         model.fit(x_train, y_train)
        
#         train_probas = model.predict_proba(x_train)[:,1]
#         val_probas = model.predict_proba(x_val)[:,1]
                
#         k_train_scores.append(model.score(x_train, y_train))
#         k_train_aucs.append(roc_auc_score(y_train, train_probas))
#         k_train_losses.append(log_loss(y_train, train_probas))

#         k_val_scores.append(model.score(x_val, y_val))
#         k_val_aucs.append(roc_auc_score(y_val, val_probas))
#         k_val_losses.append(log_loss(y_val, val_probas))

#     if np.mean(k_val_losses) < best_loss:
#         best_tol = t
#         best_loss = np.mean(k_val_losses)

#     train_scores.append(np.mean(k_train_scores))
#     train_aucs.append(np.mean(k_train_aucs))
#     train_losses.append(np.mean(k_train_losses))

#     val_scores.append(np.mean(k_val_scores))
#     val_aucs.append(np.mean(k_val_aucs))
#     val_losses.append(np.mean(k_val_losses))

#     print('tol: {}   score: {:5f}   auc: {:5f}   loss: {:5f}'.format(
#         t,
#         np.mean(k_val_scores), 
#         np.mean(k_val_aucs),
#         np.mean(k_val_losses)))

# print('best tol:', best_tol)

# # plot accuracy vs tolerance
# plt.title('Model Accuracy vs Tolerance')
# plt.xlabel('tolerance (1e-tol)')
# plt.ylabel('accuracy')
# plt.plot(tols, train_scores, c='r', label='train')
# plt.plot(tols, val_scores, label='validation')
# plt.xscale('log')
# plt.legend()
# plt.show()

In [None]:
# # get details of best model
# model = MLPClassifier(activation='relu', solver='adam', alpha=best_alpha, tol=best_tol)
    
# k_train_scores = list()
# k_train_aucs = list()
# k_train_losses = list()

# k_val_scores = list()
# k_val_aucs = list()
# k_val_losses = list()

# for train_index, val_index in kf.split(X):
#     x_train, x_val = X[train_index], X[val_index]
#     y_train, y_val = y[train_index], y[val_index]
#     model.fit(x_train, y_train)
    
#     train_probas = model.predict_proba(x_train)[:,1]
#     val_probas = model.predict_proba(x_val)[:,1]
            
#     k_train_scores.append(model.score(x_train, y_train))
#     k_train_aucs.append(roc_auc_score(y_train, train_probas))
#     k_train_losses.append(log_loss(y_train, train_probas))

#     k_val_scores.append(model.score(x_val, y_val))
#     k_val_aucs.append(roc_auc_score(y_val, val_probas))
#     k_val_losses.append(log_loss(y_val, val_probas))

# print('TRAIN -- auc: {:5f}   loss: {:5f}   score: {:5f}'.format(
#     np.mean(k_train_aucs),
#     np.mean(k_train_losses),
#     np.mean(k_train_scores)))

# print('VALID -- auc: {:5f}   loss: {:5f}   score: {:5f}'.format(
#     np.mean(k_val_aucs),
#     np.mean(k_val_losses),
#     np.mean(k_val_scores)))

### Grid search other models 

In [None]:
# parameters = {
#     'solver': ['lbfgs', 'adam'],
#     'alpha': list(np.logspace(-3, 3, 20)),
#     'activation': ['identity', 'relu'],
#     'tol': list(range(-8, 1, 1))
# }

# mlp = GridSearchCV(MLPClassifier(), parameters)
# mlp.fit(X, y)

# pd.DataFrame(mlp.cv_results_)

## Decision Tree Classifier 