# Improving our log reg model for better improvement

So we saw even blending a log reg model initially at 0.849 with lasso model of 0.868 improves at 0.869! We will see if improving our log reg score will help towards that. 

In [1]:
import sys, os
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, r2_score, make_scorer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [2]:
# heuristics
rfe_min_features = 12
rfe_step = 5
rfe_cv = 25
sss_n_splits = 25
sss_test_size = 0.35
grid_search_cv = 25
noise_std = 0.01
r2_threshold = 0.185
random_seed = 213
np.random.seed(random_seed)

In [3]:
# import data
train = pd.read_csv("/Users/JoonH/dont-overfit-ii/train.csv")
train_y = train['target']
train_X = train.drop(['id','target'], axis=1).values

test = pd.read_csv("/Users/JoonH/dont-overfit-ii/test.csv")
test = test.drop(['id'], axis=1).values

# scale using RobustScaler
# fitting scaler on full data outperforms fitting on test_X only (+0.006 kaggle score)
#data = RobustScaler().fit_transform(np.concatenate((train_X, test), axis=0))
data = StandardScaler().fit_transform(np.concatenate((train_X, test), axis=0))
train_X = data[:250]
test = data[250:]

# add a bit of noise to train_X to reduce overfitting
train_X += np.random.normal(0, noise_std, train_X.shape)



In [24]:
# define roc_auc_metric robust to only one class in y_pred
def scoring_roc_auc(y, y_pred):
    try:
        return roc_auc_score(y, y_pred)
    except:
        return 0.5

robust_roc_auc = make_scorer(scoring_roc_auc)

# define model and its parameters
from sklearn.neighbors import KNeighborsClassifier

#model = LogisticRegression(penalty='l1', tol=0.0001, C=0.1, fit_intercept=False, intercept_scaling=1, class_weight='balanced', random_state=213, max_iter=1000)
model = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
param_grid = {
            'C' : [0.1],
            'tol'   : [0.0001],
            #'intercept_scaling' : [0.5,0.75,1,1.25,1.5]
            'class_weight' : ['balanced',None]
        }


# define recursive elimination feature selector
feature_selector = RFECV(model, min_features_to_select=rfe_min_features, scoring=robust_roc_auc, step=rfe_step, verbose=0, cv=rfe_cv, n_jobs=-1)

print("counter | val_mse  |  val_mae  |  val_roc  |  val_cos  |  val_dist  |  val_r2    | feature_count ")
print("-------------------------------------------------------------------------------------------------")

predictions = pd.DataFrame()
counter = 0
# split training data to build one model on each traing-data-subset
# TODO: turn this into a method to ensemble different models
for train_index, val_index in StratifiedShuffleSplit(n_splits=sss_n_splits, test_size=sss_test_size, random_state=random_seed).split(train_X, train_y):
    X, val_X = train_X[train_index], train_X[val_index]
    y, val_y = train_y[train_index], train_y[val_index]

    # get the best features for this data set
    feature_selector.fit(X, y)
    # remove irrelevant features from X, val_X and test
    X_important_features        = feature_selector.transform(X)
    val_X_important_features    = feature_selector.transform(val_X)
    test_important_features     = feature_selector.transform(test)

    # run grid search to find the best Lasso parameters for this subset of training data and subset of features 
    grid_search = GridSearchCV(feature_selector.estimator_, param_grid=param_grid, verbose=0, n_jobs=-1, scoring=robust_roc_auc, cv=20)
    grid_search.fit(X_important_features, y)

    # score our fitted model on validation data
    val_y_pred = grid_search.best_estimator_.predict(val_X_important_features)
    val_mse = mean_squared_error(val_y, val_y_pred)
    val_mae = mean_absolute_error(val_y, val_y_pred)
    val_roc = roc_auc_score(val_y, val_y_pred)
    val_cos = cosine_similarity(val_y.values.reshape(1, -1), val_y_pred.reshape(1, -1))[0][0]
    val_dst = euclidean_distances(val_y.values.reshape(1, -1), val_y_pred.reshape(1, -1))[0][0]
    val_r2  = r2_score(val_y, val_y_pred)

    # if model did well on validation, save its prediction on test data, using only important features
    # r2_threshold (0.185) is a heuristic threshold for r2 error
    # you can use any other metric/metric combination that works for you
    if val_r2 > r2_threshold:
        message = '<-- OK'
        prediction = grid_search.best_estimator_.predict(test_important_features)
        predictions = pd.concat([predictions, pd.DataFrame(prediction)], axis=1)
    else:
        message = '<-- skipping'


    print("{0:2}      | {1:.4f}   |  {2:.4f}   |  {3:.4f}   |  {4:.4f}   |  {5:.4f}    |  {6:.4f}    |  {7:3}         {8}  ".format(counter, val_mse, val_mae, val_roc, val_cos, val_dst, val_r2, feature_selector.n_features_, message))
    
    counter += 1

print("-------------------------------------------------------------------------------------------------")
print("{}/{} models passed validation threshold and will be ensembled.".format(len(predictions.columns), sss_n_splits))

mean_pred = pd.DataFrame(predictions.mean(axis=1))
mean_pred.index += 250
mean_pred.columns = ['target']
mean_pred.to_csv('dont_overfit2_log_robust.csv', index_label='id', index=True)        
       

counter | val_mse  |  val_mae  |  val_roc  |  val_cos  |  val_dist  |  val_r2    | feature_count 
-------------------------------------------------------------------------------------------------
 0      | 0.3182   |  0.3182   |  0.7299   |  0.7104   |  5.2915    |  -0.3750    |   15         <-- skipping  
 1      | 0.3750   |  0.3750   |  0.6920   |  0.6429   |  5.7446    |  -0.6205    |   20         <-- skipping  
 2      | 0.3068   |  0.3068   |  0.7054   |  0.7371   |  5.1962    |  -0.3259    |   15         <-- skipping  
 3      | 0.3182   |  0.3182   |  0.7165   |  0.7154   |  5.2915    |  -0.3750    |   20         <-- skipping  
 4      | 0.3750   |  0.3750   |  0.6451   |  0.6678   |  5.7446    |  -0.6205    |   25         <-- skipping  
 5      | 0.3182   |  0.3182   |  0.7031   |  0.7217   |  5.2915    |  -0.3750    |   15         <-- skipping  
 6      | 0.2841   |  0.2841   |  0.7299   |  0.7570   |  5.0000    |  -0.2277    |   25         <-- skipping  
 7      | 0.3182   |