In [1]:
import sys, os
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn import preprocessing as pp
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, r2_score, make_scorer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Route every call to warnings.warn in this process to the no-op above.
warnings.warn = warn

In [2]:

# ---- heuristic settings ----
# Recursive feature elimination (passed to RFECV in build_and_train_model)
rfe_min_features = 12   # min_features_to_select
rfe_step = 15           # step
rfe_cv = 20             # cv

# StratifiedShuffleSplit: how many train/validation splits, and the validation share
sss_n_splits = 20
sss_test_size = 0.35

noise_std = 0.01        # std-dev of the Gaussian noise added to train_X after scaling
r2_threshold = 0.185    # a split's model is kept only if its validation R2 exceeds this
random_seed = 213       # shared seed for numpy, Lasso and the splitter
gridsearchCV = 20       # cv argument of GridSearchCV

np.random.seed(random_seed)

# ---- load data ----
# Training set: keep the label separately, then drop the label and id columns.
train = pd.read_csv("/Users/JoonH/dont-overfit-ii/train.csv")
train_y = train['target']
train = train.drop(columns=['id','target'])

# Test set: only the id column is dropped.
test = pd.read_csv("/Users/JoonH/dont-overfit-ii/test.csv")
test = test.drop(columns=['id'])

In [43]:
# Discretise every feature into 100 bins with pd.cut (labels=False yields the
# integer bin index).  Train and test are stacked first so that the bin edges
# are computed over both sets together.
data = pd.concat((train, test), axis = 0)

for col in train.columns:
    data[col] = pd.cut(data[col], 100, labels = False)

# Split back at the actual number of training rows instead of a hard-coded 250,
# so the cell stays correct if the training file changes size.
train_X = data[:len(train)]
test = data[len(train):]

In [44]:
# Swap both frames for their underlying numpy arrays (positional indexing is used from here on).
train_X, test = train_X.values, test.values

In [45]:
# Scale the features with RobustScaler: fitted on train_X only, then applied to test.
#
# Author's note on the alternative kept below (disabled):
# fitting scaler on full data outperforms fitting on test_X only (+0.006 kaggle score)
#data = pp.RobustScaler().fit_transform(np.concatenate((train_X, test), axis=0))
#train_X = data[:250]
#test = data[250:]

scaler = pp.RobustScaler()
train_X = scaler.fit_transform(train_X)
test = scaler.transform(test)

# Jitter the training features with zero-mean Gaussian noise to reduce overfitting.
train_noise = np.random.normal(loc=0, scale=noise_std, size=train_X.shape)
train_X = train_X + train_noise

# ROC-AUC metric that falls back to chance level (0.5) when the score is
# undefined for the data it is given (e.g. a tiny CV fold with only one class).
def scoring_roc_auc(y, y_pred):
    """Return ``roc_auc_score(y, y_pred)``, or 0.5 when the AUC cannot be computed.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted scores.

    Returns
    -------
    float
        The ROC-AUC, or 0.5 if ``roc_auc_score`` rejects the input with a ValueError.
    """
    try:
        return roc_auc_score(y, y_pred)
    except ValueError:
        # Only the "score is undefined" failure is mapped to chance level.
        # Anything else (KeyboardInterrupt, NameError, ...) must propagate
        # instead of being silently reported as AUC = 0.5.
        return 0.5

robust_roc_auc = make_scorer(scoring_roc_auc)


def build_and_train_model(model = 'lasso'):
    """Train one model per stratified split and collect the accepted models' test predictions.

    For every split of the module-level ``train_X`` / ``train_y``:

    1. RFECV selects features on the training part of the split.
    2. GridSearchCV tunes the estimator on the selected features.
    3. The tuned estimator is scored on the held-out validation part.
    4. If the validation R2 exceeds ``r2_threshold``, its prediction on the
       module-level ``test`` array is kept for the ensemble.

    A table with one row per split is printed while training.

    Parameters
    ----------
    model : str
        ``'lasso'`` (default) or ``'RandomForest'``.

    Returns
    -------
    pandas.DataFrame
        One column per accepted model (each column is labelled ``0``) and one
        row per row of ``test``.  Empty if no model passed the threshold.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # Resolve estimator and grid first so an unsupported name fails immediately
    # rather than as a NameError on ``param_grid`` deep inside the loop.
    if model == 'lasso':
        estimator = Lasso(alpha=0.031, tol=0.01, random_state=random_seed, selection='random')
        param_grid = {
                    'alpha' :  [0.019, 0.02, 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, 0.029, 0.031],
                    'tol'   :  [0.001, 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017],
                    'max_iter':[500, 750, 1000, 1250, 1500, 1750, 2000]
        }
    elif model == 'RandomForest':
        # Seeded like the Lasso so that repeated runs build the same forest.
        estimator = RandomForestClassifier(random_state=random_seed)
        param_grid = {
            'n_estimators':[10],
            'max_depth': [None]
        }
    else:
        raise ValueError("Unknown model {!r}; expected 'lasso' or 'RandomForest'.".format(model))

    # define recursive elimination feature selector
    feature_selector = RFECV(estimator, min_features_to_select=rfe_min_features, scoring=robust_roc_auc, step=rfe_step, verbose=0, cv=rfe_cv, n_jobs=-1)
    print("counter | val_mse  |  val_mae  |  val_roc  |  val_cos  |  val_dist  |  val_r2    | feature_count ")
    print("-------------------------------------------------------------------------------------------------")

    accepted_predictions = []   # one single-column DataFrame per accepted model
    accepted_roc_scores = []    # validation ROC-AUC of each accepted model

    # split training data to build one model on each training-data subset
    splitter = StratifiedShuffleSplit(n_splits=sss_n_splits, test_size=sss_test_size, random_state=random_seed)
    for counter, (train_index, val_index) in enumerate(splitter.split(train_X, train_y)):
        X, val_X = train_X[train_index], train_X[val_index]
        # The splitter yields positions, so index the label Series positionally.
        y, val_y = train_y.iloc[train_index], train_y.iloc[val_index]

        # get the best features for this data set
        feature_selector.fit(X, y)
        # remove irrelevant features from X, val_X and test
        X_important_features        = feature_selector.transform(X)
        val_X_important_features    = feature_selector.transform(val_X)
        test_important_features     = feature_selector.transform(test)

        # run grid search to find the best parameters for this subset of training data and subset of features
        grid_search = GridSearchCV(feature_selector.estimator_, param_grid=param_grid, verbose=0, n_jobs=-1, scoring=robust_roc_auc, cv=gridsearchCV)
        grid_search.fit(X_important_features, y)

        # score our fitted model on validation data
        val_y_pred = grid_search.best_estimator_.predict(val_X_important_features)
        val_mse = mean_squared_error(val_y, val_y_pred)
        val_mae = mean_absolute_error(val_y, val_y_pred)
        val_roc = roc_auc_score(val_y, val_y_pred)
        val_cos = cosine_similarity(val_y.values.reshape(1, -1), val_y_pred.reshape(1, -1))[0][0]
        val_dst = euclidean_distances(val_y.values.reshape(1, -1), val_y_pred.reshape(1, -1))[0][0]
        val_r2  = r2_score(val_y, val_y_pred)

        # if model did well on validation, save its prediction on test data, using only important features
        # r2_threshold is a heuristic threshold on the validation R2
        # you can use any other metric/metric combination that works for you
        if val_r2 > r2_threshold:
            message = '<-- OK'
            prediction = grid_search.best_estimator_.predict(test_important_features)
            accepted_predictions.append(pd.DataFrame(prediction))
            accepted_roc_scores.append(val_roc)
        else:
            message = '<-- skipping'

        print("{0:2}      | {1:.4f}   |  {2:.4f}   |  {3:.4f}   |  {4:.4f}   |  {5:.4f}    |  {6:.4f}    |  {7:3}         {8}  ".format(counter, val_mse, val_mae, val_roc, val_cos, val_dst, val_r2, feature_selector.n_features_, message))

    print("-------------------------------------------------------------------------------------------------")
    print("{}/{} models passed validation threshold and will be ensembled.".format(len(accepted_predictions), sss_n_splits))

    if not accepted_predictions:
        # Nothing to average: report it instead of dividing by zero after all the training work.
        print("Average ROC score: n/a (no model passed the validation threshold)")
        return pd.DataFrame()

    print("Average ROC score: " + str(sum(accepted_roc_scores) / len(accepted_roc_scores)))
    # Concatenate once at the end instead of growing the frame inside the loop.
    return pd.concat(accepted_predictions, axis=1)


In [46]:
# Ensemble: average the accepted models' test predictions row by row.
predictions = build_and_train_model()
mean_pred = pd.DataFrame({'target': predictions.mean(axis=1)})
# Shift the row index by 250 before it is written out as the 'id' column.
mean_pred.index = mean_pred.index + 250
mean_pred.to_csv('/Users/JoonH/DO2_lasso_kernel_bin_submission.csv', index_label='id', index=True)
#LB score: 0.868

counter | val_mse  |  val_mae  |  val_roc  |  val_cos  |  val_dist  |  val_r2    | feature_count 
-------------------------------------------------------------------------------------------------
 0      | 0.1820   |  0.3764   |  0.8019   |  0.8471   |  4.0018    |  0.2136    |   12         <-- OK  
 1      | 0.1896   |  0.3877   |  0.7623   |  0.8381   |  4.0848    |  0.1806    |  255         <-- skipping  
 2      | 0.2110   |  0.3939   |  0.7143   |  0.8247   |  4.3094    |  0.0880    |   12         <-- skipping  
 3      | 0.1806   |  0.3724   |  0.7829   |  0.8466   |  3.9861    |  0.2197    |  105         <-- OK  
 4      | 0.1837   |  0.3664   |  0.7846   |  0.8447   |  4.0210    |  0.2060    |  285         <-- OK  
 5      | 0.1878   |  0.3647   |  0.7673   |  0.8408   |  4.0655    |  0.1884    |   45         <-- OK  
 6      | 0.2084   |  0.3852   |  0.6970   |  0.8222   |  4.2822    |  0.0995    |   90         <-- skipping  
 7      | 0.1658   |  0.3627   |  0.8287   |  0.862