# Improving our logistic regression model for a better blend

We saw that blending even our initial logistic regression model (0.849) with a lasso model (0.868) improves the score to 0.869! Let's see whether improving the logistic regression score on its own helps the blend further.


Hence, let's try a Bayesian approach to logistic regression.

In [1]:
import sys, os
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, r2_score, make_scorer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import theano.tensor as t
from scipy.stats import mode
import pymc3 as pm

import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; a stand-in for ``warnings.warn``."""
    return None


# Rebind the stdlib attribute to the no-op above, so any later call made
# through ``warnings.warn(...)`` is silently discarded.
warnings.warn = warn

  from ._conv import register_converters as _register_converters


In [56]:
# Load the competition data.
# NOTE(review): machine-specific absolute path; point DATA_DIR at your local copy.
DATA_DIR = "/Users/JoonH/dont-overfit-ii"
train = pd.read_csv(DATA_DIR + "/train.csv")
train_y = train['target'].astype(int)
train_X = train.drop(['id','target'], axis=1).values

test_df = pd.read_csv(DATA_DIR + "/test.csv")
test = test_df.drop(['id'], axis=1).values

# Scale with RobustScaler, fitted on the train and test rows stacked together.
# Author's note: fitting the scaler on the full data outperformed fitting it on
# the training rows only (+0.006 Kaggle score).
# The split point is taken from the data rather than hard-coded as 250, so the
# cell stays correct if the training set size changes.
n_train = len(train_X)
data = RobustScaler().fit_transform(np.concatenate((train_X, test), axis=0))
train_X = data[:n_train]
test = data[n_train:]

In [57]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the Boruta ranking, and the feature subset derived from it, is
# reproducible after a kernel restart (the original run was unseeded).
BORUTA_SEED = 42

rfc = RandomForestClassifier(n_estimators = 200, n_jobs = 4, class_weight = 'balanced', max_depth=5,
                             random_state = BORUTA_SEED)
# NOTE(review): max_iter = 5 is a very short Boruta run; consider raising it if
# runtime allows so that more features can be decided rather than left tentative.
boruta_selector = BorutaPy(rfc, n_estimators = 'auto', verbose = 0, max_iter = 5,
                           random_state = BORUTA_SEED)
boruta_selector.fit(train_X,train_y)

BorutaPy(alpha=0.05,
     estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=489, n_jobs=4, oob_score=False,
            random_state=<mtrand.RandomState object at 0x00000208657F5750>,
            verbose=0, warm_start=False),
     max_iter=5, n_estimators='auto', perc=100,
     random_state=<mtrand.RandomState object at 0x00000208657F5750>,
     two_step=True, verbose=0)

In [58]:
# Pair every feature column name with its Boruta rank, then order the table so
# the best-ranked (lowest rank number) features come first.
feature_names = train.drop(['id','target'],axis=1).columns.tolist()
feature_df = (
    pd.DataFrame({'features': feature_names, 'rank': boruta_selector.ranking_})
    .sort_values('rank', ascending=True)
    .reset_index(drop=True)
)
feature_df.head(3)

Unnamed: 0,features,rank
0,33,2
1,217,2
2,65,2


In [59]:
# Keep the 80 best-ranked features (feature_df is sorted by rank, best first).
# The previous comment said "top 50", which did not match the code.
N_FEATURES_TO_KEEP = 80
columns_to_keep = feature_df.features.iloc[:N_FEATURES_TO_KEEP]

In [60]:
# Restrict both the training frame and the test frame to the selected columns.
boruta_train, boruta_test = (frame[columns_to_keep] for frame in (train, test_df))
boruta_train.head()

Unnamed: 0,33,217,65,117,91,295,80,24,82,194,...,215,113,45,9,53,290,0,71,15,90
0,0.385,1.187,-0.77,0.71,0.019,-2.097,1.851,1.763,-0.38,-0.226,...,0.817,0.405,0.833,1.825,1.238,0.867,-0.098,-0.974,0.417,-1.675
1,-2.721,0.216,1.221,0.987,1.188,-1.624,-0.759,-1.519,0.406,0.083,...,0.329,-0.178,-1.102,-0.291,0.66,-0.165,1.081,-0.813,1.133,-0.03
2,0.924,0.269,0.943,-0.384,0.269,-1.165,0.758,1.786,-0.101,-0.5,...,0.307,-0.275,0.972,0.183,-0.714,0.013,-0.523,0.409,0.3,0.696
3,0.394,0.066,-0.706,-0.152,1.103,0.467,0.03,0.365,1.848,-0.902,...,1.41,0.589,-0.861,0.274,0.57,-0.404,0.067,0.205,-1.442,-0.604
4,0.037,0.11,0.357,1.027,0.892,1.378,-0.187,0.024,-0.054,0.138,...,1.96,0.011,0.344,2.198,-0.312,0.898,2.347,0.732,0.27,-1.906


In [61]:
# Re-fit RobustScaler on the selected features only, again on the train and
# test rows stacked together (author's note: fitting on the full data gave
# +0.006 Kaggle score over fitting on the training rows alone).
# The split point comes from the data instead of a hard-coded 250.
n_train = len(boruta_train)
data = RobustScaler().fit_transform(np.concatenate((boruta_train, boruta_test), axis=0))
train_X = data[:n_train]
test = data[n_train:]

# Add a bit of Gaussian noise to train_X to reduce overfitting. A dedicated,
# seeded generator makes the perturbation reproducible without touching NumPy's
# global random state.
NOISE_SEED = 42
train_X += np.random.RandomState(NOISE_SEED).normal(0, 0.01, train_X.shape)

In [62]:
import pystan

In [63]:
# Bayesian logistic regression in Stan, adapted from
# https://www.kaggle.com/gkoundry/bayesian-logistic-regression-with-pystan
#
# Changes relative to the earlier version of this cell:
#   * data declarations carry bounds, so Stan rejects malformed input
#     (negative sizes, non-binary labels) up front;
#   * the prior on beta uses the vectorised form instead of an element loop
#     (same prior on every coefficient).
code = """
data {
  int<lower=0> N;               // the number of training observations
  int<lower=0> N2;              // the number of test observations
  int<lower=0> K;               // the number of features
  int<lower=0, upper=1> y[N];   // the binary response
  matrix[N, K] X;               // the model matrix
  matrix[N2, K] new_X;          // the matrix for the predicted values
}
parameters {
  real alpha;                   // intercept
  vector[K] beta;               // the regression parameters
}
transformed parameters {
  vector[N] linpred;
  linpred = alpha + X * beta;
}
model {
  alpha ~ cauchy(0, 10);        // prior for the intercept following Gelman 2008
  beta ~ student_t(1, 0, 0.03); // same prior on every coefficient
  y ~ bernoulli_logit(linpred);
}
generated quantities {
  vector[N2] y_pred;
  y_pred = alpha + new_X * beta; // linear predictor (log-odds) for the test rows
}
"""

# Inputs for the Stan model. The dimensions are read off the arrays themselves
# (instead of the literals 250 / 19750 / 80) so they cannot drift out of sync
# with the feature-selection and scaling steps.
data = {
    'N': train_X.shape[0],
    'N2': test.shape[0],
    'K': train_X.shape[1],
    'y': train_y,
    'X': train_X,
    'new_X': test,
}

n_itr = 3000     # iterations per chain (PyStan 2 counts warm-up inside `iter`)
n_warmup = 1000  # warm-up iterations per chain
STAN_SEED = 42   # fixed so the posterior draws, and the submission, are reproducible

sm = pystan.StanModel(model_code = code)
fit = sm.sampling(data = data, iter = n_itr, warmup = n_warmup, seed = STAN_SEED)
ex = fit.extract(permuted = True)



INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3ff3dda648840fef960ca7e40a266ea9 NOW.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)


In [64]:
import scipy
from scipy.stats import bernoulli

In [65]:
def logit_to_prob(logit):
    """Map log-odds to probabilities with the logistic function.

    Evaluates ``1 / (1 + exp(-logit))`` as ``exp(-log(1 + exp(-logit)))`` via
    ``np.logaddexp``. Large-magnitude inputs therefore saturate cleanly at 0
    or 1; the naive ``exp(x) / (1 + exp(x))`` form overflows to ``inf / inf``
    and returns ``nan`` for large positive log-odds.

    Parameters
    ----------
    logit : scalar or array-like
        Log-odds value(s).

    Returns
    -------
    numpy scalar or ndarray
        Probabilities in [0, 1], with the same shape as ``logit``.
    """
    logit = np.asarray(logit)
    # logaddexp(0, -x) == log(1 + exp(-x)), computed without overflow.
    return np.exp(-np.logaddexp(0, -logit))

In [66]:
# Turn every sampled log-odds value into a probability first, then average over
# axis 0 (presumably the posterior-draw axis of the permuted extract; verify)
# to get one prediction per test row. This averages in probability space, not
# in logit space.
target = logit_to_prob(ex['y_pred']).mean(axis = 0)
ids = test_df['id']
df = pd.DataFrame({'id': ids, 'target' : target})
df.to_csv("/Users/JoonH/DO2_pystan_log.csv", columns = ['id', 'target'], index = False)

This gives us a leaderboard (LB) score of 0.858.

In [None]:
# References:
# https://www.kaggle.com/gkoundry/bayesian-logistic-regression-with-pystan/log
# https://barnesanalytics.com/bayesian-logistic-regression-in-python-using-pymc3