# Improving our logistic regression model for a better blend

We saw that blending even our initial logistic regression model (0.849) with the lasso model (0.868) improves the score to 0.869! Let's see whether improving the logistic regression score itself helps push that further.


Hence, let's try applying Bayesian methods to logistic regression. To do so, we will try two different implementations: pymc3 and pystan.

# pystan

In [1]:
import sys, os
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, r2_score, make_scorer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import theano.tensor as t
from scipy.stats import mode

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accept any arguments, do nothing."""
    return None


# Monkey-patch the stdlib hook so later warnings.warn(...) calls are dropped.
warnings.warn = warn

In [2]:
# import data
train = pd.read_csv("/Users/JoonH/dont-overfit-ii/train.csv")
train_y = train['target'].astype(int)
train_X = train.drop(['id','target'], axis=1).values

test_df = pd.read_csv("/Users/JoonH/dont-overfit-ii/test.csv")
test = test_df.drop(['id'], axis=1).values

# Remember how many training rows there are so the jointly scaled matrix can
# be split back apart without hard-coding the row count.
n_train = train_X.shape[0]

# scale using RobustScaler
# fitting scaler on full data (train + test) outperforms fitting on test_X only (+0.006 kaggle score)
data = RobustScaler().fit_transform(np.concatenate((train_X, test), axis=0))
train_X = data[:n_train]
test = data[n_train:]
# add a bit of noise to train_X to reduce overfitting
# NOTE: no random seed is set in this cell, so the noise differs between runs.
train_X += np.random.normal(0, 0.01, train_X.shape)

In [3]:
import pystan

In [25]:
# Stan program for a Bayesian logistic regression, adapted from
# https://www.kaggle.com/gkoundry/bayesian-logistic-regression-with-pystan
#
# data:                 N training rows, N2 test rows, K features, integer
#                       response y, design matrices X (train) and new_X (test)
# parameters:           intercept alpha and coefficient vector beta
# model:                cauchy(0, 13) prior on alpha, student_t(1, 0, 0.03)
#                       prior on every beta[i], bernoulli_logit likelihood
# generated quantities: y_pred = alpha + new_X * beta, i.e. the linear
#                       predictor for the test rows (logit scale, not yet a
#                       probability)
code = """                                                                                           
data {                                                                                               
  int N; //the number of training observations                                                       
  int N2; //the number of test observations                                                          
  int K; //the number of features                                                                    
  int y[N]; //the response                                                                           
  matrix[N,K] X; //the model matrix                                                                  
  matrix[N2,K] new_X; //the matrix for the predicted values                                          
}                                                                                                    
parameters {                                                                                         
  real alpha;                                                                                        
  vector[K] beta; //the regression parameters                                                        
}                                                                                                    
transformed parameters {                                                                             
  vector[N] linpred;                                                                                 
  linpred = alpha+X*beta;                                                                            
}                                                                                                    
model {                                                                                              
  alpha ~ cauchy(0,13); //prior for the intercept following Gelman 2008
  //originally cauchy(0, 10) but 13 gives better score
                                                                                                     
  for(i in 1:K)                                                                                      
    beta[i] ~ student_t(1, 0, 0.03);                                                                 
                                                                                                     
  y ~ bernoulli_logit(linpred);                                                                      
}                                                                                                    
generated quantities {                                                                               
  vector[N2] y_pred;                                                                                 
  y_pred = alpha+new_X*beta; //the y values predicted by the model                                   
}                                                                                                    
"""

# Inputs for the Stan program. The dimensions are read from the arrays
# themselves so they can never disagree with the data that is passed in
# (Stan rejects the input if N, N2 or K do not match the matrix shapes).
data = {
    'N': train_X.shape[0],   # number of training observations
    'N2': test.shape[0],     # number of test observations
    'K': train_X.shape[1],   # number of features
    'y': train_y,
    'X': train_X,
    'new_X': test,
}

# Per the PyStan documentation `iter` is the total number of iterations per
# chain including warmup, so each chain keeps n_itr - n_warmup draws.
n_itr = 3000
n_warmup = 1000

# Compile the Stan program, draw posterior samples with a fixed sampler seed,
# and pull the draws out as a dict of arrays keyed by parameter name.
sm = pystan.StanModel(model_code = code)
fit = sm.sampling(data = data, iter = n_itr, warmup = n_warmup, seed = 2019)
ex = fit.extract(permuted = True)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ff9f6282f9e65bf0bbfafc8f9821850f NOW.
To run all diagnostics call pystan.check_hmc_diagnostics(fit)


In [26]:
import scipy
from scipy.stats import bernoulli

def logit_to_prob(logit):
    """Convert log-odds to probabilities with a numerically stable logistic.

    Evaluates 1 / (1 + exp(-logit)) as exp(-log(1 + exp(-logit))). The naive
    form exp(logit) / (1 + exp(logit)) overflows to inf / inf = NaN for large
    positive logits; this form saturates cleanly at 1.0 (and at 0.0 for large
    negative logits).

    Parameters
    ----------
    logit : scalar or array-like
        Log-odds value(s).

    Returns
    -------
    numpy scalar or ndarray
        Probabilities in [0, 1], with the same shape as ``logit``.
    """
    # log(1 + exp(-logit)), computed without overflowing for any input size.
    log_denominator = np.logaddexp(0, np.negative(logit))
    prob = np.exp(-log_denominator)
    return prob

In [27]:
# Turn each sampled test-set logit into a probability first, then average
# along axis 0 of the extracted draws. (Averaging the raw logits instead,
# i.e. np.mean(ex['y_pred'], axis = 0), was the alternative tried earlier.)
target = logit_to_prob(ex['y_pred']).mean(axis = 0)
ids = test_df['id']
df = pd.DataFrame({'id': ids, 'target' : target})
# Write the two submission columns, in this order, without the index.
df.to_csv("/Users/JoonH/DO2_pystan_log.csv", columns = ['id', 'target'], index = False)

This gives us a leaderboard (LB) score of 0.860.

*sources:*

1. https://www.kaggle.com/gkoundry/bayesian-logistic-regression-with-pystan/log
2. https://barnesanalytics.com/bayesian-logistic-regression-in-python-using-pymc3