In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import TruncatedSVD

In [43]:
train_path = '../Datasets/train.csv'  #paths to data
test_path = '../Datasets/test.csv'

In [44]:
train_df.columns                      #listing columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '290', '291', '292', '293', '294', '295', '296', '297', '298', '299'],
      dtype='object', length=300)

In [45]:
train_df = pd.read_csv(train_path)                          #load the train data
train_df_values = train_df.drop(['id', 'target'], axis = 1)   #retain only required columns for train matrix

In [46]:
X = train_df_values.values                                  #convert the df into array

In [29]:
svd_100 = TruncatedSVD(n_components=100, n_iter=20, random_state=42)  #svd object with 100 components

In [30]:
svd_100.fit(X)                                  

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=20,
       random_state=42, tol=0.0)

In [31]:
print(svd_100.explained_variance_ratio_.sum()) 

0.7803804657811666


We see that 100 columns explain upto 78% of variance. Let's try 150 columns

In [33]:
svd_150 = TruncatedSVD(n_components=150, n_iter=20, random_state=42)
svd_150.fit(X)

TruncatedSVD(algorithm='randomized', n_components=150, n_iter=20,
       random_state=42, tol=0.0)

In [34]:
print(svd_150.explained_variance_ratio_.sum()) 

0.9246093979004004


150 columns gives about 92% variance which seems enough to achieve a parsimonious model

In [37]:
X_reduced = svd_150.transform(X)

X_reduced is the new train matrix with 150 columns which explain 92% of the variance. Lasso regression has been the most successful model yet so we will perform lasso regression using this reduced matrix

In [39]:
test = pd.read_csv(test_path)                 #read test data

In [47]:
X_test = test.drop(['id'], axis=1)            #load X_test and y_train 
y_train = train_df['target']

Rest of the code is borrowed from Lasso Regression which can be found in ../3 folder.

In [52]:
random_state = 0

#Hyperparameter = lambda
#Using Logistic Regression with l1 regularisation
logit = LogisticRegression(random_state=random_state)
#Using ROC_AUC score for testing each lambda value
rocauc_score = make_scorer(roc_auc_score) 
#Using GridSearch to search for the best lambda value for the model
parameter_grid = {'class_weight':['balanced'], 'penalty' : ['l1', 'l2'], 'C':[0.0001, 0.0005, 0.001,0.005, 0.01, 0.05, 0.1, 0.5, 1,10, 100, 1000, 1500, 2000, 2500,2600, 2700, 2800, 2900, 3000, 3100, 3200],'max_iter' : [100, 1000, 2000, 5000, 10000] }

#Grid Search
grid = GridSearchCV(estimator=logit,param_grid=parameter_grid,scoring=rocauc_score,verbose=1,cv=20,n_jobs=-1)
grid.fit(X_reduced, y_train)
best_score = grid.best_score_
best_para = grid.best_params_
best_logit = grid.best_estimator_
#roc_auc Score
print("Best Score obtained is: ", best_score, "for the parameters: ", best_para)
#Hyperparameters of the best model
print(best_logit)

Fitting 20 folds for each of 220 candidates, totalling 4400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 319 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 819 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 1519 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 2419 tasks      | elapsed:   44.7s
[Parallel(n_jobs=-1)]: Done 3519 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 4385 out of 4400 | elapsed:  1.3min remaining:    0.3s


Best Score obtained is:  0.7017 for the parameters:  {'C': 1, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l1'}
LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=0,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)


[Parallel(n_jobs=-1)]: Done 4400 out of 4400 | elapsed:  1.3min finished


In [53]:
#Creating a Logistic Regression Model with l1 Regularisation with the above obtained best hyperparameters
model = LogisticRegression(C=1, class_weight='balanced', dual=False,fit_intercept=True, intercept_scaling=1, max_iter=100,multi_class='warn', n_jobs=None, penalty='l1', random_state=0, solver='liblinear', tol=0.0001, verbose=0, warm_start=False);
model.fit(X_reduced, y_train)

LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [55]:
#Train Score on the model
score_train = model.score(X_reduced, y_train)
print("Train Score :", str(score_train*100)+" %")

Train Score : 100.0 %


In [57]:
#do svd with same params on  X_test 
X_test_reduced = svd_150.transform(X_test)

In [75]:
#Generating the predicted values and testing them on Kaggle to get a score
y_pred_logit_lasso = model.predict_proba(X_test_reduced)[:,1]

In [76]:
# y_pred_logit_lasso_05 = [1 if x > 0.5 else 0 for x in y_pred_logit_lasso]
data = {'id':test['id'], 'target':y_pred_logit_lasso}
df = pd.DataFrame(data)
df.to_csv('results_05.csv', index=False)

          id      0      1      2      3      4      5      6      7      8  \
0        250  0.500 -1.033 -1.595  0.309 -0.714  0.502  0.535 -0.129 -0.687   
1        251  0.776  0.914 -0.494  1.347 -0.867  0.480  0.578 -0.313  0.203   
2        252  1.750  0.509 -0.057  0.835 -0.476  1.428 -0.701 -2.009 -1.378   
3        253 -0.556 -1.855 -0.682  0.578  1.592  0.512 -1.419  0.722  0.511   
4        254  0.754 -0.245  1.173 -1.623  0.009  0.370  0.781 -1.763 -1.432   
5        255 -1.650 -0.534 -1.056  2.040  1.479 -0.197 -0.441 -0.557 -0.241   
6        256  0.096  0.794 -0.126  0.787  1.187 -0.611  1.383 -0.383 -1.934   
7        257 -1.205  0.295 -0.221  0.840  1.545 -0.152  0.373 -0.349 -0.922   
8        258 -0.377 -1.135 -1.791 -0.642  0.353 -1.234  0.878 -2.155 -0.542   
9        259 -1.731  1.171  0.383 -1.039 -0.091 -0.076  0.244 -1.023 -0.876   
10       260  1.208 -2.625 -0.417  1.182  0.151 -0.457 -0.080  1.658 -1.032   
11       261 -0.541  0.281 -1.248 -0.069 -1.235 -2.1