In [34]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [2]:
import pyarrow.feather as feather

## Loading the Data

#### Training Data

In [3]:
train_data = feather.read_feather("C:/Users/manas/Downloads/masq_train.feather")

In [4]:
train_data

Unnamed: 0,D_DEPDYS,GENDER,Leeftijd,DEMOG1,DEMOG2,DEMOG3,DEMOG4,DEMOG5,DEMOG6,DEMOG7,...,MASQ81,MASQ82,MASQ83,MASQ84,MASQ85,MASQ86,MASQ87,MASQ88,MASQ89,MASQ90
0,1,v,27,2,5,3,1,6,1,1,...,2,1,1,3,1,5,1,3,4,5
1,1,v,59,1,2,2,1,2,1,1,...,4,3,1,3,1,5,2,4,1,4
2,0,v,55,,,,,,,,...,1,1,1,1,4,1,1,4,1,2
3,1,v,45,2,4,4,2,6,1,1,...,2,1,1,1,1,5,1,2,5,5
4,1,m,58,1,2,1,1,1,1,1,...,4,4,4,4,4,4,1,4,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1793,0,v,44,2,4,2,1,6,1,1,...,2,4,2,4,1,5,1,1,1,3
1794,1,m,57,4,2,2,1,6,1,1,...,2,3,1,4,4,5,1,2,2,4
1795,1,m,72,1,2,2,1,4,1,1,...,2,4,4,1,4,5,1,1,1,4
1796,1,v,48,2,1,4,1,6,1,1,...,5,4,5,5,1,5,1,1,5,5


In [5]:
train_data.replace(["NA", "NaN", "None"], float("NaN"), inplace=True)
train_data.dropna(inplace=True)

In [6]:
X_train = train_data.drop(columns=['D_DEPDYS','GENDER'], axis=1)
Y_train = train_data['D_DEPDYS']

In [7]:
std_scaler = StandardScaler()
Scaled_X_train = std_scaler.fit_transform(X_train)

#### Test data

In [8]:
test_data = feather.read_feather("C:/Users/manas/Downloads/masq_test.feather")

In [9]:
test_data.replace(["NA", "NaN", "None"], float("NaN"), inplace=True)
test_data.dropna(inplace=True)

In [20]:
X_test = test_data.drop(columns=['D_DEPDYS','GENDER'], axis=1)
Y_test = test_data['D_DEPDYS']

In [21]:
std_scaler = StandardScaler()
Scaled_X_test = std_scaler.fit_transform(X_test)

### Multicollinearity

In [10]:
masq_items = []

In [11]:
for i in train_data.columns :
    if i.startswith('MASQ'):
        masq_items.append(i)

In [12]:
masq_items_col = train_data[masq_items]

In [13]:
collineraity_mat = masq_items_col.corr()


In [14]:
collineraity_mat

Unnamed: 0,MASQ01,MASQ02,MASQ03,MASQ04,MASQ05,MASQ06,MASQ07,MASQ08,MASQ09,MASQ11,...,MASQ81,MASQ82,MASQ83,MASQ84,MASQ85,MASQ86,MASQ87,MASQ88,MASQ89,MASQ90
MASQ01,1.000000,0.262217,0.265062,0.380913,0.335543,0.474756,0.315556,0.479949,0.172166,0.540486,...,0.224389,0.238898,0.277891,0.429284,0.142316,0.587306,0.128830,0.207448,0.324105,0.398467
MASQ02,0.262217,1.000000,0.608717,0.478178,0.179281,0.438185,-0.001811,0.408861,0.294180,0.211487,...,0.243169,0.280581,0.207109,0.460370,0.230764,0.264327,0.238559,0.292913,0.270357,0.281993
MASQ03,0.265062,0.608717,1.000000,0.502096,0.202690,0.382599,-0.024594,0.385485,0.329800,0.227690,...,0.336869,0.340081,0.215435,0.387407,0.248509,0.249502,0.270307,0.354405,0.260470,0.322646
MASQ04,0.380913,0.478178,0.502096,1.000000,0.213340,0.561569,0.110424,0.528397,0.300713,0.291479,...,0.308149,0.373863,0.239166,0.504445,0.268716,0.378310,0.268544,0.316389,0.386688,0.433322
MASQ05,0.335543,0.179281,0.202690,0.213340,1.000000,0.278225,0.135236,0.244953,0.163066,0.242694,...,0.180029,0.190420,0.641756,0.322225,0.212618,0.290202,0.141148,0.209039,0.160104,0.260034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MASQ86,0.587306,0.264327,0.249502,0.378310,0.290202,0.434135,0.306033,0.440462,0.170428,0.564515,...,0.178069,0.198345,0.214043,0.410241,0.127133,1.000000,0.128708,0.173450,0.305387,0.371028
MASQ87,0.128830,0.238559,0.270307,0.268544,0.141148,0.208962,0.041781,0.206729,0.287024,0.122648,...,0.271658,0.238203,0.177449,0.225679,0.287911,0.128708,1.000000,0.335287,0.166844,0.264125
MASQ88,0.207448,0.292913,0.354405,0.316389,0.209039,0.242952,0.048697,0.272203,0.303965,0.149199,...,0.374239,0.278315,0.207560,0.270816,0.290134,0.173450,0.335287,1.000000,0.201574,0.332461
MASQ89,0.324105,0.270357,0.260470,0.386688,0.160104,0.428470,0.076493,0.428600,0.161391,0.244015,...,0.134511,0.172068,0.167206,0.307362,0.141278,0.305387,0.166844,0.201574,1.000000,0.319554


## Implementing the models

In [15]:
lasso = Lasso()
ridge = Ridge()
elastic_net = ElasticNet()

In [16]:
lasso_hyperparams = {'alpha': [0.01, 0.1, 1, 10]}
ridge_hyperparams = {'alpha': [0.01, 0.1, 1, 10]}
elastic_net_hyperparams = {'alpha': [0.01, 0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9]}

### Lasso CV

In [17]:
lasso_cv = GridSearchCV(lasso, param_grid=lasso_hyperparams, cv=10)
lasso_cv.fit(Scaled_X_train, Y_train)

### Ridge CV

In [18]:
ridge_cv = GridSearchCV(ridge, param_grid=ridge_hyperparams, cv=10)
ridge_cv.fit(Scaled_X_train, Y_train)

#### Elastic Net CV

In [19]:
enet_cv = GridSearchCV(elastic_net, param_grid=elastic_net_hyperparams, cv=10)
enet_cv.fit(Scaled_X_train, Y_train)

### Selecting the best models

In [29]:
lasso_best = lasso_cv.best_estimator_
ridge_best = ridge_cv.best_estimator_
enet_best = enet_cv.best_estimator_

### Performing Prediction

#### Lasso

In [30]:
pred_Lasso = lasso_best.predict(Scaled_X_test)

#### Ridge

In [31]:
pred_ridge = ridge_best.predict(Scaled_X_test)

#### Elastic Net

In [32]:
elastic_net_pred = enet_best.predict(Scaled_X_test)

### Accuracy

#### Lasso

In [37]:
Lass_acc_score = accuracy_score(Y_test, pred_Lasso.round())
Lass_acc_score

0.7772667542706965

#### Ridge

In [38]:
Ridge_acc_score = accuracy_score(Y_test, pred_ridge.round())
Ridge_acc_score

0.7752956636005256

#### Elastic Net

In [39]:
elnet_acc_score = accuracy_score(Y_test, elastic_net_pred.round())
elnet_acc_score

0.778580814717477

### Extracting the coefficients

#### Lasso

In [42]:
MASQ_coeff_lasso = dict(zip(X_train.columns, lasso_best.coef_))
MASQ_coeff_lasso


{'Leeftijd': 0.01479089887927484,
 'DEMOG1': -0.0,
 'DEMOG2': -0.0,
 'DEMOG3': -0.016300475591037006,
 'DEMOG4': -0.0018809499097131818,
 'DEMOG5': 0.0005189564630116448,
 'DEMOG6': 0.0,
 'DEMOG7': -0.0,
 'DEMOG8': -0.0,
 'MASQ01': 0.041216180860564534,
 'MASQ02': -0.023035078265081693,
 'MASQ03': -0.010055470719310573,
 'MASQ04': 0.0,
 'MASQ05': 9.50174726711432e-05,
 'MASQ06': 0.0,
 'MASQ07': 0.0,
 'MASQ08': -0.0,
 'MASQ09': 0.0,
 'MASQ11': 0.0,
 'MASQ12': -0.0,
 'MASQ13': 0.01063724087895744,
 'MASQ14': 0.0023702738145435953,
 'MASQ15': 0.0,
 'MASQ16': 0.08281855073828258,
 'MASQ17': -0.002139427236641269,
 'MASQ18': 0.0,
 'MASQ19': 0.0,
 'MASQ20': -0.0,
 'MASQ21': 0.0005806067375582409,
 'MASQ22': 0.04239542131145587,
 'MASQ23': 0.0,
 'MASQ24': 0.010210687515838823,
 'MASQ25': -0.0031180114405663525,
 'MASQ26': 0.0,
 'MASQ27': 0.0,
 'MASQ28': -0.0,
 'MASQ29': 0.0029588500311477483,
 'MASQ30': 0.028849751445227063,
 'MASQ31': 0.01350633702986144,
 'MASQ32': 0.0,
 'MASQ33': 0.0054722

#### Ridge

In [43]:
MASQ_coeff_ridge = dict(zip(X_train.columns, ridge_best.coef_))
MASQ_coeff_ridge

{'Leeftijd': 0.0037160033610032102,
 'DEMOG1': -0.009665083609585928,
 'DEMOG2': -0.005400859476483578,
 'DEMOG3': -0.024780047416813507,
 'DEMOG4': -0.0069610970691253915,
 'DEMOG5': 0.013823710674293643,
 'DEMOG6': 0.005973934605459416,
 'DEMOG7': -0.003385946502305092,
 'DEMOG8': -0.011849564671361659,
 'MASQ01': 0.04864051143870646,
 'MASQ02': -0.030031552163509334,
 'MASQ03': -0.024229876116464297,
 'MASQ04': 0.010142847691749802,
 'MASQ05': 0.009364192094218678,
 'MASQ06': -0.003879945086609907,
 'MASQ07': -0.0006814810872363259,
 'MASQ08': -0.022224722627055476,
 'MASQ09': 0.012152113026813426,
 'MASQ11': 0.007204322843791639,
 'MASQ12': -0.01748998394489419,
 'MASQ13': 0.0447832014393135,
 'MASQ14': 0.013305652166948682,
 'MASQ15': 0.014598376979594326,
 'MASQ16': 0.09862378804802542,
 'MASQ17': -0.0304684440320716,
 'MASQ18': 0.009418608109222026,
 'MASQ19': -0.0016104142823778434,
 'MASQ20': -0.026684325701459576,
 'MASQ21': 0.006656358941045628,
 'MASQ22': 0.0572230395323019

#### Elastic Net

In [44]:
MASQ_coeff_elanet = dict(zip(X_train.columns, enet_best.coef_))
MASQ_coeff_elanet

{'Leeftijd': 0.014793192640770315,
 'DEMOG1': -0.0,
 'DEMOG2': -0.0,
 'DEMOG3': -0.015414998111684118,
 'DEMOG4': -0.0021918137221413007,
 'DEMOG5': 0.0005171519091301324,
 'DEMOG6': 0.0,
 'DEMOG7': -0.0,
 'DEMOG8': -0.0,
 'MASQ01': 0.03706342544720465,
 'MASQ02': -0.019159994266461193,
 'MASQ03': -0.009784511801573954,
 'MASQ04': 0.0,
 'MASQ05': 0.001010058624440845,
 'MASQ06': 0.0,
 'MASQ07': 0.0,
 'MASQ08': -0.0,
 'MASQ09': 0.0,
 'MASQ11': 0.0,
 'MASQ12': -0.0,
 'MASQ13': 0.012340359697115905,
 'MASQ14': 0.005974417556023659,
 'MASQ15': 0.0,
 'MASQ16': 0.0683807134930063,
 'MASQ17': -0.0,
 'MASQ18': 0.0013681054058610458,
 'MASQ19': 0.0,
 'MASQ20': -0.0,
 'MASQ21': 0.002437242865682775,
 'MASQ22': 0.037142592395685604,
 'MASQ23': 0.0,
 'MASQ24': 0.009955011473571737,
 'MASQ25': -0.0026142725006029974,
 'MASQ26': 0.0,
 'MASQ27': 0.0,
 'MASQ28': -0.0,
 'MASQ29': 0.00650219911551528,
 'MASQ30': 0.02544623631997613,
 'MASQ31': 0.013118962799443412,
 'MASQ32': 0.0,
 'MASQ33': 0.009550735