# LASSO AND RIDGE MODELS

## CONTENTS

__1.LIBRARIES AND DATA__
    
_Importing Necessary tools for data science and dataset_
        
__2.RUNNING RIDGE AND LASSO REGRESSIONS__

    2.1 Necessary tranformations and defining dataframes
    2.2 Ridge Regression
    2.3 Lasso Regression
    
__3.ADDING POLYNOMIAL FEATURE OF 2ND DEGREE__

_Creating the X^2 dataframe_

__4.RUNNING RIDGE AND LASSO REGRESSION WITH THE EXPANDED X__

    4.1 Ridge Regression
    4.2 Lasso Regression
    
__5.CONCLUSION__


## 1. LIBRARIES AND DATA

__Libraries__

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import arange

from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', 100)

__Importing Data__

In [2]:
cancer_df = pd.read_csv(r'C:\Users\Constantine\OneDrive\Υπολογιστής\Regression Port\cancer_reg_refined.csv',
                 encoding='latin-1') 
cancer_df.head(2)

# Without the encoding paremeter, this error presents itself: UnicodeDecodeError: 'utf-8' codec can't decode 
# byte 0xf1 in position 41137: invalid continuation byte

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,Geography,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate
0,1397.0,469,164.9,489.8,61898,260131,11.2,499.748204,93564.75,39.3,36.9,41.7,WEST,2.54,52.5,11.5,39.5,6.9,23.2,19.6,51.9,8.0,75.1,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,49534.0,33.0,32.2,33.7,WEST,2.34,44.5,6.1,22.4,7.5,26.0,22.7,55.9,7.8,70.2,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096


In [3]:
# Copying and shuffling the original Dataframe
df1 = cancer_df.sample(frac = 1) 

## 2. RUNNING RIDGE AND LASSO REGRESSIONS

__2.1 Necessary tranformations and defining dataframes__

In [4]:
# Getting Dummy Variables
df1 = pd.get_dummies(data = df1, drop_first = True)
df1.head(2) 

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Geography_EAST,Geography_WEST
980,522.0,181,161.9,484.1,77364,87850,5.0,330.108139,93564.75,43.8,42.7,44.8,2.5,60.5,14.3,28.5,15.9,22.5,30.3,64.8,4.5,85.2,61.4,24.4,9.5,94.598559,1.364683,2.091139,0.311666,60.599832,4.880531,0,0
2099,42.0,21,196.7,383.2,39484,7673,17.9,0.0,38888.25,47.2,46.2,47.8,2.23,46.5,15.2,36.6,6.1,42.2,7.4,48.9,9.3,58.6,33.3,38.0,20.8,66.19171,32.564767,0.272021,0.07772,43.211679,1.638311,1,0


In [5]:
# Defining X and Y
Y = df1['TARGET_deathRate']
X = df1.drop('TARGET_deathRate', axis = 1)
X.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Geography_EAST,Geography_WEST
980,522.0,181,484.1,77364,87850,5.0,330.108139,93564.75,43.8,42.7,44.8,2.5,60.5,14.3,28.5,15.9,22.5,30.3,64.8,4.5,85.2,61.4,24.4,9.5,94.598559,1.364683,2.091139,0.311666,60.599832,4.880531,0,0
2099,42.0,21,383.2,39484,7673,17.9,0.0,38888.25,47.2,46.2,47.8,2.23,46.5,15.2,36.6,6.1,42.2,7.4,48.9,9.3,58.6,33.3,38.0,20.8,66.19171,32.564767,0.272021,0.07772,43.211679,1.638311,1,0


In [6]:
# Scaling X
X_scaled = StandardScaler().fit_transform(X)
X_scaled = pd.DataFrame(data = X_scaled, columns = X.columns)
X_scaled.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Geography_EAST,Geography_WEST
0,0.247422,0.592276,0.711729,2.839846,0.543015,-1.870391,0.435084,2.934271,0.521844,0.547544,0.449361,0.082245,1.242524,-0.513217,-0.75846,2.506711,-1.960054,3.514652,1.326471,-0.951653,1.985478,2.230822,-1.597233,-1.622698,0.625883,-0.511943,1.075433,-0.506055,1.439433,-0.382418,-0.817767,-0.380539
1,-0.631744,-0.63094,-1.247165,-0.595329,-0.624757,0.12652,-0.310592,-0.550756,1.178845,1.217432,1.020295,-0.552368,-0.820551,-0.40342,0.130181,0.078377,1.016658,-1.075351,-0.584775,0.416879,-0.516639,-0.797377,0.166213,0.231632,-1.162616,1.622078,-0.52541,-0.59329,-1.301247,-1.998592,1.222843,-0.380539


In [7]:
# Using the RepeatedKFold function for cross-validation
rkf = RepeatedKFold(n_splits= 5, n_repeats= 10, random_state= 126) 

In [8]:
# Defining function that calculates cross-validated scores
def cv_scores(X,Y, model):
        
    cv_MAE = round(np.mean(cross_val_score(model, X, Y, cv=rkf, 
                                       scoring='neg_mean_absolute_error', error_score='raise') * (-1)),3)

    cv_MSE = round(np.mean(cross_val_score(model, X, Y, cv=rkf, 
                                          scoring='neg_mean_squared_error', error_score='raise') * (-1)),1)
    
    cv_MAPE = round(np.mean(cross_val_score(model, X, Y, cv=rkf, 
                                          scoring='neg_mean_absolute_percentage_error', error_score='raise') * (-100)),3)
    
    cv_R2 = round(np.mean(cross_val_score(model, X, Y, cv=rkf, 
                                          scoring='r2', error_score='raise')),3)
    
    cv_F_stat = round(((cv_R2)/(1-cv_R2))*((len(X) - len(X.columns) - 1 )/len(X.columns)),2)
    
    score_list = [cv_MAE, cv_MSE, cv_MAPE, cv_R2, cv_F_stat]
    return score_list

__2.2 Ridge Regression__

In [9]:
# Defining Grid to be 'fed' to GridSearchCV
grid = dict()
grid['alpha'] = arange(0.1,20, 0.1)

In [10]:
# Instantiating GridSearchCV for Ridge
Ridge_search = GridSearchCV(Ridge(), grid, scoring='neg_mean_absolute_error', cv=rkf, n_jobs=-1)

In [11]:
# Searching for the best parameters
CV_Ridge_results = Ridge_search.fit(X_scaled, Y)

In [12]:
# Best alpha
alpha_ridge = CV_Ridge_results.best_params_['alpha']
alpha_ridge

16.3

In [13]:
# Storing the cross-validated scores

CV_RESULTS = pd.DataFrame({'Ridge':cv_scores(X_scaled,Y, Ridge(alpha = alpha_ridge, max_iter=100000))}, 
                          index=['MAE','MSE', 'MAPE','R^2','F-stat'])
CV_RESULTS

Unnamed: 0,Ridge
MAE,14.168
MSE,367.7
MAPE,8.186
R^2,0.517
F-stat,88.54


__2.3 Lasso Regression__

In [14]:
# Defining Grid to be 'fed' to GridSearchCV
grid = dict()
grid['alpha'] = arange(0.01, 3, 0.01)

In [15]:
# Instantiating GridSearchCV for Lasso
Lasso_search = GridSearchCV(Lasso(), grid, scoring='neg_mean_absolute_error', cv=rkf, n_jobs=-1)

In [16]:
# Searching for the best parameters
CV_Lasso_results = Lasso_search.fit(X, Y)

In [17]:
# Best alpha
alpha_lasso = CV_Lasso_results.best_params_['alpha']
alpha_lasso

0.060000000000000005

In [18]:
# Storing the cross-validated scores

CV_RESULTS = pd.concat([CV_RESULTS ,pd.DataFrame({'Lasso':cv_scores(X_scaled,Y, Lasso(alpha = alpha_lasso, max_iter=100000))},
                                                 index=['MAE','MSE', 'MAPE','R^2','F-stat'])], axis =  1)
CV_RESULTS

Unnamed: 0,Ridge,Lasso
MAE,14.168,14.19
MSE,367.7,368.4
MAPE,8.186,8.199
R^2,0.517,0.516
F-stat,88.54,88.19


## 3. ADDING POLYNOMIAL FEATURE OF 2ND DEGREE

In [19]:
# Dividing X to numeric and categorical dataframes

X_num = X.drop(columns = ['Geography_EAST','Geography_WEST'])
X_cat = X[['Geography_EAST','Geography_WEST']]

# There is no point to addying categorical features to power of n to the dataset

__Creating the X^2 dataframe__

In [20]:
# Copying the numeric dataframe and assigning it to a variable.
X2_num = X_num.copy()

In [21]:
# Loop that adds polynomial feature of 2nd degree.
for i in X_num.columns:
    
    name2 = i + '^2'
    
    Z2 = X_num[i] * X_num[i]
    
    X2_num[name2] = Z2

In [22]:
# Concatenating numeric and categorical dataframes
X2 = pd.concat([X2_num.round(3), X_cat], axis =  1)

In [23]:
# Independent dataframe is X + X^2
X2.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,avgAnnCount^2,avgDeathsPerYear^2,incidenceRate^2,medIncome^2,popEst2015^2,povertyPercent^2,studyPerCap^2,binnedInc^2,MedianAge^2,MedianAgeMale^2,MedianAgeFemale^2,AvgHouseholdSize^2,PercentMarried^2,PctNoHS18_24^2,PctHS18_24^2,PctBachDeg18_24^2,PctHS25_Over^2,PctBachDeg25_Over^2,PctEmployed16_Over^2,PctUnemployed16_Over^2,PctPrivateCoverage^2,PctEmpPrivCoverage^2,PctPublicCoverage^2,PctPublicCoverageAlone^2,PctWhite^2,PctBlack^2,PctAsian^2,PctOtherRace^2,PctMarriedHouseholds^2,BirthRate^2,Geography_EAST,Geography_WEST
980,522.0,181,484.1,77364,87850,5.0,330.108,93564.75,43.8,42.7,44.8,2.5,60.5,14.3,28.5,15.9,22.5,30.3,64.8,4.5,85.2,61.4,24.4,9.5,94.599,1.365,2.091,0.312,60.6,4.881,272484.0,32761,234352.81,5985188496,7717622500,25.0,108971.383,8754362000.0,1918.44,1823.29,2007.04,6.25,3660.25,204.49,812.25,252.81,506.25,918.09,4199.04,20.25,7259.04,3769.96,595.36,90.25,8948.887,1.862,4.373,0.097,3672.34,23.82,0,0
2099,42.0,21,383.2,39484,7673,17.9,0.0,38888.25,47.2,46.2,47.8,2.23,46.5,15.2,36.6,6.1,42.2,7.4,48.9,9.3,58.6,33.3,38.0,20.8,66.192,32.565,0.272,0.078,43.212,1.638,1764.0,441,146842.24,1558986256,58874929,320.41,0.0,1512296000.0,2227.84,2134.44,2284.84,4.973,2162.25,231.04,1339.56,37.21,1780.84,54.76,2391.21,86.49,3433.96,1108.89,1444.0,432.64,4381.342,1060.464,0.074,0.006,1867.249,2.684,1,0


In [24]:
# Scaling X2
X2_scaled = StandardScaler().fit_transform(X2)
X2_scaled = pd.DataFrame(data = X2_scaled, columns = X2.columns)
X2_scaled.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,avgAnnCount^2,avgDeathsPerYear^2,incidenceRate^2,medIncome^2,popEst2015^2,povertyPercent^2,studyPerCap^2,binnedInc^2,MedianAge^2,MedianAgeMale^2,MedianAgeFemale^2,AvgHouseholdSize^2,PercentMarried^2,PctNoHS18_24^2,PctHS18_24^2,PctBachDeg18_24^2,PctHS25_Over^2,PctBachDeg25_Over^2,PctEmployed16_Over^2,PctUnemployed16_Over^2,PctPrivateCoverage^2,PctEmpPrivCoverage^2,PctPublicCoverage^2,PctPublicCoverageAlone^2,PctWhite^2,PctBlack^2,PctAsian^2,PctOtherRace^2,PctMarriedHouseholds^2,BirthRate^2,Geography_EAST,Geography_WEST
0,0.247422,0.592276,0.71173,2.839846,0.543015,-1.870391,0.435083,2.934271,0.521844,0.547544,0.449361,0.082246,1.242524,-0.513217,-0.75846,2.506711,-1.960054,3.514652,1.326471,-0.951653,1.985478,2.230822,-1.597233,-1.622698,0.625911,-0.511922,1.075313,-0.505933,1.439462,-0.382182,-0.161701,0.058029,0.704028,3.166655,0.021566,-1.151438,-0.075752,3.202074,0.475659,0.495594,0.403866,-0.004648,1.329505,-0.535483,-0.783997,2.617629,-1.726728,4.529846,1.41526,-0.765286,2.283863,2.618222,-1.422969,-1.247462,0.680165,-0.358119,0.346458,-0.236384,1.543652,-0.425525,-0.817767,-0.380539
1,-0.631744,-0.63094,-1.247164,-0.595329,-0.624757,0.12652,-0.310592,-0.550756,1.178845,1.217432,1.020295,-0.552359,-0.820551,-0.40342,0.130181,0.078377,1.016658,-1.075351,-0.584775,0.416879,-0.516639,-0.797377,0.166213,0.231632,-1.162598,1.622094,-0.525424,-0.593188,-1.301198,-1.998739,-0.411413,-0.32213,-1.252008,-0.577899,-0.348264,-0.049034,-0.15369,-0.508737,1.202437,1.234017,1.032837,-0.821451,-0.875791,-0.4662,0.003536,-0.161487,1.03891,-0.825495,-0.649465,0.182529,-0.591522,-0.818289,0.063588,0.075715,-1.350742,0.948218,-0.293762,-0.238561,-1.290408,-1.16876,1.222843,-0.380539


## 4. RUNNING RIDGE AND LASSO REGRESSION WITH THE EXPANDED X

__4.1 Ridge Regression__

In [25]:
# Defining Grid to be 'fed' to GridSearchCV
grid = dict()
grid['alpha'] = arange(0.1,20, 0.1)

In [26]:
# Instantiating GridSearchCV for Ridge
Ridge_search = GridSearchCV(Ridge(), grid, scoring='neg_mean_absolute_error', cv=rkf, n_jobs=-1)

In [27]:
# Searching for the best parameters
CV_Ridge_results = Ridge_search.fit(X2_scaled, Y)

In [28]:
# Best alpha
alpha_ridge = CV_Ridge_results.best_params_['alpha']
alpha_ridge

0.30000000000000004

In [29]:
# Storing the cross-validated scores

CV_RESULTS = pd.concat([CV_RESULTS ,pd.DataFrame({'Ridge_X2':cv_scores(X2_scaled,Y, Ridge(alpha = alpha_ridge, 
                                                                                          max_iter=100000))},
                                                 index=['MAE','MSE', 'MAPE','R^2','F-stat'])], axis =  1)
CV_RESULTS

Unnamed: 0,Ridge,Lasso,Ridge_X2
MAE,14.168,14.19,13.376
MSE,367.7,368.4,333.7
MAPE,8.186,8.199,7.75
R^2,0.517,0.516,0.561
F-stat,88.54,88.19,53.94


__4.2 Lasso Regression__

In [30]:
# Defining Grid to be 'fed' to GridSearchCV
grid = dict()
grid['alpha'] = arange(0.01,3, 0.01)

In [31]:
# Instantiating GridSearchCV for Lasso
Lasso_search = GridSearchCV(Lasso(max_iter=100000), grid, scoring='neg_mean_absolute_error', cv=rkf, n_jobs=-1)

In [32]:
# Searching for the best parameters
CV_Lasso_results = Lasso_search.fit(X2_scaled, Y)

In [33]:
# Best alpha
alpha_lasso = CV_Lasso_results.best_params_['alpha']
alpha_lasso

0.01

In [34]:
# Storing the cross-validated scores

CV_RESULTS = pd.concat([CV_RESULTS ,pd.DataFrame({'Lasso_X2':cv_scores(X2_scaled,Y, Lasso(alpha = alpha_lasso, 
                                                                                          max_iter=100000))},
                                                 index=['MAE','MSE', 'MAPE','R^2','F-stat'])], axis =  1)
CV_RESULTS

Unnamed: 0,Ridge,Lasso,Ridge_X2,Lasso_X2
MAE,14.168,14.19,13.376,13.357
MSE,367.7,368.4,333.7,333.2
MAPE,8.186,8.199,7.75,7.737
R^2,0.517,0.516,0.561,0.562
F-stat,88.54,88.19,53.94,54.16


## 5. Conclusion

The best model (error wise) is produced  by applying the Lasso Regression to expanded intependent dataframe.

In [35]:
CV_RESULTS.to_csv(r'RidgeAndLasso_CV_Results.csv', index=True, index_label = 'Metric')