# LASSO AND RIDGE MODELS

## CONTENTS

__1.LIBRARIES AND DATA__
    
_Importing Necessary tools for data science and dataset_
        
__2.RUNNING RIDGE AND LASSO REGRESSIONS__

    2.1 Necessary tranformations and defining dataframes
    2.2 Ridge Regression
    2.3 Lasso Regression
    
__3.ADDING POLYNOMIAL FEATURE OF 2ND DEGREE__

_Creating the X^2 dataframe_

__4.RUNNING RIDGE AND LASSO REGRESSION WITH THE EXPANDED X__

    4.1 Ridge Regression
    4.2 Lasso Regression
    
__5.CONCLUSION__


## 1. LIBRARIES AND DATA

__Libraries__

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import arange

from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', 100)

__Importing Data__

In [2]:
cancer_df = pd.read_csv(r'C:\Users\Constantine\OneDrive\Υπολογιστής\projects\Predicting Death Rate From Cancer\cancer_reg_refined.csv',
                 encoding='latin-1') 
cancer_df.head(2)

# Without the encoding paremeter, this error presents itself: UnicodeDecodeError: 'utf-8' codec can't decode 
# byte 0xf1 in position 41137: invalid continuation byte

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,Geography,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate
0,1397.0,469,164.9,489.8,61898,260131,11.2,499.748204,93564.75,39.3,36.9,41.7,WEST,2.54,52.5,11.5,39.5,6.9,23.2,19.6,51.9,8.0,75.1,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,49534.0,33.0,32.2,33.7,WEST,2.34,44.5,6.1,22.4,7.5,26.0,22.7,55.9,7.8,70.2,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096


In [3]:
# Copying and shuffling the original Dataframe
df1 = cancer_df.sample(frac = 1) 

## 2. RUNNING RIDGE AND LASSO REGRESSIONS

__2.1 Necessary tranformations and defining dataframes__

In [4]:
# Getting Dummy Variables
df1 = pd.get_dummies(data = df1, drop_first = True)
df1.head(2) 

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Geography_EAST,Geography_WEST
2512,124.0,52,175.9,438.7,52493,22372,11.6,0.0,52796.0,42.3,41.2,43.0,2.45,54.8,9.4,28.1,7.6,37.7,11.4,57.8,8.0,76.0,56.5,28.2,13.5,97.255249,0.424309,0.746961,0.097238,56.070215,3.045785,0,0
571,37.0,15,150.3,423.8,54556,5619,10.1,0.0,58020.05,47.5,46.2,49.1,2.22,57.5,15.8,33.1,5.8,39.1,13.5,63.3,3.4,74.7,44.2,31.9,12.7,97.191504,0.912761,0.017553,1.000527,53.409091,6.001847,0,0


In [5]:
# Defining X and Y
Y = df1['TARGET_deathRate']
X = df1.drop('TARGET_deathRate', axis = 1)
X.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Geography_EAST,Geography_WEST
2512,124.0,52,438.7,52493,22372,11.6,0.0,52796.0,42.3,41.2,43.0,2.45,54.8,9.4,28.1,7.6,37.7,11.4,57.8,8.0,76.0,56.5,28.2,13.5,97.255249,0.424309,0.746961,0.097238,56.070215,3.045785,0,0
571,37.0,15,423.8,54556,5619,10.1,0.0,58020.05,47.5,46.2,49.1,2.22,57.5,15.8,33.1,5.8,39.1,13.5,63.3,3.4,74.7,44.2,31.9,12.7,97.191504,0.912761,0.017553,1.000527,53.409091,6.001847,0,0


In [6]:
# Scaling X
X_scaled = StandardScaler().fit_transform(X)
X_scaled = pd.DataFrame(data = X_scaled, columns = X.columns)
X_scaled.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Geography_EAST,Geography_WEST
0,-0.481553,-0.393942,-0.169676,0.584401,-0.410667,-0.848716,-0.310592,0.33571,0.231991,0.260449,0.106801,-0.035276,0.402558,-1.111004,-0.802343,0.450061,0.336698,-0.273604,0.485042,0.046235,1.120084,1.702773,-1.104505,-0.966298,0.793148,-0.576263,-0.107457,-0.586012,0.725485,-1.296997,-0.817767,-0.380539
1,-0.640902,-0.676811,-0.458948,0.771486,-0.654673,-1.080915,-0.310592,0.668686,1.236816,1.217432,1.267699,-0.575872,0.800437,-0.330221,-0.2538,0.00404,0.548241,0.147314,1.146164,-1.265275,0.9978,0.377262,-0.624744,-1.097578,0.789135,-0.542854,-0.749344,-0.24919,0.306045,0.176533,-0.817767,-0.380539


In [7]:
# Using the RepeatedKFold function for cross-validation
rkf = RepeatedKFold(n_splits= 5, n_repeats= 10, random_state= 126) 

In [8]:
# Defining function that calculates cross-validated scores
def cv_scores(X,Y, model):
        
    cv_MAE = round(np.mean(cross_val_score(model, X, Y, cv=rkf, 
                                       scoring='neg_mean_absolute_error', error_score='raise') * (-1)),3)

    cv_MSE = round(np.mean(cross_val_score(model, X, Y, cv=rkf, 
                                          scoring='neg_mean_squared_error', error_score='raise') * (-1)),1)
    
    cv_MAPE = round(np.mean(cross_val_score(model, X, Y, cv=rkf, 
                                          scoring='neg_mean_absolute_percentage_error', error_score='raise') * (-100)),3)
    
    cv_R2 = round(np.mean(cross_val_score(model, X, Y, cv=rkf, 
                                          scoring='r2', error_score='raise')),3)
    
    cv_F_stat = round(((cv_R2)/(1-cv_R2))*((len(X) - len(X.columns) - 1 )/len(X.columns)),2)
    
    score_list = [cv_MAE, cv_MSE, cv_MAPE, cv_R2, cv_F_stat]
    return score_list

__2.2 Ridge Regression__

In [9]:
# Defining Grid to be 'fed' to GridSearchCV
grid = dict()
grid['alpha'] = arange(0.1,20, 0.1)

In [10]:
# Instantiating GridSearchCV for Ridge
Ridge_search = GridSearchCV(Ridge(), grid, scoring='neg_mean_absolute_error', cv=rkf, n_jobs=-1)

In [11]:
# Searching for the best parameters
CV_Ridge_results = Ridge_search.fit(X_scaled, Y)

In [12]:
# Best alpha
alpha_ridge = CV_Ridge_results.best_params_['alpha']
alpha_ridge

15.9

In [13]:
# Storing the cross-validated scores

CV_RESULTS = pd.DataFrame({'Ridge':cv_scores(X_scaled,Y, Ridge(alpha = alpha_ridge, max_iter=100000))}, 
                          index=['MAE','MSE', 'MAPE','R^2','F-stat'])
CV_RESULTS

Unnamed: 0,Ridge
MAE,14.149
MSE,366.8
MAPE,8.176
R^2,0.517
F-stat,88.54


__2.3 Lasso Regression__

In [14]:
# Defining Grid to be 'fed' to GridSearchCV
grid = dict()
grid['alpha'] = arange(0.01, 3, 0.01)

In [15]:
# Instantiating GridSearchCV for Lasso
Lasso_search = GridSearchCV(Lasso(), grid, scoring='neg_mean_absolute_error', cv=rkf, n_jobs=-1)

In [16]:
# Searching for the best parameters
CV_Lasso_results = Lasso_search.fit(X, Y)

In [17]:
# Best alpha
alpha_lasso = CV_Lasso_results.best_params_['alpha']
alpha_lasso

0.060000000000000005

In [18]:
# Storing the cross-validated scores

CV_RESULTS = pd.concat([CV_RESULTS ,pd.DataFrame({'Lasso':cv_scores(X_scaled,Y, Lasso(alpha = alpha_lasso, max_iter=100000))},
                                                 index=['MAE','MSE', 'MAPE','R^2','F-stat'])], axis =  1)
CV_RESULTS

Unnamed: 0,Ridge,Lasso
MAE,14.149,14.173
MSE,366.8,367.5
MAPE,8.176,8.19
R^2,0.517,0.517
F-stat,88.54,88.54


## 3. ADDING POLYNOMIAL FEATURE OF 2ND DEGREE

In [19]:
# Dividing X to numeric and categorical dataframes

X_num = X.drop(columns = ['Geography_EAST','Geography_WEST'])
X_cat = X[['Geography_EAST','Geography_WEST']]

# There is no point to addying categorical features to power of n to the dataset

__Creating the X^2 dataframe__

In [20]:
# Copying the numeric dataframe and assigning it to a variable.
X2_num = X_num.copy()

In [21]:
# Loop that adds polynomial feature of 2nd degree.
for i in X_num.columns:
    
    name2 = i + '^2'
    
    Z2 = X_num[i] * X_num[i]
    
    X2_num[name2] = Z2

In [22]:
# Concatenating numeric and categorical dataframes
X2 = pd.concat([X2_num.round(3), X_cat], axis =  1)

In [23]:
# Independent dataframe is X + X^2
X2.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,avgAnnCount^2,avgDeathsPerYear^2,incidenceRate^2,medIncome^2,popEst2015^2,povertyPercent^2,studyPerCap^2,binnedInc^2,MedianAge^2,MedianAgeMale^2,MedianAgeFemale^2,AvgHouseholdSize^2,PercentMarried^2,PctNoHS18_24^2,PctHS18_24^2,PctBachDeg18_24^2,PctHS25_Over^2,PctBachDeg25_Over^2,PctEmployed16_Over^2,PctUnemployed16_Over^2,PctPrivateCoverage^2,PctEmpPrivCoverage^2,PctPublicCoverage^2,PctPublicCoverageAlone^2,PctWhite^2,PctBlack^2,PctAsian^2,PctOtherRace^2,PctMarriedHouseholds^2,BirthRate^2,Geography_EAST,Geography_WEST
2512,124.0,52,438.7,52493,22372,11.6,0.0,52796.0,42.3,41.2,43.0,2.45,54.8,9.4,28.1,7.6,37.7,11.4,57.8,8.0,76.0,56.5,28.2,13.5,97.255,0.424,0.747,0.097,56.07,3.046,15376.0,2704,192457.69,2755515049,500506384,134.56,0.0,2787418000.0,1789.29,1697.44,1849.0,6.003,3003.04,88.36,789.61,57.76,1421.29,129.96,3340.84,64.0,5776.0,3192.25,795.24,182.25,9458.583,0.18,0.558,0.009,3143.869,9.277,0,0
571,37.0,15,423.8,54556,5619,10.1,0.0,58020.05,47.5,46.2,49.1,2.22,57.5,15.8,33.1,5.8,39.1,13.5,63.3,3.4,74.7,44.2,31.9,12.7,97.192,0.913,0.018,1.001,53.409,6.002,1369.0,225,179606.44,2976357136,31573161,102.01,0.0,3366326000.0,2256.25,2134.44,2410.81,4.928,3306.25,249.64,1095.61,33.64,1528.81,182.25,4006.89,11.56,5580.09,1953.64,1017.61,161.29,9446.189,0.833,0.0,1.001,2852.531,36.022,0,0


In [24]:
# Scaling X2
X2_scaled = StandardScaler().fit_transform(X2)
X2_scaled = pd.DataFrame(data = X2_scaled, columns = X2.columns)
X2_scaled.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,avgAnnCount^2,avgDeathsPerYear^2,incidenceRate^2,medIncome^2,popEst2015^2,povertyPercent^2,studyPerCap^2,binnedInc^2,MedianAge^2,MedianAgeMale^2,MedianAgeFemale^2,AvgHouseholdSize^2,PercentMarried^2,PctNoHS18_24^2,PctHS18_24^2,PctBachDeg18_24^2,PctHS25_Over^2,PctBachDeg25_Over^2,PctEmployed16_Over^2,PctUnemployed16_Over^2,PctPrivateCoverage^2,PctEmpPrivCoverage^2,PctPublicCoverage^2,PctPublicCoverageAlone^2,PctWhite^2,PctBlack^2,PctAsian^2,PctOtherRace^2,PctMarriedHouseholds^2,BirthRate^2,Geography_EAST,Geography_WEST
0,-0.481553,-0.393942,-0.169675,0.584401,-0.410667,-0.848716,-0.310592,0.33571,0.231991,0.260449,0.106801,-0.035273,0.402558,-1.111004,-0.802343,0.450061,0.336698,-0.273604,0.485042,0.046235,1.120084,1.702773,-1.104505,-0.966298,0.793133,-0.576284,-0.107419,-0.586103,0.725453,-1.296885,-0.398857,-0.295512,-0.232412,0.434361,-0.326939,-0.742585,-0.15369,0.144631,0.172287,0.196926,0.046046,-0.162636,0.361987,-0.838532,-0.81781,0.103406,0.258749,-0.35902,0.435109,-0.139276,1.169034,1.872166,-1.072841,-0.891925,0.906796,-0.360194,-0.221683,-0.238489,0.713933,-0.936921,-0.817767,-0.380539
1,-0.640902,-0.676811,-0.458947,0.771486,-0.654673,-1.080915,-0.310592,0.668686,1.236816,1.217432,1.267699,-0.575863,0.800437,-0.330221,-0.2538,0.00404,0.548241,0.147314,1.146164,-1.265275,0.9978,0.377262,-0.624744,-1.097578,0.789167,-0.542838,-0.748946,-0.249015,0.306031,0.17661,-0.411777,-0.324671,-0.519663,0.621193,-0.349583,-0.864054,-0.15369,0.441262,1.269172,1.234017,1.318047,-0.850234,0.80836,-0.417662,-0.360801,-0.207504,0.492049,-0.034659,1.195805,-0.88963,1.021765,0.272622,-0.683317,-0.972926,0.901285,-0.359388,-0.304782,-0.214755,0.256522,0.003551,-0.817767,-0.380539


## 4. RUNNING RIDGE AND LASSO REGRESSION WITH THE EXPANDED X

__4.1 Ridge Regression__

In [25]:
# Defining Grid to be 'fed' to GridSearchCV
grid = dict()
grid['alpha'] = arange(0.1,20, 0.1)

In [26]:
# Instantiating GridSearchCV for Ridge
Ridge_search = GridSearchCV(Ridge(), grid, scoring='neg_mean_absolute_error', cv=rkf, n_jobs=-1)

In [27]:
# Searching for the best parameters
CV_Ridge_results = Ridge_search.fit(X2_scaled, Y)

In [28]:
# Best alpha
alpha_ridge = CV_Ridge_results.best_params_['alpha']
alpha_ridge

0.30000000000000004

In [29]:
# Storing the cross-validated scores

CV_RESULTS = pd.concat([CV_RESULTS ,pd.DataFrame({'Ridge_X2':cv_scores(X2_scaled,Y, Ridge(alpha = alpha_ridge, 
                                                                                          max_iter=100000))},
                                                 index=['MAE','MSE', 'MAPE','R^2','F-stat'])], axis =  1)
CV_RESULTS

Unnamed: 0,Ridge,Lasso,Ridge_X2
MAE,14.149,14.173,13.365
MSE,366.8,367.5,332.3
MAPE,8.176,8.19,7.746
R^2,0.517,0.517,0.563
F-stat,88.54,88.54,54.38


__4.2 Lasso Regression__

In [30]:
# Defining Grid to be 'fed' to GridSearchCV
grid = dict()
grid['alpha'] = arange(0.01,3, 0.01)

In [31]:
# Instantiating GridSearchCV for Lasso
Lasso_search = GridSearchCV(Lasso(max_iter=100000), grid, scoring='neg_mean_absolute_error', cv=rkf, n_jobs=-1)

In [32]:
# Searching for the best parameters
CV_Lasso_results = Lasso_search.fit(X2_scaled, Y)

In [33]:
# Best alpha
alpha_lasso = CV_Lasso_results.best_params_['alpha']
alpha_lasso

0.01

In [34]:
# Storing the cross-validated scores

CV_RESULTS = pd.concat([CV_RESULTS ,pd.DataFrame({'Lasso_X2':cv_scores(X2_scaled,Y, Lasso(alpha = alpha_lasso, 
                                                                                          max_iter=100000))},
                                                 index=['MAE','MSE', 'MAPE','R^2','F-stat'])], axis =  1)
CV_RESULTS

Unnamed: 0,Ridge,Lasso,Ridge_X2,Lasso_X2
MAE,14.149,14.173,13.365,13.341
MSE,366.8,367.5,332.3,331.7
MAPE,8.176,8.19,7.746,7.731
R^2,0.517,0.517,0.563,0.564
F-stat,88.54,88.54,54.38,54.6


## 5. Conclusion

The best model (error wise) is produced  by applying the Lasso Regression to expanded intependent dataframe.

In [35]:
CV_RESULTS.to_csv(r'RidgeAndLasso_CV_Results.csv', index=True, index_label = 'Metric')