# **Import of the Data files and basic Libraries**

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# other imports will be made when needed later in the code

In [2]:
bank = pd.read_csv('Data/bank.csv', sep=';')
bank_full = pd.read_csv('Data/bank-full.csv', sep=';')

bank_add = pd.read_csv('Data/bank-additional.csv', sep=';')
bank_add_full = pd.read_csv('Data/bank-additional-full.csv', sep=';')

# **Inspecting the Datasets**

In [3]:
# In the code below we examine which dataset we will use for the study

In [4]:
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [5]:
bank_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [6]:
bank_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             4119 non-null   int64  
 1   job             4119 non-null   object 
 2   marital         4119 non-null   object 
 3   education       4119 non-null   object 
 4   default         4119 non-null   object 
 5   housing         4119 non-null   object 
 6   loan            4119 non-null   object 
 7   contact         4119 non-null   object 
 8   month           4119 non-null   object 
 9   day_of_week     4119 non-null   object 
 10  duration        4119 non-null   int64  
 11  campaign        4119 non-null   int64  
 12  pdays           4119 non-null   int64  
 13  previous        4119 non-null   int64  
 14  poutcome        4119 non-null   object 
 15  emp.var.rate    4119 non-null   float64
 16  cons.price.idx  4119 non-null   float64
 17  cons.conf.idx   4119 non-null   f

In [7]:
bank_add_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [8]:
print('The shape of bank is         : ', bank.shape)
print('The shape of bank_full is    : ',bank_full.shape)
print('The shape of bank_add is     : ',bank_add.shape)
print('The shape of bank_add_full is: ',bank_add_full.shape)

The shape of bank is         :  (4521, 17)
The shape of bank_full is    :  (45211, 17)
The shape of bank_add is     :  (4119, 21)
The shape of bank_add_full is:  (41188, 21)


In [9]:
# The biggest Dataset is the Bank_full but the one with the most independent variables is Bank_add_full. 
# In order to get the best performing prediction model we decided to choose Bank_add_full as our base dataset for this study

## Check the dataset for Missing Values

In [10]:
bank_add_full.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [11]:
bank_add_full.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [12]:
bank_add_full.keys()

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [13]:
# As you can also see, there are no missing values.
# therefore the dataset is complete and no further cleaning has to be done

# **Transform the Dataset**

## Change the Classification to a Binary 

In [14]:
clean_bank_add_full = pd.get_dummies(bank_add_full, drop_first=True)
clean_bank_add_full.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,1,0,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,1,0,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,1,0,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,1,0,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,1,0,0


## Drop duration

In [15]:
# according to the researchers it is usefull to drop duration
clean_bank_add_full = clean_bank_add_full.drop('duration', axis=1)

In [16]:
clean_bank_add_full.keys()

Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_married', 'marital_single', 'marital_unknown',
       'education_basic.6y', 'education_basic.9y', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown', 'default_unknown',
       'default_yes', 'housing_unknown', 'housing_yes', 'loan_unknown',
       'loan_yes', 'contact_telephone', 'month_aug', 'month_dec', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
       'day_of_week_wed', 'poutcome_nonexistent', 'poutcome_success', 'y_

## Next we define the X & Y

In [17]:
# We define the x Table and y Table
    #x Table contains all ....... (without the outcome)
    #y Table only contains the Outcome 0 or 1 (=Success)
x = clean_bank_add_full.drop('y_yes', axis=1)       # independant features
y = clean_bank_add_full['y_yes']                    # dependent variable

## We split the Dataset into Train (0.5) / Validation (0.2) / Test (0.3)

In [18]:
from sklearn.model_selection import train_test_split

# First we split in Train (0.7) & in Test (0.3)
# random state means the code will always have the same training set (in order to reproduce results)
train1_x, test_x, train1_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)

# Second we split again to get Train (0.5) / Validation (0.2) / Test (0.3)
train_x, validation_x, train_y, validation_y = train_test_split(train1_x, train1_y, test_size=(0.2/0.7), random_state=42)

In [19]:
train1_x.head()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_blue-collar,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
39075,29,3,999,1,-3.0,92.713,-33.0,0.709,5023.5,0,...,0,0,0,0,1,0,0,0,0,0
34855,29,4,999,0,-1.8,92.893,-46.2,1.25,5099.1,0,...,1,0,0,0,0,0,0,0,1,0
7107,45,2,999,0,1.1,93.994,-36.4,4.86,5191.0,1,...,1,0,0,0,0,1,0,0,1,0
31614,34,1,999,1,-1.8,92.893,-46.2,1.327,5099.1,0,...,1,0,0,0,0,1,0,0,0,0
34878,32,9,999,0,-1.8,92.893,-46.2,1.25,5099.1,0,...,1,0,0,0,0,0,0,0,1,0


In [20]:
#Quick check if the split is correct:
print("Shape of train_x             : "+str(train_x.shape))
print("Shape of validation_x        : "+str(validation_x.shape))
print("Shape of test_x              : "+str(test_x.shape))
print("Shape of train_y             : "+str(train_y.shape))
print("Shape of validation_y        : "+str(validation_y.shape))
print("Shape of test_y              : "+str(test_y.shape))
print('')
print("Lenght of Dataset            : ", len(x))
print("Percentage of Train          : ", '{:.2f}'.format(len(train_x)/len(x)*100), '%')
print("Percentage of Validation     : ", '{:.2f}'.format(len(validation_x)/len(x)*100), '%')
print("Percentage of Test           : ", '{:.2f}'.format(len(test_x)/len(x)*100), '%')


Shape of train_x             : (20593, 52)
Shape of validation_x        : (8238, 52)
Shape of test_x              : (12357, 52)
Shape of train_y             : (20593,)
Shape of validation_y        : (8238,)
Shape of test_y              : (12357,)

Lenght of Dataset            :  41188
Percentage of Train          :  50.00 %
Percentage of Validation     :  20.00 %
Percentage of Test           :  30.00 %


## Linear Regression

## OLS

In [21]:
x = train_x
xcon = sm.add_constant(x)
model = sm.OLS(train_y, xcon)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y_yes,R-squared:,0.218
Model:,OLS,Adj. R-squared:,0.216
Method:,Least Squares,F-statistic:,112.2
Date:,"Fri, 10 Jun 2022",Prob (F-statistic):,0.0
Time:,10:51:45,Log-Likelihood:,-2938.6
No. Observations:,20593,AIC:,5981.0
Df Residuals:,20541,BIC:,6394.0
Df Model:,51,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-29.5598,4.757,-6.214,0.000,-38.883,-20.236
age,-7.257e-05,0.000,-0.298,0.766,-0.001,0.000
campaign,-0.0023,0.001,-3.175,0.002,-0.004,-0.001
pdays,-0.0003,3.59e-05,-8.300,0.000,-0.000,-0.000
previous,-0.0197,0.010,-2.041,0.041,-0.039,-0.001
emp.var.rate,-0.2256,0.019,-11.790,0.000,-0.263,-0.188
cons.price.idx,0.2936,0.032,9.254,0.000,0.231,0.356
cons.conf.idx,0.0045,0.001,4.078,0.000,0.002,0.007
euribor3m,0.0688,0.016,4.320,0.000,0.038,0.100

0,1,2,3
Omnibus:,8326.874,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,33466.471
Skew:,2.023,Prob(JB):,0.0
Kurtosis:,7.758,Cond. No.,1e+16


In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

betaOLS = np.dot(np.linalg.inv(np.dot(train_x.T,train_x)),np.dot(train_x.T,train_y))
print("OLS Estimates               : ", betaOLS)

lam = 50
betaRidge = np.dot(np.linalg.inv(np.dot(train_x.T,train_x)+lam*np.identity(np.shape(xs)[1])),np.dot(train_x.T,train_y))
print("Own Ridge Estimates         : ", np.append(np.mean(y),betaRidge))

lr = LinearRegression()
lr.fit(train_x,train_y)
rr = Ridge(alpha=lam, fit_intercept=True, solver='sparse_cg')
rr.fit(train_x, train_y)
print("Ridge Estimates from package: ",np.append(rr.intercept_,rr.coef_))

OLS Estimates               :  [-6.38505826e+00 -1.26834090e+01 -1.64788788e-02 -1.94044798e+01
 -1.64169706e+02  3.03640624e+01 -6.53039018e+00  1.58739663e+02
 -6.82180242e-01  2.36499188e+00  7.24935578e+00  6.86571076e+00
  1.57120700e+01  4.65027859e+01  1.13086770e+01 -3.60432730e-01
 -2.78412714e+00  9.68187614e+00  3.78380235e+01  2.22254330e+01
 -1.11066819e+00  1.27076946e+00  9.32474357e+01 -1.75504302e+01
 -1.38354397e+00  4.11455861e+00  2.33504322e+02 -1.44439406e+01
 -4.03253794e+00  6.98303504e+00  1.54515657e+01 -1.02922540e+02
  9.20796923e+18 -1.12137385e+02 -9.20796923e+18  7.03778889e-02
  6.87901446e+00  1.36273707e+01  7.53597789e+00  6.79113880e+00
  1.42685237e+00  1.07990210e+01 -1.54650127e-01  1.12909656e+01
  8.20355802e+00  4.47766335e+00 -1.09083019e+01 -1.06450097e+01
 -6.67463124e+00 -4.60116847e+00 -1.03940133e+00 -4.48032872e-01]


NameError: name 'xs' is not defined

In [36]:
log_reg = sm.Logit(train_y, train_x).fit()

print(log_reg.summary())

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


         Current function value: inf
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                  y_yes   No. Observations:                20593
Model:                          Logit   Df Residuals:                    20542
Method:                           MLE   Df Model:                           50
Date:                Fri, 10 Jun 2022   Pseudo R-squ.:                     inf
Time:                        13:59:15   Log-Likelihood:                   -inf
converged:                      False   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
age                              -0.0007      0.003     -0.203      0.839      -0.007       0.006
campaign     



In [23]:
# does not work
from statsmodels.formula.api import logit
datalogit = pd.DataFrame(np.column_stack((train_y,train_x)))
formula = ('train_y ~ train_x')
model = logit(formula = formula, data = datalogit).fit()
print(model.summary())

PatsyError: Index mismatch between data.index and train_y
    train_y ~ train_x
    ^^^^^^^

In [24]:
from statsmodels.regression.linear_model import OLS

model1 = OLS(train_y-1, train_x)

result1 = model1.fit_regularized(alpha=1., L1_wt=0) #ridge

print(result1.params)

model2 = OLS(train_y-1, train_x)

result2 = model1.fit_regularized(alpha=1., L1_wt=1) #lasso

print(result2.params)

[-1.08164284e-01  6.04595560e-01  1.07574339e-01  6.40302908e-01
  3.78226347e+00 -5.05421150e+00 -1.24538899e+00  4.08425745e+00
  1.02880165e-01  8.23026501e-01  3.05240827e-02  9.42817770e-02
  5.67678737e-02 -5.32897377e-01  1.14994189e-01  3.38353969e-01
 -3.82267190e-01 -8.91241413e-02 -1.31593554e-01  5.55641251e-02
  4.31910162e-01 -5.35279702e-01 -9.21018297e-03  1.43034900e-01
  5.60621074e-01  1.64883349e-01 -1.29232163e-02 -8.75138939e-02
 -9.36951927e-01  1.86182143e-02  8.92163474e-01  8.09698320e-04
  3.68262913e-02 -1.78855174e-02  3.68262913e-02 -1.30384507e-01
  1.86307548e+00 -1.63684529e-01 -1.49563364e-01 -1.45813383e+00
 -5.72194547e-01 -7.85211762e-01  3.73273077e+00  2.88245856e-01
 -3.32543363e-01 -1.60290442e-01  5.07716307e-01 -4.44641578e-01
 -6.37776608e-03 -4.56125807e-01 -7.45545280e-01 -1.16677340e-01]
age                               0.434555
campaign                          0.949712
pdays                             0.179818
previous                 

## Linear Regression

In [25]:
#??
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(train_x, train_y)

In [26]:
# check how good the train model is on the test data (how it works on data which is not in the "data set")
reg.score(test_x, test_y)

0.20804890276087074

## Lambdas grid

In [27]:
from sklearn.linear_model import RidgeCV
lambdas = np.arange(7, 8, 0.001)
rr = RidgeCV(alphas=lambdas, fit_intercept=True, store_cv_values=True)
rr.fit(train_x, train_y)

In [28]:
betaRidge = np.append(rr.intercept_,rr.coef_)
betaRidge

array([-2.27087540e+01, -6.00063873e-05, -2.26483562e-03, -3.08061083e-04,
       -1.97382041e-02, -1.97818692e-01,  2.46253352e-01,  4.14797049e-03,
        7.29205791e-02,  1.43347260e-06, -4.82103558e-03, -1.80514349e-04,
       -1.51786130e-02, -6.08909604e-03,  3.22795442e-02, -1.39077115e-02,
       -7.81791916e-03,  3.00751262e-02,  5.82475186e-04,  1.25085495e-02,
       -2.24031590e-02,  9.70875823e-04, -1.11810045e-03,  2.10600653e-02,
        8.91573475e-04, -8.54959106e-03, -1.24027440e-03,  6.29243917e-02,
        2.56004188e-03,  1.09433518e-02, -9.84936011e-03, -8.69498320e-03,
       -6.83925747e-03, -5.10087728e-03, -3.11510573e-03, -5.10087728e-03,
        4.92104685e-03, -6.40227662e-02,  8.69229511e-02,  8.96182850e-02,
        2.46975574e-02, -7.10062567e-02,  2.41240088e-01, -4.84581790e-02,
       -4.62732645e-02, -1.08501033e-02,  1.70025114e-02, -1.24176462e-02,
        1.29727976e-02,  4.10291692e-03,  1.79867630e-02,  3.02779622e-02,
        7.45934509e-02])

In [29]:
yhat = np.dot(sm.add_constant(validation_x),betaRidge)
yhat1 = rr.predict(validation_x)
validationMSE = np.mean((yhat-validation_y)**2)

print("Optimal tuning parameter using cross validation: ", rr.alpha_)
print("Ridge Estimates using optimal tuning parameter : ", betaRidge)
print("Validation MSE: ", validationMSE)

Optimal tuning parameter using cross validation:  7.38900000000013
Ridge Estimates using optimal tuning parameter :  [-2.27087540e+01 -6.00063873e-05 -2.26483562e-03 -3.08061083e-04
 -1.97382041e-02 -1.97818692e-01  2.46253352e-01  4.14797049e-03
  7.29205791e-02  1.43347260e-06 -4.82103558e-03 -1.80514349e-04
 -1.51786130e-02 -6.08909604e-03  3.22795442e-02 -1.39077115e-02
 -7.81791916e-03  3.00751262e-02  5.82475186e-04  1.25085495e-02
 -2.24031590e-02  9.70875823e-04 -1.11810045e-03  2.10600653e-02
  8.91573475e-04 -8.54959106e-03 -1.24027440e-03  6.29243917e-02
  2.56004188e-03  1.09433518e-02 -9.84936011e-03 -8.69498320e-03
 -6.83925747e-03 -5.10087728e-03 -3.11510573e-03 -5.10087728e-03
  4.92104685e-03 -6.40227662e-02  8.69229511e-02  8.96182850e-02
  2.46975574e-02 -7.10062567e-02  2.41240088e-01 -4.84581790e-02
 -4.62732645e-02 -1.08501033e-02  1.70025114e-02 -1.24176462e-02
  1.29727976e-02  4.10291692e-03  1.79867630e-02  3.02779622e-02
  7.45934509e-02]
Validation MSE:  0.0

## Logit

In [37]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
model = LogisticRegression(penalty = 'l2',solver = '')      # L2 penalty = ridge ()
model.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
model = LogisticRegression(penalty = 'l1', solver='liblinear', C=1, fit_intercept=True)       #liblinear is good for l1 & l2
model.fit(train_x, train_yy)
n = len(test_x)

prob = model.predict_proba(test_x)

pred = model.predict(test_x)

pred = 1*(prob[:,1] > 0.5)

cm = confusion_matrix(test_y, model.predict(test_x))
frac_correct = (cm[0,0]+cm[1,1])/n

## Lasso - L1 regularization

In [39]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html

# L1 Error means that the Absolut Value will be added

# Beta can get zero

from sklearn import linear_model
from sklearn.linear_model = LassoCV

lasso_reg = linear_model.Lasso(alpha=2, fit_intercept=True)

lasso_reg.fit(train_x, train_y)
print("Lasso Estimates from package:", np.append(lasso_reg.intercept_,lasso_reg.coef_))

SyntaxError: invalid syntax (434462817.py, line 8)

In [53]:
lasso_reg.score(test_x, test_y)

0.31755083571261244

In [54]:
lasso_reg.score(train_x, train_y)

0.30283839040616223

## Ridge Regression - L2 regularization

In [38]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html

# L2 Error means that the error is squared

# Beta can get close to zero

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV

ridge_reg = Ridge(alpha=2, fit_intercept=True, solver='sparse_cg')
ridge_reg.fit(train_x, train_y)
print("Ridge Estimates from package: ",np.append(ridge_reg.intercept_,ridge_reg.coef_))

Ridge Estimates from package:  [ 6.65417578e+00  4.09991818e-05  4.57938996e-04  2.92713891e-04
 -3.40331239e-04 -3.26779635e-04  2.48816244e-06  1.29281540e-04
  4.83328822e-03  2.96999086e-04 -1.19045207e-03 -3.96969236e-04
 -1.94275796e-05 -6.19725449e-06  4.65064293e-06  9.99399500e-05
 -1.93281354e-05 -1.11858465e-04  7.41355756e-05  1.29701974e-04
  2.11671814e-05 -3.59669227e-06 -1.64416375e-04  1.79936697e-04
 -3.03728528e-06 -5.16837599e-05 -2.16050052e-04 -1.20282012e-04
  2.31019267e-06  2.91108384e-05  4.29039948e-04  1.18142959e-05
 -2.50274048e-04  1.25552673e-07  1.56842502e-06 -2.06016190e-05
  1.56842502e-06  9.12095199e-06 -4.71469958e-04  3.62480662e-04
  1.98178222e-05  3.83989068e-04  3.39373295e-04  1.99929762e-04
 -1.40364805e-03 -1.59462422e-05  1.00638139e-04  1.70171390e-05
 -1.27898107e-04  8.58328157e-05  5.75020624e-05  4.94914044e-05
  3.04398093e-04  2.58877190e-05]


In [36]:
ridge_reg.score(test_x, test_y)

0.3339311554743658

In [37]:
ridge_reg.score(train_x, train_y)

0.315655873352511

# Laurin