In [2]:
from pydataset import data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scaling import add_scaled_columns

In [3]:
swiss = data('swiss')
swiss = swiss.rename(columns={'Infant.Mortality': 'infant_mortality'})

In [4]:
swiss

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,infant_mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6
Porrentruy,76.1,35.3,9,7,90.57,26.6
Broye,83.8,70.2,16,7,92.85,23.6
Glane,92.4,67.8,14,8,97.16,24.9
Gruyere,82.4,53.3,12,7,97.67,21.0
Sarine,82.9,45.2,16,13,91.38,24.4


In [5]:
train_validate, test = train_test_split(swiss, test_size=.15, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)
train.shape, validate.shape, test.shape

((27, 6), (12, 6), (8, 6))

In [7]:
scaler = StandardScaler()

In [9]:
train.columns.tolist()

['Fertility',
 'Agriculture',
 'Examination',
 'Education',
 'Catholic',
 'infant_mortality']

In [11]:
train, validate, test = add_scaled_columns(train, validate, test, scaler, train.columns.tolist())

In [13]:
train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,infant_mortality,Fertility_scaled,Agriculture_scaled,Examination_scaled,Education_scaled,Catholic_scaled,infant_mortality_scaled
Le Locle,72.7,16.7,22,13,11.22,18.9,0.23702,-1.832964,0.896364,0.254152,-0.813038,-0.342152
Sierre,92.2,84.6,3,3,99.46,16.3,1.94415,1.52573,-1.686978,-1.016608,1.302727,-1.255423
Conthey,75.5,85.9,3,2,99.71,15.1,0.482147,1.590035,-1.686978,-1.143685,1.308721,-1.676933
Rive Droite,44.7,46.6,16,29,50.43,18.2,-2.214242,-0.353952,0.080572,2.287369,0.127116,-0.588032
Paysd'enhaut,72.0,63.5,6,3,2.56,18.0,0.175739,0.482012,-1.279082,-1.016608,-1.020682,-0.658284


In [28]:
X_train_scaled = train.drop(columns=['infant_mortality', 'Fertility', 'Agriculture', 'Examination', 'Education', 'Catholic'])
X_validate_scaled = validate.drop(columns=['infant_mortality', 'Fertility', 'Agriculture', 'Examination', 'Education', 'Catholic'])
X_test_scaled = test.drop(columns=['infant_mortality', 'Fertility', 'Agriculture', 'Examination', 'Education', 'Catholic'])
y_train = train.infant_mortality
y_validate = validate.infant_mortality
y_test = test.infant_mortality

In [17]:
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.preprocessing import PolynomialFeatures

### Create a baseline

In [22]:
baseline = np.mean(y_train)

In [26]:
from sklearn.metrics import mean_squared_error

baseline_rmse = mean_squared_error(y_train, np.full(27, baseline))**1/2

In [27]:
baseline_rmse

4.05244170096022

### Linear Regression

In [30]:
lm = LinearRegression(normalize=True)
lm.fit(X_train_scaled, y_train)

lm_pred = lm.predict(X_train_scaled)

In [36]:
lm_rmse = mean_squared_error(y_train, lm_pred)**1/2
lm_rmse

4.9669019958360006e-30

### Lasso Lars

In [33]:
lars = LassoLars(alpha=0.1)
lars.fit(X_train_scaled, y_train)

lars_pred = lars.predict(X_train_scaled)

In [35]:
lars_rmse = mean_squared_error(y_train, lars_pred)**1/2
lars_rmse

0.13500000000000012

### Polynomial Features

In [37]:
pf = PolynomialFeatures(degree=2)

X_train_squared = pf.fit_transform(X_train_scaled)
X_validate_squared = pf.transform(X_validate_scaled)
X_test_squared = pf.transform(X_test_scaled)

In [39]:
lm_squared = LinearRegression()
lm_squared.fit(X_train_squared, y_train)

lm_squared_pred = lm_squared.predict(X_train_squared)

In [42]:
lm_squared_rmse = mean_squared_error(y_train, lm_squared_pred)**1/2
lm_squared_rmse

2.6879704918641884e-30

### Valdiate models

Linear Regression:

In [44]:
lm_pred_v = lm.predict(X_validate_scaled)
lm_rmse_v = mean_squared_error(y_validate, lm_pred_v)**1/2
lm_rmse_v

3.1554436208840472e-30

Lasso Lars:

In [48]:
lars_pred_v = lars.predict(X_validate_scaled)
lars_rmse_v = mean_squared_error(y_validate, lars_pred_v)**1/2
lars_rmse_v

0.07159561036209086

Polynomial Features:

In [49]:
lm_squared_pred_v = lm_squared.predict(X_validate_squared)
lm_squared_rmse_v = mean_squared_error(y_validate, lm_squared_pred_v)
lm_squared_rmse_v

0.0012169332158244066

### Test Model

Will be using model 1 the linear regression model

In [50]:
lm_pred_test = lm.predict(X_test_scaled)
lm_rmse_test = mean_squared_error(y_test, lm_pred_test)**1/2

In [51]:
lm_rmse_test

9.663546088957395e-30