In [2]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd

# Load the life-expectancy dataset.
# The first CSV column has no header and just counts rows (0, 1, 2, ...);
# presumably it is a row index written by an earlier export. Reading it as the
# index keeps it from appearing as a spurious "Unnamed: 0" data column.
df = pd.read_csv("life_expectancy.csv", index_col=0)
df.head()

Unnamed: 0,Life expectancy,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,Total expenditure,Population,Schooling
0,72.6,0,9.85,1137.6237,92,0,58.5,6.84,2141669.0,16.4
1,66.5,30,0.07,0.0,63,0,51.3,4.36,25627626.0,9.4
2,73.3,1,3.67,192.322631,98,0,45.4,4.8,2728777.0,11.1
3,59.8,1,0.45,20.868787,25,24,3.9,7.72,8942.0,4.5
4,76.0,32,0.01,429.662508,61,15,57.2,5.26,32776571.0,10.5


Multiple linear regression (several predictors, one target) fits a hyperplane of the form:

$$\hat{y} = \theta_0 + \theta_1x_1 + \theta_2x_2 + \dots + \theta_kx_k$$

In Matrix form:

$$\begin{bmatrix}
\hat{y}_1\\
\vdots\\
\hat{y}_n
\end{bmatrix} =
\begin{bmatrix}
1 & x_{11} & \cdots & x_{1k}\\
\vdots & \vdots & \ddots & \vdots\\
1 & x_{n1} & \cdots & x_{nk}
\end{bmatrix}
\begin{bmatrix}
\theta_0\\
\theta_1\\
\vdots\\
\theta_k
\end{bmatrix}$$

$$\hat{y} = X\theta$$

Predicting Life Expectancy from features:

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# Standardising puts the fitted coefficients on a comparable scale. It does not
# change ordinary-least-squares predictions, but scale-sensitive estimators
# (e.g. gradient-descent based ones) need it.
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Feature selection: four predictors and the target column.
x_data = df[["Schooling", "BMI", "Alcohol", "Population"]]
y_data = df["Life expectancy"]

# Hold out 30% of the rows for testing. The fixed random_state makes the split,
# and therefore the coefficients and MSE printed below, reproducible on re-run.
train_x, test_x, train_y, test_y = train_test_split(
    x_data, y_data, test_size=0.30, random_state=42
)

# Pipeline model: the scaler is fitted on the training rows only, and the same
# transform is re-applied to the test rows inside predict().
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("reg", LinearRegression())])
pipe.fit(train_x, train_y)

# Predict the held-out targets.
predicted_y = pipe.predict(test_x)

# Coefficients are per standardised feature, in the column order of x_data.
print(pipe.named_steps["reg"].coef_)
print("MSE: ", mean_squared_error(test_y, predicted_y))

[ 5.74717395  1.93414533 -0.03710352 -0.22334281]
MSE:  35.78052345561173


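Stochastic Gradient Descent is useful when dealing with large data sets: it updates the parameters from one sample (or a small batch) at a time instead of the whole data set, which reduces the computation per update dramatically.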

In [26]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline

# This cell reuses StandardScaler, mean_squared_error and the train/test split
# (train_x, test_x, train_y, test_y) created in the linear-regression cell.
# SGD shuffles the training data on every epoch, so the fit is seeded to make
# the reported MSE reproducible from run to run.
model = make_pipeline(StandardScaler(), SGDRegressor(random_state=42))
model.fit(train_x, train_y)
predicted_y = model.predict(test_x)

print("MSE: ", mean_squared_error(test_y, predicted_y))

MSE:  27.951664959000347
