## DM Version - Real World Example
[Dataset](https://data.world/exercises/linear-regression-exercise-1)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df = pd.read_csv('cancer_reg.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
avganncount,1397,173,102,427,57
avgdeathsperyear,469,70,50,202,26
target_deathrate,164.9,161.3,174.7,194.8,144.4
incidencerate,489.8,411.6,349.7,430.4,350.1
medincome,61898,48127,49348,44243,49955
popest2015,260131,43269,21026,75882,10321
povertypercent,11.2,18.6,14.6,17.1,12.5
studypercap,499.748,23.1112,47.5602,342.637,0
binnedinc,"(61494.5, 125635]","(48021.6, 51046.4]","(48021.6, 51046.4]","(42724.4, 45201]","(48021.6, 51046.4]"
medianage,39.3,33,45,42.8,48.3


In [3]:
X = df[['povertypercent','medincome','medianage','medianagemale','pctemployed16_over','pctunemployed16_over','pctprivatecoverage','pctprivatecoveragealone']]
X.head()

Unnamed: 0,povertypercent,medincome,medianage,medianagemale,pctemployed16_over,pctunemployed16_over,pctprivatecoverage,pctprivatecoveragealone
0,11.2,61898,39.3,36.9,51.9,8.0,75.1,
1,18.6,48127,33.0,32.2,55.9,7.8,70.2,53.8
2,14.6,49348,45.0,44.0,45.9,7.0,63.7,43.5
3,17.1,44243,42.8,42.2,48.3,12.1,58.4,40.3
4,12.5,49955,48.3,47.8,48.2,4.8,61.6,43.9


In [4]:
y = df['avgdeathsperyear']

In [5]:
for col in X.columns:
    print((col, sum(X[col].isnull())))

('povertypercent', 0)
('medincome', 0)
('medianage', 0)
('medianagemale', 0)
('pctemployed16_over', 152)
('pctunemployed16_over', 0)
('pctprivatecoverage', 0)
('pctprivatecoveragealone', 609)


## Dealing With Null Values
[missingpy](https://pypi.org/project/missingpy/)

In [9]:
from missingpy import MissForest
imputer = MissForest(max_iter=10)
X_imputed = imputer.fit_transform(X)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5


In [10]:
col_names = ['povertypercent','medincome','medianage','medianagemale','pctemployed16_over','pctunemployed16_over','pctprivatecoverage','pctprivatecoveragealone']
x = pd.DataFrame(X_imputed, columns= col_names)


In [11]:
from sklearn import preprocessing
x = preprocessing.scale(x)

## Polynomial Regression

In [12]:
from sklearn.preprocessing import PolynomialFeatures
polynomial_features= PolynomialFeatures(degree=3,interaction_only=True, include_bias=True)
x_p = polynomial_features.fit_transform(x)

In [13]:
import statsmodels.api as sm
model = sm.OLS(y, x_p).fit()
ypred = model.predict(x_p) 

In [14]:
model.summary()

0,1,2,3
Dep. Variable:,avgdeathsperyear,R-squared:,0.267
Model:,OLS,Adj. R-squared:,0.245
Method:,Least Squares,F-statistic:,11.72
Date:,"Thu, 31 Oct 2019",Prob (F-statistic):,2.85e-139
Time:,12:14:46,Log-Likelihood:,-22810.0
No. Observations:,3047,AIC:,45810.0
Df Residuals:,2954,BIC:,46370.0
Df Model:,92,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,246.7044,15.874,15.541,0.000,215.579,277.830
x1,191.6524,40.840,4.693,0.000,111.575,271.729
x2,279.8076,35.710,7.835,0.000,209.788,349.827
x3,-23.3553,35.749,-0.653,0.514,-93.451,46.740
x4,80.4563,22.202,3.624,0.000,36.923,123.989
x5,118.6599,26.020,4.560,0.000,67.640,169.680
x6,149.3941,20.408,7.320,0.000,109.378,189.410
x7,-499.2287,63.839,-7.820,0.000,-624.402,-374.055
x8,494.1330,67.319,7.340,0.000,362.136,626.130

0,1,2,3
Omnibus:,5354.306,Durbin-Watson:,1.911
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9510876.602
Skew:,12.029,Prob(JB):,0.0
Kurtosis:,275.644,Cond. No.,1790.0


In [15]:
MSE=sum((ypred-y)**2)/len(y)
print('MSE = ', MSE)

MSE =  186111.568404456


## Regularization
[Statsmodel_Implementation](https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.fit_regularized.html)

In [16]:
import statsmodels.api as sm
model = sm.OLS(y,x).fit_regularized(method='elastic_net', alpha=1.0, L1_wt=0.8)
ypred = model.predict(x) 

In [17]:
regularized_regression_parameters = model.params
print(regularized_regression_parameters)

x1      0.000000
x2    106.763843
x3     -4.484105
x4    -34.783078
x5     30.042408
x6     74.301171
x7    -32.396902
x8     20.899197
dtype: float64


In [18]:
MSE=sum((ypred-y)**2)/len(y)
print('MSE = ', MSE)

MSE =  264688.52563264675
