# Step 4: Modeling

In [102]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, mean_absolute_error, confusion_matrix

from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')


In [103]:
# Import data 
datapath = "/Users/kbrewitt/Github/Predicting Longevity Using Urban Greenspace/clean_data/"

X_train = pd.read_csv(datapath + 'X_train.csv')
X_test = pd.read_csv(datapath + 'X_test.csv')
y_train = pd.read_csv(datapath + 'y_train.csv')
y_test = pd.read_csv(datapath + 'y_test.csv')
    

In [104]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,Childhood_Obesity,Incapacity_Benefit,Unemployment,Crime,Deliberate_Fires,GCSE_points,Unauthorised_School_Absence,Dependent_children,Public_Transport_Access,...,Borough_Richmond upon Thames,Borough_Southwark,Borough_Sutton,Borough_Tower Hamlets,Borough_Waltham Forest,Borough_Wandsworth,Borough_Westminster,Total_Greenspace_High,Total_Greenspace_Low,Total_Greenspace_Medium
0,0,-3.131879,-1.469351,-1.378691,-1.578345,-0.599698,1.584896,-1.576504,-1.716551,-0.349521,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,-0.470979,-1.156431,0.075746,0.673389,-0.524905,0.646249,0.212268,-0.147872,1.113057,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,-0.217785,1.441662,0.295841,-0.248006,-0.360325,-0.909599,0.604932,0.69196,-0.960857,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,3,0.629342,3.126469,0.841047,1.684427,-0.273891,-1.885972,-0.263692,2.240292,1.461732,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,0.666976,1.044254,1.301207,1.931331,3.776083,-1.313086,1.882319,0.871581,-0.5928,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [105]:
# Convert y data to array
y_train = np.array(y_train)
y_test = np.array(y_test)

In [106]:
y_train

array([[83.26749091],
       [79.18025867],
       [79.81147608],
       ...,
       [83.18958209],
       [82.56130231],
       [80.48830528]])

# Model 1: Linear regression 
Fit a simple linear regression model.

In [110]:
# Create the model and fit it to the training data
lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions on train and test data
y_pred_tr = lr.predict(X_train)
y_pred_te = lr.predict(X_test)


In [108]:
# Assess performance using r-squared
r2_score(y_train, y_pred_tr), r2_score(y_test, y_pred_te)

(0.6642260262064753, 0.6398037266436656)

In [109]:
# Assess performance using RMSE
np.sqrt(mean_squared_error(y_train, y_pred_tr)), np.sqrt(mean_squared_error(y_test, y_pred_te))


(1.3531829400790458, 1.3934508540039967)

## Hyperparameter tuning and cross-validation

In [111]:
# Make a pipeline 
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression

LRpipe = make_pipeline(
    SelectKBest(),
    LinearRegression()
)

In [117]:
#  Pass the pipe object and use 5-fold cross-validation to assess fit on training data
lr_default_cv_results = cross_validate(LRpipe, X_train, y_train, cv=5)

In [118]:
lr_cv_scores = lr_default_cv_results['test_score']
np.mean(lr_cv_scores), np.std(lr_cv_scores)

(0.11526312892289345, 0.011443765820333872)