# Exploring Urban Data with ML
# Supervised Learning 1 - Regression Models



## Ordinary Least Squares (OLS) regression model

In [None]:
# Uncomment and run once if the `regressors` package (imported below as `stats`) is not installed
# !pip install regressors

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from math import sqrt
from regressors import stats

from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

__About data__<br>
Sklearn provides example datasets for exercise purposes. For more information, please check https://scikit-learn.org/stable/datasets/index.html <br>

__Boston Housing dataset__ (506 samples and 13 derived features)


* CRIM - per capita crime rate by town
* ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
* INDUS - proportion of non-retail business acres per town
* CHAS - Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
* NOX - nitric oxides concentration (parts per 10 million)
* RM - average number of rooms per dwelling
* AGE - proportion of owner-occupied units built prior to 1940
* DIS - weighted distances to five Boston employment centres
* RAD - index of accessibility to radial highways
* TAX - full-value property-tax rate per USD10,000
* PTRATIO - pupil-teacher ratio by town
* B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* LSTAT - percentage lower status of the population
* MEDV - Median value of owner-occupied homes in USD1000’s (*Our target variable*)



#### Load Boston housing dataset from sklearn

In [None]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where the
# import itself raises ImportError. Use the loader when it is available;
# otherwise rebuild the same object from the original data file, following the
# replacement recipe given in the scikit-learn removal notice.
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    # NOTE: this fallback downloads the raw file and therefore needs network access.
    # After a 22-line header, every sample is spread over two physical lines:
    # 11 values on the first line and the remaining 3 (B, LSTAT, MEDV) on the second.
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        # 11 values from the even rows + first 2 values from the odd rows = 13 features
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        # third value of every odd row is the target (MEDV)
        target=raw_df.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )
else:
    boston = load_boston()

# One row per sample: the 13 predictors followed by the target (MEDV) as the last column
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['MEDV'] = boston.target

print(df.shape)
df.head()

In [None]:
# Uncomment to display the raw dataset object loaded above
# boston

#### Split data into target variable (y) and predictors (X)
* target variable - y - dependent variable - label
* predictors - X - independent variables - explanatory variables

In [None]:
# Predictors: every column except the last one
X = df.iloc[:, :-1]

# Target: the last column of the frame
y = df.iloc[:, -1]

#### Split data in training and test data

In [None]:
# 75% of the rows go to training, the rest to testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=0,
)

# Row counts of the training and test partitions, one per line
print(len(X_train), len(X_test), sep="\n")

#### Building a linear regression model (OLS)

In [None]:
# Fit an OLS model on the training data.
# `lr` is only a variable name; you can choose any name for your model.
lr = LinearRegression().fit(X_train, y_train)

#### Predict values using OLS model and evaluate (training)
    predicted y value = YOUR_MODEL_NAME.predict(X_train)
    
    # Metrics
    MSE = mean_squared_error(y_train, y_pred_train)   # a function from sklearn.metrics, not a method of the model
    R2 = YOUR_MODEL_NAME.score(X_train, y_train)

In [None]:
# Predicted target values for the training predictors, using the fitted model
y_pred_train = lr.predict(X_train)

In [None]:
# R2 (coefficient of determination) of the fitted model on the training data
lr.score(X_train, y_train)

In [None]:
# MSE: mean squared error between the observed and the predicted training targets
mean_squared_error(y_train, y_pred_train)

#### Predict values and evaluate (test)

In [None]:
# The test set stands in for new (future) data points: if the model scores
# reasonably on it, the trained model can be used on unseen data.
y_pred_test = lr.predict(X_test)

# R2 on the test set, then MSE on the test set, one value per line
print(
    lr.score(X_test, y_test),
    mean_squared_error(y_test, y_pred_test),
    sep="\n",
)

### Conclusions:

#### Regression results: coefficients


    # The change in the value of dependent variable corresponding to the unit change in the independent variable.

    coefficients of predictors = YOUR_MODEL_NAME.coef_
    constant = YOUR_MODEL_NAME.intercept_
    p-values of predictors and intercept = stats.coef_pval(YOUR_MODEL_NAME, X_train, y_train)
    Summary table = stats.summary(YOUR_MODEL_NAME, X_train, y_train, list of predictors)    

In [None]:
# Predictor names: every column of `df` except the last one (the target)
print(df.columns[:-1].tolist())

# Fitted coefficient of each predictor
print(lr.coef_)

# p-values of the predictors and the intercept
print(stats.coef_pval(lr, X_train, y_train))

# Fitted constant term
print(lr.intercept_)

In [None]:
# Gather each predictor's coefficient and p-value into a single table
result_ols = pd.DataFrame(
    {
        'Features': df.columns[:-1].tolist(),
        'Coef': lr.coef_,
        # [1:] skips the leading entry (presumably the intercept's p-value)
        # so that the lengths match the predictors
        'p-value': stats.coef_pval(lr, X_train, y_train)[1:],
    }
)
result_ols.round(3)

In [None]:
# Summary table for the fitted model on the training data, labelled with the predictor names
stats.summary(lr, X_train, y_train, df.columns.tolist()[:-1])

#### Do more complex models with more predictors perform better?
__Let's use more features to predict housing prices in Boston__
* 506 samples, 104 predictors (artificial data)

In [None]:
# Replace `df` with the extended dataset read from a CSV file in the working directory
df = pd.read_csv('boston_data_extended.csv')

# (rows, columns) followed by a preview of the first two rows
print(df.shape)
df.head(n=2)

#### Try the same OLS process

In [None]:
# Predictors are all columns but the last; the last column is the target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# 75% of the rows for training, the rest for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=0,
)

# Training process: fit the OLS model on the training partition
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions for both partitions
y_pred_train, y_pred_test = lr.predict(X_train), lr.predict(X_test)

# R-squared of the model on the training and test sets
print("Training set score: %.2f" % lr.score(X_train, y_train))
print("Test set score: %.2f" % lr.score(X_test, y_test))

# Mean squared error on the training and test sets
print('Mean squared error (train set): %.2f' % mean_squared_error(y_train, y_pred_train))
print('Mean squared error (test set): %.2f' % mean_squared_error(y_test, y_pred_test))