# Regression Model

In [1]:
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import warnings
warnings.filterwarnings('ignore')

import statsmodels.api as sm
from statsmodels.api import OLS

from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from pandas.plotting import scatter_matrix

import seaborn as sns

%matplotlib inline

Step 1: splitting the data

In [None]:
# Split the data into a training set and a test set: test_size=.2 holds out 20% of the
# rows for testing, and random_state=42 makes the shuffle reproducible across runs.
# NOTE(review): `.....` is an unfilled placeholder and is not valid Python. It must be
# replaced with the object to split (presumably the DataFrame that contains the columns
# selected below) before this cell can run.
train_data, test_data = train_test_split(....., test_size=.2, random_state=42)

# Predictor array (five columns) and response array ('Rent: average') taken from the
# training set; .values converts the pandas selection to a NumPy array.
x_train = train_data[['Area public green (ha)', 'Parking spots total', 'Mean disposable household income', 'Most vulnerable (%)', 'Safety index: High Impact Crime']].values
y_train = train_data['Rent: average'].values

# Same predictor columns and response taken from the test set. The column list is
# repeated verbatim from x_train and must stay identical (including order) so that
# both arrays line up column for column.
x_test = test_data[['Area public green (ha)', 'Parking spots total', 'Mean disposable household income', 'Most vulnerable (%)', 'Safety index: High Impact Crime']].values
y_test = test_data['Rent: average'].values






Step 2: Retrieving the $\beta$ coefficients and the results table using statsmodels

In [None]:
# Human-readable names for the summary table. They must be listed in the same
# order as the columns that were selected to build x_train.
predictor_names = [
    'Area public green (ha)',
    'Parking spots total',
    'Mean disposable household income',
    'Most vulnerable (%)',
    'Safety index: High Impact Crime',
]

# Add a column of ones so the model also estimates an intercept.
x_train_sm = sm.add_constant(x_train)

# Build the OLS model (response first, design matrix second).
model_sm = sm.OLS(y_train, x_train_sm)

# Fit the model; results_sm holds the coefficients and all fit statistics.
results_sm = model_sm.fit()

# x_train is a bare array, so without explicit names the summary would label the
# coefficients x1..x5. Only pass the names when their count matches the design
# matrix (e.g. add_constant may skip adding a column); otherwise fall back to
# the statsmodels default labels instead of raising.
coef_names = ['const'] + predictor_names
if len(coef_names) != x_train_sm.shape[1]:
    coef_names = None

# Silence warnings before printing the results table (`warnings` is imported in
# the notebook's first cell).
warnings.filterwarnings('ignore')
print(results_sm.summary(yname='Rent: average', xname=coef_names))

Step 3: Plotting the regression  

In [None]:


# Plot the prediction line of a fitted one-predictor model, optionally with observed data.
def plot_regression(cur_model, poly_transformer=None, x_data=None, y_data=None,
                    x_label="Predictor", y_label="Rent: average",
                    data_label="Test Data"):
    """Plot a single-predictor model's prediction line, optionally over observed points.

    Parameters
    ----------
    cur_model : object with a ``predict(X)`` method
        Fitted model. It is evaluated on an ``(m, 1)`` grid of x values, or on
        the transformed grid when ``poly_transformer`` is given.
    poly_transformer : object with a ``fit_transform(X)`` method, optional
        Feature transformer applied to the grid before prediction.
    x_data, y_data : array-like, optional
        Observations to overlay as a scatter plot. They must be given together.
        ``x_data`` must hold exactly one predictor (shape ``(n,)`` or ``(n, 1)``)
        and ``y_data`` must have ``n`` values. When given, the prediction grid
        spans ``[min(x_data), max(x_data)]``; otherwise the grid runs from 0 to
        24 in steps of 0.1.
    x_label, y_label : str, optional
        Axis labels.
    data_label : str, optional
        Legend entry for the scatter points.

    Raises
    ------
    ValueError
        If only one of ``x_data`` / ``y_data`` is given, if ``x_data`` is empty
        or has more than one column, or if the two lengths differ.
    """
    if (x_data is None) != (y_data is None):
        raise ValueError("x_data and y_data must be provided together")

    x_obs = y_obs = None
    if x_data is not None:
        x_obs = np.asarray(x_data)
        # Accept a single-column 2-D array and flatten it to 1-D.
        if x_obs.ndim == 2 and x_obs.shape[1] == 1:
            x_obs = x_obs[:, 0]
        if x_obs.ndim != 1:
            raise ValueError("x_data must contain exactly one predictor column")
        if x_obs.size == 0:
            raise ValueError("x_data must not be empty")
        y_obs = np.asarray(y_data).ravel()
        if y_obs.shape[0] != x_obs.shape[0]:
            raise ValueError("x_data and y_data must have the same number of rows")

    # Build the x values for the prediction line: over the observed range when
    # data is supplied, otherwise the numbers 0 to 24 in steps of 0.1.
    if x_obs is not None:
        x_vals = np.linspace(x_obs.min(), x_obs.max(), 200).reshape(-1, 1)
    else:
        x_vals = np.arange(0, 24, .1).reshape(-1, 1)

    # Optionally use the passed-in transformer. `is not None` avoids invoking a
    # possibly overloaded `!=` on the transformer object.
    if poly_transformer is not None:
        dm = poly_transformer.fit_transform(x_vals)
    else:
        dm = x_vals

    # Make the prediction at each x value.
    prediction = cur_model.predict(dm)

    # Plot the prediction line and, when supplied, the observed data.
    plt.plot(x_vals, prediction, color='k', label="Prediction")
    if x_obs is not None:
        plt.scatter(x_obs, y_obs, label=data_label)

    # Label the plot.
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.legend()
    plt.show()

from sklearn.linear_model import LinearRegression




In [None]:
def _as_column(frame, labels):
    """Return the selection ``frame[labels]`` as a NumPy array reshaped to (n, 1)."""
    return frame[labels].values.reshape(-1, 1)


# One single-column design matrix per predictor, plus the response as a column.
x_train_income = _as_column(train_data, ['Mean disposable household income'])
x_train_parking = _as_column(train_data, ['Parking spots total'])
x_train_green = _as_column(train_data, ['Area public green (ha)'])
x_train_vulnerable = _as_column(train_data, ['Most vulnerable (%)'])
x_train_safety = _as_column(train_data, ['Safety index: High Impact Crime'])
y_train_reshaped = _as_column(train_data, 'Rent: average')

# Fit a separate simple linear regression of rent on each predictor and plot it.
# plot_regression is called with the fitted model only.
fitted_model_income = LinearRegression()
fitted_model_income.fit(x_train_income, y_train_reshaped)
plot_regression(fitted_model_income)

fitted_model_parking = LinearRegression()
fitted_model_parking.fit(x_train_parking, y_train_reshaped)
plot_regression(fitted_model_parking)

fitted_model_green = LinearRegression()
fitted_model_green.fit(x_train_green, y_train_reshaped)
plot_regression(fitted_model_green)

fitted_model_vulnerable = LinearRegression()
fitted_model_vulnerable.fit(x_train_vulnerable, y_train_reshaped)
plot_regression(fitted_model_vulnerable)

fitted_model_safety = LinearRegression()
fitted_model_safety.fit(x_train_safety, y_train_reshaped)
plot_regression(fitted_model_safety)