In [1]:
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import warnings
warnings.filterwarnings('ignore')

import statsmodels.api as sm
from statsmodels.api import OLS

from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from pandas.plotting import scatter_matrix

import seaborn as sns

%matplotlib inline

Using statsmodels to retrieve the $\beta$ values

In [2]:
import statsmodels.api as sm

In [None]:
# Add a column of ones to the predictors so that the model estimates an intercept (beta_0).
X_train = sm.add_constant(x_train)

# Build the OLS model; statsmodels takes the response first, then the design matrix.
toyregr_sm = sm.OLS(y_train, X_train)

# Fit the model to the data and keep the fitted results in results_sm.
results_sm = toyregr_sm.fit()

# Retrieve the beta parameters by position.
# `params` may be a NumPy array or a label-indexed pandas Series depending on the
# input type; going through np.asarray makes positional access valid for both
# (integer-position `[]` on a Series is deprecated in pandas).
params_sm = np.asarray(results_sm.params)
beta0_sm = params_sm[0]
beta1_sm = params_sm[1]

# Print the regression coefficients.
print(f'The regression coefficients from statsmodels are: beta_0 = {beta0_sm} and beta_1 = {beta1_sm}')

# Print the full regression table for the other information stored in results_sm.
# (Warnings are already silenced once in the imports cell, so that is not repeated here.)
print(results_sm.summary())

In [None]:
# Multiple linear regression: use several x variables at once, so that their
# relationships with y come out of a single regression run and a single table.

# Predictors: the TimeMin, Distance and Fare columns of the training data.
X_train = train_data.loc[:, ['TimeMin', 'Distance', 'Fare']]

# Response: the PickupCount column of the training data.
y_train = train_data.loc[:, 'PickupCount']

# Then follow the same steps as above.

In [None]:
# Export the regression table (results_sm.summary()) to an HTML file.
# The encoding is given explicitly so the written file does not depend on the
# platform's default text encoding.
with open('results_sm_summary.html', 'w', encoding='utf-8') as html_file:
    html_file.write(results_sm.summary().as_html())

Using the sklearn package to retrieve the $\beta$ values

In [3]:
from sklearn import linear_model


In [None]:
# Build the model.
toyregr = linear_model.LinearRegression()

# Fit the model; fit() returns the fitted estimator, stored in results_skl.
results_skl = toyregr.fit(x_train, y_train)

# Retrieve the beta parameters from the fitted model.
# intercept_ is a scalar and coef_ is 1-D when y_train is 1-D, while both gain an
# extra leading dimension when y_train is 2-D. Flattening with np.ravel makes the
# lookup work for either shape instead of raising on the 1-D case.
beta0_skl = np.ravel(toyregr.intercept_)[0]
beta1_skl = np.ravel(toyregr.coef_)[0]

Linear regression model with sklearn

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into a training set and a test set.
#   test_size=0.2   -> 20% of the rows go to the test set, the remaining 80% to the training set.
#   random_state=42 -> controls the randomness of the split; any number works,
#                      but keep using the same one so the split is reproducible.
train_data, test_data = train_test_split(
    cab_df,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Training set: predictor as a single-column 2-D array, response as a 1-D array.
X_train = np.reshape(train_data['xcolumn'].values, (-1, 1))
y_train = train_data['ycolumn'].values

In [None]:
# Test set: predictor as a single-column 2-D array, response as a 1-D array.
X_test = np.reshape(test_data['xcolumn'].values, (-1, 1))
y_test = test_data['ycolumn'].values

In [None]:
# Function to plot a regression model's predictions against the test data.
def plot_cabs(cur_model, poly_transformer=None):
    """Plot a fitted model's prediction line together with the test data.

    Predictions are made on a grid of x values from 0 up to (but not
    including) 24 in steps of 0.1. The scatter points come from the
    notebook-level variables ``X_test`` and ``y_test``, which must already be
    defined when this function is called.

    Parameters
    ----------
    cur_model
        A fitted model exposing ``predict``.
    poly_transformer : optional
        An already-fitted feature transformer exposing ``transform`` (for
        example the ``PolynomialFeatures`` instance used to build the
        training features). When given, the grid is passed through it before
        prediction. It is only applied here, never refitted, so the caller's
        transformer is left unchanged.
    """
    # Build the x values for the prediction line: 0.0, 0.1, ..., 23.9 as one column.
    x_vals = np.arange(0, 24, .1).reshape(-1, 1)

    # Optionally map the grid through the passed-in transformer.
    # `is not None` avoids invoking a custom __ne__, and `transform` (rather than
    # `fit_transform`) keeps the fitted state the caller handed in.
    if poly_transformer is not None:
        dm = poly_transformer.transform(x_vals)
    else:
        dm = x_vals

    # Make the prediction at each x value.
    prediction = cur_model.predict(dm)

    # Plot the prediction line and the test data.
    plt.plot(x_vals, prediction, color='k', label="Prediction")
    plt.scatter(X_test, y_test, label="Test Data")

    # Label the plot.
    plt.ylabel("Number of Taxi Pickups")
    plt.xlabel("Time of Day (Hours Past Midnight)")
    plt.legend()
    plt.show()

from sklearn.linear_model import LinearRegression

# Train a LinearRegression model on the training data (X_train, y_train).
fitted_cab_model0 = LinearRegression()
fitted_cab_model0.fit(X_train, y_train)

# plot_cabs draws the model's predictions over its x grid as a line and the
# test data (X_test, y_test) as a scatter plot.
plot_cabs(fitted_cab_model0)


In [None]:
# R-squared of the linear model on the test set, computed from its predictions
# (the same value a regressor's .score(X_test, y_test) reports).
r2_score(y_test, fitted_cab_model0.predict(X_test))

Linear regression model: statsmodels

In [None]:
# Augment the predictors with a column of ones, as done above when using statsmodels.
# has_constant='add' always adds the intercept column; the default ('skip') would
# silently leave it out if a feature happened to be constant within one split,
# giving the training and test design matrices different numbers of columns.
train_data_augmented = sm.add_constant(X_train, has_constant='add')
test_data_augmented = sm.add_constant(X_test, has_constant='add')

# Fit the model on the training data.
# y_train / y_test come from the same split cells as X_train / X_test, so the
# response always matches the predictors instead of relying on a hard-coded column.
OLSModel = OLS(y_train, train_data_augmented).fit()

# Predict on the test set and compute the test R-squared.
ols_predicted_pickups_test = OLSModel.predict(test_data_augmented)
r2_score_test = r2_score(y_test, ols_predicted_pickups_test)
print(r2_score_test)

Polynomial regression models with sklearn (use these when the relationship between the dependent and independent variables is nonlinear, for example when the data follow a curve rather than a straight line)

In [None]:
# Degree 3 means terms up to x^3 are generated; with include_bias=False the
# constant column is left out, so a single predictor column becomes three
# columns in the output.
transformer_3 = PolynomialFeatures(degree=3, include_bias=False)

# Fit the transformer on the training predictors, then turn the original
# training features into polynomial features of the specified degree.
expanded_train = transformer_3.fit(X_train).transform(X_train)

# Summary statistics of the expanded training features.
pd.DataFrame(expanded_train).describe()

In [None]:
# Fit a LinearRegression model on the polynomial features generated from the training data.
fitted_cab_model3 = LinearRegression()
fitted_cab_model3.fit(expanded_train, y_train)

# Show the transformed training data (the polynomial features) the model was fitted on.
print("fitting expanded_train:", expanded_train)

# Plot the fitted model: the prediction line together with the test data.
plot_cabs(fitted_cab_model3, transformer_3)

In [None]:
# Calculate the R-squared value on the test data.
# The transformer was already fitted on the training data, so the test predictors
# are only transformed here; refitting on the test set would replace that fitted state.
expanded_test = transformer_3.transform(X_test)
print("Test R-squared:", fitted_cab_model3.score(expanded_test, y_test))
