In [1]:
import datetime
import os
import random
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy
import statsmodels.api as sm
import pylab as py

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression
from mlxtend.evaluate import bootstrap

import math
import time
import datetime
from dython.nominal import associations
from dython.nominal import identify_nominal_columns
%config IPCompleter.greedy=True

In [2]:
data = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [3]:
cols = ['tripduration','dist','hour','birthyear','temp','snowdepth']

In [4]:
df = data[cols]

In [5]:
def split_dataset(df):
    np.random.seed(1342)
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [6]:
def model_evaluation(X_train,y_train):
    print(datetime.datetime.now())
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    print(datetime.datetime.now())
    return model_lr

In [7]:
def one_hot_encoding(X_train,X_test):
    categorical_features=identify_nominal_columns(X_train)
    print(categorical_features)
    one_hot_encoded_data_train = pd.get_dummies(X_train, columns = categorical_features)
    one_hot_encoded_data_test = pd.get_dummies(X_test, columns = categorical_features)
    return one_hot_encoded_data_train,one_hot_encoded_data_test

In [12]:
for col in  ['dist','hour','birthyear','temp','snowdepth']:
    mu = df[col].mean()
    sigma = df[col].std()
    mode = df[col].mode()
    print(f"The variable {col} has mean = {mu} and standard deviation ={sigma} and mode = {mode} ")
    

The variable dist has mean = 0.7531646295521108 and standard deviation =0.45860400131080553 and mode = 0    0.659048
Name: dist, dtype: float64 
The variable hour has mean = 13.529862126029933 and standard deviation =5.220618726405568 and mode = 0    8
Name: hour, dtype: int64 
The variable birthyear has mean = 1980.9073907929735 and standard deviation =9.636033258929617 and mode = 0    1988
Name: birthyear, dtype: int64 
The variable temp has mean = 17.56186702088147 and standard deviation =9.235627426482639 and mode = 0    23.8
Name: temp, dtype: float64 
The variable snowdepth has mean = 0.038611274026497486 and standard deviation =0.29514453988769984 and mode = 0    0.0
Name: snowdepth, dtype: float64 


In [18]:
X_train, X_test, y_train, y_test = split_dataset(df)
X_train, X_test = one_hot_encoding(X_train,X_test)
model_reg = model_evaluation(X_train,y_train)
print(model_reg.coef_)
y_pred = model_reg.predict(X_test)
print(y_pred)
print(model_reg.intercept_)

Train (252346, 5) (252346,)
Test (63087, 5) (63087,)
[]
2023-03-16 20:41:11.540023
2023-03-16 20:41:11.627618
[ 4.98536101  0.04298667 -0.01374404  0.0233046   0.22328528]
[8.18013093 4.32760288 8.79730231 ... 3.81187496 5.82428681 5.16156023]
28.49866835195106


In [14]:
mu_train = X_train.mean()
sigma_train = X_train.std()
mu_test = X_test.mean()
sigma_test = X_test.std()

In [20]:
model_metrics = {"MAE":mean_absolute_error(y_test, y_pred),"MSE":mean_squared_error(y_test, y_pred),
                              "RMSE":np.sqrt(mean_squared_error(y_test, y_pred)),"R2":r2_score(y_test, y_pred),
                              "Mu_train":mu_train, "sigma_train":sigma_train,
                              "Mu_test":mu_test, "sigma_test":sigma_test}

print("RMSE of mini model: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 of mini model: ", r2_score(y_test, y_pred))
print("Model intercept is: ",model_reg.intercept_)
print("Model coefficients: ", model_reg.coef_)

RMSE of mini model:  2.3096843898936936
R2 of mini model:  0.49555533444672484
Model intercept is:  28.49866835195106
Model coefficients:  [ 4.98536101  0.04298667 -0.01374404  0.0233046   0.22328528]


### Apply BootStarp on Linear Regression

In [13]:
X_train, X_test, y_train, y_test = split_dataset(df)
X_train, X_test = one_hot_encoding(X_train,X_test)
model_lr = LinearRegression(fit_intercept=True)
rng = np.random.RandomState(1234)
idx = np.arange(y_train.shape[0])
bootstarp_train_accuracies = {}
for i in range(501):
    train_idx = rng.choice(idx, size=idx.shape[0], replace=True)
    test_idx = np.setdiff1d(idx, train_idx, assume_unique=False)
    boot_train_X, boot_train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
    boot_test_X, boot_test_y = X_train.iloc[test_idx], y_train.iloc[test_idx]
    model_lr.fit(boot_test_X, boot_test_y )
    boot_y_pred = model_lr.predict(boot_test_X)
    bootstarp_train_accuracies[i] = {"RMSE": np.sqrt(mean_squared_error(boot_test_y, boot_y_pred)),
                                  "R2":r2_score(boot_test_y, boot_y_pred), "intercept":model_lr.intercept_}
    

Train (279603, 5) (279603,)
Test (69901, 5) (69901,)
[]


In [14]:
bootstarp_train_accuracies

{0: {'RMSE': 10.609447955835657,
  'R2': 0.05665279707884141,
  'intercept': 168.0442666915999},
 1: {'RMSE': 10.571917656142363,
  'R2': 0.05698042014929339,
  'intercept': 149.94242050672167},
 2: {'RMSE': 10.804955870330716,
  'R2': 0.05476157237422574,
  'intercept': 166.37932520210654},
 3: {'RMSE': 10.761387774214384,
  'R2': 0.05583427851959977,
  'intercept': 158.86644853689305},
 4: {'RMSE': 10.610296641802915,
  'R2': 0.05808422552542114,
  'intercept': 169.51104324690337},
 5: {'RMSE': 10.578537457580568,
  'R2': 0.05751736743356117,
  'intercept': 165.64762461729984},
 6: {'RMSE': 10.637310792676445,
  'R2': 0.05700800319367283,
  'intercept': 171.43160706784843},
 7: {'RMSE': 10.818689134644753,
  'R2': 0.056505487993159687,
  'intercept': 169.4462945013886},
 8: {'RMSE': 10.535521680335844,
  'R2': 0.05541598589839791,
  'intercept': 158.04637171522225},
 9: {'RMSE': 10.599163728191247,
  'R2': 0.0557576138211634,
  'intercept': 168.67405946472005},
 10: {'RMSE': 10.45027

In [15]:
R2 = []
RMSE = []
intercept = []
for i in range(501):
    R2.append(bootstarp_train_accuracies[i]['R2'])
    RMSE.append(bootstarp_train_accuracies[i]['RMSE'])
    intercept.append(bootstarp_train_accuracies[i]['intercept'])

In [16]:
ci_lower_r2 = np.percentile(R2, 2.5)
ci_upper_r2 = np.percentile(R2, 97.5)
print(ci_lower_r2, ci_upper_r2)

0.05345404015532218 0.060250927587117586


In [17]:
ci_lower_rmse = np.percentile(RMSE, 2.5)
ci_upper_rmse = np.percentile(RMSE, 97.5)
print(ci_lower_rmse, ci_upper_rmse)

10.432176817872705 10.84657018953866


In [18]:
ci_lower_intercept = np.percentile(intercept, 2.5)
ci_upper_intercept = np.percentile(intercept, 97.5)
print(ci_lower_intercept, ci_upper_intercept)

151.81510052682674 175.662293422301
