In [1]:
# import all necessary packages
import pandas as pd
import numpy as np
import json
import datetime
import warnings
from pandas.core.common import SettingWithCopyWarning

#warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
#import seaborn as sns
#import matplotlib as mlp
import matplotlib.pyplot as plt
#from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import math
from sklearn import datasets
from sklearn.metrics import mean_squared_error

In [6]:
# Read the df_h_comar dataset and discard the 'start_time_day.1' column
# (presumably a duplicate of 'start_time_day' -- TODO confirm against the export).
df_h_comar = pd.read_csv("data/datasets/df_h_comar.csv").drop(
    columns=["start_time_day.1"]
)
# Print column names, non-null counts and dtypes as a quick sanity check.
df_h_comar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227660 entries, 0 to 227659
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   date_start             227660 non-null  object 
 1   start_stamp            227660 non-null  int64  
 2   Pickup_Community_Area  227660 non-null  float64
 3   numOfTaxis_area        227660 non-null  int64  
 4   start_time_hourly      227660 non-null  int64  
 5   dayOfWeek              227660 non-null  object 
 6   isHoliday              227660 non-null  bool   
 7   start_time_day         227660 non-null  int64  
 8   start_time_week        227660 non-null  int64  
 9   start_time_month       227660 non-null  int64  
 10  pressure               227660 non-null  float64
 11  humidity               227660 non-null  float64
 12  wind_direction         227660 non-null  float64
 13  wind_speed             227660 non-null  float64
 14  description            227660 non-nu

In [7]:
# Show the loaded frame; the notebook renders a truncated view (first and last rows).
df_h_comar

Unnamed: 0,date_start,start_stamp,Pickup_Community_Area,numOfTaxis_area,start_time_hourly,dayOfWeek,isHoliday,start_time_day,start_time_week,start_time_month,pressure,humidity,wind_direction,wind_speed,description,temperature_celsius
0,2013-01-01,0,1.0,4,0,Tuesday,True,1,1,1,1024.0,64.0,200.0,4.0,overcast clouds,-0.19
1,2013-01-01,0,2.0,1,0,Tuesday,True,1,1,1,1024.0,64.0,200.0,4.0,overcast clouds,-0.19
2,2013-01-01,0,3.0,43,0,Tuesday,True,1,1,1,1024.0,64.0,200.0,4.0,overcast clouds,-0.19
3,2013-01-01,0,4.0,13,0,Tuesday,True,1,1,1,1024.0,64.0,200.0,4.0,overcast clouds,-0.19
4,2013-01-01,0,5.0,17,0,Tuesday,True,1,1,1,1024.0,64.0,200.0,4.0,overcast clouds,-0.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227655,2013-12-31,23,59.0,1,23,Tuesday,False,31,1,12,1023.0,89.0,217.0,1.0,mist,-11.00
227656,2013-12-31,23,60.0,3,23,Tuesday,False,31,1,12,1023.0,89.0,217.0,1.0,mist,-11.00
227657,2013-12-31,23,61.0,1,23,Tuesday,False,31,1,12,1023.0,89.0,217.0,1.0,mist,-11.00
227658,2013-12-31,23,76.0,3,23,Tuesday,False,31,1,12,1023.0,89.0,217.0,1.0,mist,-11.00


In [18]:
# Feature columns of df_h_comar, grouped by how they are preprocessed.

# Columns handled as categorical features.
# Candidates that are not included: 'isRushhour', season.
categoric = [
    'start_stamp',
    'Pickup_Community_Area',
    'dayOfWeek',
    'start_time_month',
    'start_time_day',
    'start_time_week',
    'isHoliday',
    'description',
]

# Columns handled as numeric features.
numeric = [
    'temperature_celsius',
    'wind_speed',
    'wind_direction',
    'humidity',
    'pressure',
]


In [19]:
def pipeline_for_prediction(categoric, numeric, model):
    """Assemble an unfitted pipeline: column preprocessing followed by ``model``.

    Parameters
    ----------
    categoric : list
        Column names that are one-hot encoded. Categories not seen while
        fitting are ignored at transform time (``handle_unknown="ignore"``).
    numeric : list
        Column names that are standardised with ``StandardScaler``.
    model : estimator
        Final step of the pipeline; stored under the step name ``"model"``.

    Returns
    -------
    Pipeline
        Two steps, ``"preprocessor"`` (a ``ColumnTransformer``) and ``"model"``.
    """
    scale_numeric = Pipeline(steps=[("standard_scaler", StandardScaler())])
    encode_categoric = Pipeline(
        steps=[("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))]
    )

    # Each entry is (name, transformer, columns the transformer is applied to).
    column_steps = [
        ("numerical scaler", scale_numeric, numeric),
        ("one hot encoder", encode_categoric, categoric),
    ]

    return Pipeline(
        steps=[
            ("preprocessor", ColumnTransformer(transformers=column_steps)),
            ("model", model),
        ]
    )

In [20]:
def get_prediction_scores(y_true, y_predicted):
    """Print regression error metrics for a set of predictions.

    Printed values: MAE, MSE, RMSE, "Accuracy", R2 (as a percentage) and the
    maximum residual error. "Accuracy" is ``(1 - MAE / mean(y_true)) * 100``,
    i.e. the mean absolute error relative to the mean of the targets being
    scored; it is not a classification accuracy.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_predicted : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The scores are only printed.
    """
    print("MODEL SCORES:")

    mae = metrics.mean_absolute_error(y_true, y_predicted)
    mse = metrics.mean_squared_error(y_true, y_predicted)  # reused for the RMSE

    print(f"MAE: {mae: .3f}")
    print(f"MSE: {mse: .3f}")
    print(f"RMSE: {math.sqrt(mse): .3f}")

    # Normalise by the mean of the targets actually passed in, not by a
    # notebook-level DataFrame: the score then matches the evaluated sample
    # and the function works for any target vector.
    mean_true = float(np.mean(y_true))
    if mean_true != 0:
        print("Accuracy:", round((1 - (mae / mean_true)) * 100, 2), "%")
    else:
        # A relative error is undefined when the targets average to zero.
        print("Accuracy: n/a (mean of y_true is 0)")

    print(f"R2: {100 * metrics.r2_score(y_true, y_predicted): .3f} %")
    print(f"Max Residual Error: {metrics.max_error(y_true, y_predicted): .3f}")

In [21]:
def pipeline_fit_predict(reg, categoric, numeric, x_train, y_train, x_val, y_val):
    """Build the preprocessing+model pipeline, fit it, and print evaluation scores.

    Parameters
    ----------
    reg : estimator
        Regression model used as the final pipeline step.
    categoric, numeric : list
        Column names forwarded to ``pipeline_for_prediction``.
    x_train, y_train
        Data the pipeline is fitted on.
    x_val, y_val
        Data the fitted pipeline is evaluated on; the scores are printed by
        ``get_prediction_scores``.

    Returns
    -------
    The fitted pipeline, so that ``p = pipeline_fit_predict(...)`` yields a
    usable estimator instead of ``None``.
    """
    pipeline = pipeline_for_prediction(categoric, numeric, reg)
    pipeline.fit(x_train, y_train)
    y_predict = pipeline.predict(x_val)
    get_prediction_scores(y_val, y_predict)
    return pipeline

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the other 70% is kept for
# training and validation. The target column is 'numOfTaxis_area'.
x_train, x_test, y_train, y_test = train_test_split(
    df_h_comar.drop(columns='numOfTaxis_area'),
    df_h_comar['numOfTaxis_area'],
    test_size=0.3,
    random_state=42,
)

# Keep the combined training+validation portion under separate names
# before it is split again below.
x_train_val, y_train_val = x_train, y_train

# Take the validation set out of that 70%: a fraction of 0.2/0.7 of it is
# 20% of all rows, which gives a 50/20/30 train/validation/test split overall.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=(0.2 / 0.7),
    random_state=42,
)

In [23]:
# Linear regression estimator with default settings; it is passed to the
# pipeline helper as the model step.

lin_reg = LinearRegression()

In [24]:
# Fit on the combined training+validation data and print the scores
# obtained on the held-out test set.
pipeline = pipeline_fit_predict(
    reg=lin_reg,
    categoric=categoric,
    numeric=numeric,
    x_train=x_train_val,
    y_train=y_train_val,
    x_val=x_test,
    y_val=y_test,
)

MODEL SCORES:
MAE:  30.751
MSE:  3905.025
RMSE:  62.490
Accuracy: 21.99 %
R2:  62.134 %
Max Residual Error:  1339.461
