In [1]:
# import all necessary packages
import pandas as pd
import numpy as np
import json
import datetime
import warnings
from pandas.core.common import SettingWithCopyWarning

#warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
#import seaborn as sns
#import matplotlib as mlp
import matplotlib.pyplot as plt
#from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.svm import SVR
import math
from sklearn import datasets
from sklearn.metrics import mean_squared_error

In [2]:
# Load the dataset from CSV and print a summary of its columns, non-null counts and dtypes
df_h_comar = pd.read_csv("data/datasets/df_h_comar.csv")
df_h_comar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227660 entries, 0 to 227659
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   date_start             227660 non-null  object 
 1   start_stamp            227660 non-null  int64  
 2   Pickup_Community_Area  227660 non-null  float64
 3   numOfTaxis_area        227660 non-null  int64  
 4   start_time_hourly      227660 non-null  int64  
 5   dayOfWeek              227660 non-null  object 
 6   isHoliday              227660 non-null  bool   
 7   isRushhour             227660 non-null  bool   
 8   start_time_day         227660 non-null  int64  
 9   start_time_week        227660 non-null  int64  
 10  start_time_month       227660 non-null  int64  
 11  pressure               227660 non-null  float64
 12  humidity               227660 non-null  float64
 13  wind_direction         227660 non-null  float64
 14  wind_speed             227660 non-nu

In [3]:
# NOTE: the result of drop() is only displayed as the cell output; it is not assigned,
# so df_h_comar itself still contains the 'date_start' column after this cell.
df_h_comar.drop(columns = ['date_start'])

Unnamed: 0,start_stamp,Pickup_Community_Area,numOfTaxis_area,start_time_hourly,dayOfWeek,isHoliday,isRushhour,start_time_day,start_time_week,start_time_month,pressure,humidity,wind_direction,wind_speed,description,season,temperature_celsius
0,0,1.0,4,0,Tuesday,True,False,1,1,1,1024.0,64.0,200.0,4.0,overcast clouds,winter,-0.19
1,0,2.0,1,0,Tuesday,True,False,1,1,1,1024.0,64.0,200.0,4.0,overcast clouds,winter,-0.19
2,0,3.0,43,0,Tuesday,True,False,1,1,1,1024.0,64.0,200.0,4.0,overcast clouds,winter,-0.19
3,0,4.0,13,0,Tuesday,True,False,1,1,1,1024.0,64.0,200.0,4.0,overcast clouds,winter,-0.19
4,0,5.0,17,0,Tuesday,True,False,1,1,1,1024.0,64.0,200.0,4.0,overcast clouds,winter,-0.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227655,23,59.0,1,23,Tuesday,False,False,31,1,12,1023.0,89.0,217.0,1.0,mist,winter,-11.00
227656,23,60.0,3,23,Tuesday,False,False,31,1,12,1023.0,89.0,217.0,1.0,mist,winter,-11.00
227657,23,61.0,1,23,Tuesday,False,False,31,1,12,1023.0,89.0,217.0,1.0,mist,winter,-11.00
227658,23,76.0,3,23,Tuesday,False,False,31,1,12,1023.0,89.0,217.0,1.0,mist,winter,-11.00


In [4]:
# define the categorical and numeric feature columns of df_h_comar

# Columns treated as categorical features.
categoric = [
    'start_stamp',
    'Pickup_Community_Area',
    'dayOfWeek',
    'start_time_month',
    'start_time_day',
    'start_time_week',
    'isHoliday',
    'description',
    'isRushhour',
    'season',
]

# Columns treated as numeric features.
numeric = [
    'temperature_celsius',
    'wind_speed',
    'wind_direction',
    'humidity',
    'pressure',
]


In [5]:
# function that builds a pipeline which scales the numeric features, one-hot encodes the categorical features and applies the model

def pipeline_for_prediction(categoric, numeric, model):
    """Assemble an unfitted preprocessing + estimator pipeline.

    Parameters
    ----------
    categoric
        Column selection handed to the one-hot-encoding branch
        (``OneHotEncoder(handle_unknown="ignore")``).
    numeric
        Column selection handed to the ``StandardScaler`` branch.
    model
        Estimator placed as the final pipeline step under the name ``"model"``.

    Returns
    -------
    Pipeline
        Two-step pipeline: ``"preprocessor"`` (a ``ColumnTransformer`` combining
        both branches) followed by ``"model"``.
    """
    scale_numeric = Pipeline(steps=[("standard_scaler", StandardScaler())])
    encode_categoric = Pipeline(
        steps=[("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))]
    )

    # Each entry is (name, transformer, columns it is applied to).
    column_steps = [
        ("numerical scaler", scale_numeric, numeric),
        ("one hot encoder", encode_categoric, categoric),
    ]

    return Pipeline(
        steps=[
            ("preprocessor", ColumnTransformer(transformers=column_steps)),
            ("model", model),
        ]
    )

In [6]:
# function that prints several evaluation scores for a model's predictions

def get_prediction_scores(y_true, y_predicted):
    """Print a set of regression scores for the given predictions.

    Printed values: MAE, MSE, RMSE, "Accuracy", R2 (in percent) and the
    maximum residual error.

    "Accuracy" here is ``(1 - MAE / mean(y_true)) * 100``, i.e. the MAE
    relative to the mean of the evaluated targets; it is not a
    classification accuracy.

    Parameters
    ----------
    y_true
        Observed target values.
    y_predicted
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The scores are only printed.
    """
    print("MODEL SCORES:")

    mae = metrics.mean_absolute_error(y_true, y_predicted)
    mse = metrics.mean_squared_error(y_true, y_predicted)  # reused for the RMSE

    print(f"MAE: {mae: .3f}")
    print(f"MSE: {mse: .3f}")
    print(f"RMSE: {math.sqrt(mse): .3f}")

    # Normalise by the mean of the targets being evaluated instead of reading a
    # notebook-level dataframe, so the function only depends on its arguments.
    target_mean = float(np.mean(y_true))
    if target_mean != 0:
        print("Accuracy:", round((1 - (mae / target_mean)) * 100, 2), "%")
    else:
        # A relative error is not defined when the mean target is zero.
        print("Accuracy: undefined (mean of y_true is 0)")

    print(f"R2: {100 * metrics.r2_score(y_true, y_predicted): .3f} %")
    print(f"Max Residual Error: {metrics.max_error(y_true, y_predicted): .3f}")

In [7]:
# function that creates the pipeline, fits it on the training data, predicts on the evaluation data and prints the scores

def pipeline_fit_predict(reg, categoric, numeric, x_train, y_train, x_val, y_val):
    """Build the pipeline for ``reg``, fit it, and print scores on the evaluation data.

    Parameters
    ----------
    reg
        Estimator used as the final pipeline step.
    categoric, numeric
        Column selections forwarded to ``pipeline_for_prediction``.
    x_train, y_train
        Data the pipeline is fitted on.
    x_val, y_val
        Data the fitted pipeline is evaluated on; scores are printed via
        ``get_prediction_scores``.

    Returns
    -------
    The fitted pipeline.
    """
    pipeline = pipeline_for_prediction(categoric, numeric, reg)
    pipeline.fit(x_train, y_train)
    y_predict = pipeline.predict(x_val)
    get_prediction_scores(y_val, y_predict)
    # Callers assign the result (``pipeline = pipeline_fit_predict(...)``);
    # without an explicit return that name would always be bound to None.
    return pipeline

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of all rows as the test set; the other 70% is used for training/validation.
x_train, x_test, y_train, y_test = train_test_split(
    df_h_comar.drop(columns='numOfTaxis_area'),
    df_h_comar['numOfTaxis_area'],
    test_size=0.3,
    random_state=42,
)

# Keep a handle on the combined training + validation portion (the full 70%).
x_train_val, y_train_val = x_train, y_train

# Split that 70% again: 0.2/0.7 of it is 20% of all rows (validation),
# leaving 50% for training, i.e. a 50-20-30 train/validation/test split.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val,
    y_train_val,
    test_size=0.2 / 0.7,
    random_state=42,
)

## Linear Regression (as Benchmark)

In [9]:
# Create the linear regression model (default settings) that serves as the benchmark

lin_reg = LinearRegression()

In [10]:
# Fit the benchmark on the combined training + validation data and print its scores on the held-out test set
pipeline = pipeline_fit_predict(lin_reg, categoric, numeric, x_train_val, y_train_val, x_test, y_test)

MODEL SCORES:
MAE:  30.758
MSE:  3904.656
RMSE:  62.487
Accuracy: 21.97 %
R2:  62.137 %
Max Residual Error:  1339.486


## Support Vector Machine

### Checking for the best kernel

#### Kernel: linear

In [11]:
# Support vector regressor with a linear kernel.
# - verbose: a truthy value turns on the libsvm solver output
# - cache_size: kernel cache size (in MB according to the scikit-learn docs)
# - max_iter: hard cap on solver iterations; the recorded output of the fit
#   reports "Solver reached max_iter", so the solver stopped at this cap
svr_lin = SVR(kernel = 'linear',verbose = 10,cache_size=15000, max_iter=50000)

In [12]:
# Fit the linear-kernel SVR on the combined training + validation data and print its scores on the held-out test set
pipeline = pipeline_fit_predict(svr_lin, categoric, numeric, x_train_val, y_train_val, x_test, y_test)

[LibSVM].................................................WARN: libsvm Solver reached max_iter
optimization finished, #iter = 50000
obj = -3363473.268517, rho = -11.201561
nSV = 95981, nBSV = 95028




MODEL SCORES:
MAE:  23.337
MSE:  4498.339
RMSE:  67.070
Accuracy: 40.8 %
R2:  56.381 %
Max Residual Error:  1446.159


#### Kernel: poly

In [13]:
#svr_poly = SVR(kernel = 'poly')

In [14]:
#pipeline = pipeline_fit_predict(svr_poly, categoric, numeric, x_train_val, y_train_val, x_test, y_test)

#### Kernel: rbf

In [15]:
#svr_rbf = SVR(kernel = 'rbf')