# Testing different ML models

In [21]:
import pandas as pd
import numpy as np
import joblib
from taxipred.utils.constants import DATA_PATH
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Read the cleaned taxi CSV located under the project's DATA_PATH.
df = pd.read_csv(DATA_PATH / "taxi_cleaned_incl_all_features.csv")

# Print column names, non-null counts and dtypes to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 574 entries, 0 to 573
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Trip_Distance_km           574 non-null    float64
 1   Passenger_Count            574 non-null    float64
 2   Base_Fare                  574 non-null    float64
 3   Per_Km_Rate                574 non-null    float64
 4   Per_Minute_Rate            574 non-null    float64
 5   Trip_Duration_Minutes      574 non-null    float64
 6   Trip_Price                 548 non-null    float64
 7   Time_of_Day_Afternoon      574 non-null    bool   
 8   Time_of_Day_Evening        574 non-null    bool   
 9   Time_of_Day_Morning        574 non-null    bool   
 10  Time_of_Day_Night          574 non-null    bool   
 11  Day_of_Week_Weekday        574 non-null    bool   
 12  Day_of_Week_Weekend        574 non-null    bool   
 13  Traffic_Conditions_High    574 non-null    bool   

In [None]:
# Keep the four predictor columns used by every model below, plus the target.
df_top_features = df.loc[
    :,
    [
        "Trip_Distance_km",
        "Per_Km_Rate",
        "Trip_Duration_Minutes",
        "Per_Minute_Rate",
        "Trip_Price",
    ],
]

df_top_features

Unnamed: 0,Trip_Distance_km,Per_Km_Rate,Trip_Duration_Minutes,Per_Minute_Rate,Trip_Price
0,19.35,0.80,53.82,0.32,36.2624
1,36.87,1.21,37.27,0.15,52.9032
2,8.64,1.71,89.33,0.48,60.2028
3,30.45,1.78,110.33,0.34,
4,41.79,1.77,86.95,0.11,88.1328
...,...,...,...,...,...
569,35.04,1.10,9.99,0.15,
570,14.34,1.01,45.07,0.29,30.7837
571,18.69,1.79,79.41,0.17,51.8548
572,5.49,0.62,58.39,0.49,34.4049


: 

### 0. Divide into X and y

In [None]:
# Rows with a missing Trip_Price have no label to learn from, so drop them first.
df_train = df_top_features.dropna(subset=["Trip_Price"])

# Features: every remaining column except the target. Target: Trip_Price.
X = df_train.drop(columns="Trip_Price")
y = df_train["Trip_Price"]

X.head()

Unnamed: 0,Trip_Distance_km,Per_Km_Rate,Trip_Duration_Minutes,Per_Minute_Rate
0,19.35,0.8,53.82,0.32
1,36.87,1.21,37.27,0.15
2,8.64,1.71,89.33,0.48
4,41.79,1.77,86.95,0.11
5,9.91,1.26,41.72,0.34


: 

In [29]:
# Summary of the target series: entry count, non-null count and dtype.
y.info()

<class 'pandas.core.series.Series'>
Index: 548 entries, 0 to 573
Series name: Trip_Price
Non-Null Count  Dtype  
--------------  -----  
548 non-null    float64
dtypes: float64(1)
memory usage: 8.6 KB


### 1. train|test split

In [30]:
# Hold out 33% of the labelled rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


## LinearRegression

### 2. Scaling

In [31]:
# Learn each feature's min/max from the training split only, then reuse the
# fitted scaler on the test split so the test data never influences the scaling.
minmax_scaler = MinMaxScaler()
minmax_scaled_X_train = minmax_scaler.fit_transform(X_train)
minmax_scaled_X_test = minmax_scaler.transform(X_test)

# Preview the first five scaled test rows.
minmax_scaled_X_test[:5]

array([[0.62927131, 0.19333333, 0.75590003, 0.20512821],
       [0.23610539, 0.86666667, 0.41330663, 0.15384615],
       [0.97221079, 0.77333333, 0.64460507, 0.74358974],
       [0.16941128, 0.48      , 0.27118349, 0.84615385],
       [0.17455743, 0.32666667, 0.87781939, 0.51282051]])

In [32]:
# Sanity check of the scaled ranges, in the order (train max, train min, test max, test min).
# The test maximum may exceed 1 because the scaler was fitted on the training split only.
(
    minmax_scaled_X_train.max(),
    minmax_scaled_X_train.min(),
    minmax_scaled_X_test.max(),
    minmax_scaled_X_test.min(),
)

(np.float64(1.0),
 np.float64(0.0),
 np.float64(1.0256410256410255),
 np.float64(0.0))

### 3. Training

In [33]:
# Fit ordinary least squares on the min-max-scaled training features
# (fit() returns the estimator itself, so it can be chained onto the constructor).
model = LinearRegression().fit(minmax_scaled_X_train, y_train)

# Learned intercept and one coefficient per feature column.
model.intercept_, model.coef_

(np.float64(-23.32863332534437),
 array([60.2728179 , 37.25179553, 33.73123557, 22.74887419]))

### 4. Prediction

In [34]:
# Predict prices for the min-max-scaled test features.
# NOTE: `model` is the LinearRegression here; the same name is rebound to the
# KNN model further down, so this cell must be run before the KNN training cell.
y_pred_linear = model.predict(minmax_scaled_X_test)
# Preview the first ten predictions.
y_pred_linear[:10]

array([ 51.96521284,  40.6281635 , 102.73646051,  33.15952531,
        40.6373766 ,  57.07875474,  42.96025581,  30.67163073,
        25.77844444,  39.35627812])

### 5. Evaluation

In [35]:
# Test-set error of the linear model: MAE, MSE, and RMSE (square root of the MSE).
mae_linear = mean_absolute_error(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)

mae_linear, mse_linear, rmse_linear

(5.3760202180893835, 48.52075007558026, np.float64(6.965683747887228))

### <mark> LinearRegression gave us RMSE = 6.97  </mark>

## RandomForestRegressor

### 3. Training

In [10]:
# Random forest with default hyperparameters; random_state fixes the seed so
# repeated runs build the same trees.
rf = RandomForestRegressor(random_state=42)
# Fitted on the raw (unscaled) training split.
rf.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""squared_error"", ""absolute_error"", ""friedman_mse"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in Poisson deviance to find splits. Training using ""absolute_error"" is significantly slower than when using ""squared_error"". .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 1.0  Poisson criterion.",'squared_error'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None or 1.0, then `max_features=n_features`. .. note::  The default of 1.0 is equivalent to bagged trees and more  randomness can be achieved by setting smaller values, e.g. 0.3. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to 1.0. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",1.0
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


### 4. Prediction

In [23]:
# Predict prices for the unscaled test features (matching how `rf` was fitted).
y_pred_rf = rf.predict(X_test)
# Preview the first ten predictions.
y_pred_rf[:10]

array([ 46.614647,  34.659868, 102.698734,  27.400895,  44.556148,
        52.005078,  47.593131,  35.427837,  30.92749 ,  38.986599])

### 5. Evaluation

In [28]:
# Test-set error of the random forest: MAE, MSE, and RMSE (square root of the MSE).
# The `_rf` suffix mirrors `*_linear` and `*_knn` in the other evaluation cells,
# so the three models' metrics can coexist in the kernel without overwriting each other.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

mae_rf, mse_rf, rmse_rf

(3.3459048895027563, 20.233447236762828, np.float64(4.498160428081999))

### <mark> RandomForestRegressor gave us RMSE = 4.50 </mark>

### Feature importance

In [26]:
# Importance scores of the fitted forest, one per feature column, in the same order as the training columns.
rf.feature_importances_

array([0.50377095, 0.26467851, 0.15092757, 0.08062296])

Feature importances sorted from most important to least important:

In [27]:
# Pair each feature name with its importance and rank them, highest first.
# The table is built from a dict so each column keeps its own dtype
# (`importance` stays float64). Building it from a list of rows and transposing
# would mix strings and floats and coerce both columns to object dtype.
feature_importance = pd.DataFrame(
    {
        "feature": list(X.columns),
        "importance": rf.feature_importances_,
    }
).sort_values(by="importance", ascending=False)
feature_importance

Unnamed: 0,feature,importance
0,Trip_Distance_km,0.503771
1,Per_Km_Rate,0.264679
2,Trip_Duration_Minutes,0.150928
3,Per_Minute_Rate,0.080623


## KNN

### 2. Scaling dataset

In [36]:
# Standardise features using statistics learned from the training split only,
# then apply the same fitted scaler to the test split.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Range check, in the order (train min, train max, test min, test max).
(
    scaled_X_train.min(),
    scaled_X_train.max(),
    scaled_X_test.min(),
    scaled_X_test.max(),
)

(np.float64(-1.7360722288089256),
 np.float64(1.8417507012296939),
 np.float64(-1.716131388651957),
 np.float64(1.8221214367001777))

### 3. Training

In [37]:
# K-nearest-neighbours regressor using the 5 closest training points.
# NOTE: this rebinds `model`, which held the LinearRegression earlier in the notebook.
model = KNeighborsRegressor(n_neighbors=5)
# Fitted on the standardised training features.
model.fit(scaled_X_train, y_train)

0,1,2
,"n_neighbors  n_neighbors: int, default=5 Number of neighbors to use by default for :meth:`kneighbors` queries.",5
,"weights  weights: {'uniform', 'distance'}, callable or None, default='uniform' Weight function used in prediction. Possible values: - 'uniform' : uniform weights. All points in each neighborhood  are weighted equally. - 'distance' : weight points by the inverse of their distance.  in this case, closer neighbors of a query point will have a  greater influence than neighbors which are further away. - [callable] : a user-defined function which accepts an  array of distances, and returns an array of the same shape  containing the weights. Uniform weights are used by default. See the following example for a demonstration of the impact of different weighting schemes on predictions: :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`.",'uniform'
,"algorithm  algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - 'auto' will attempt to decide the most appropriate algorithm  based on the values passed to :meth:`fit` method. Note: fitting on sparse input will override the setting of this parameter, using brute force.",'auto'
,"leaf_size  leaf_size: int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.",30
,"p  p: float, default=2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.",2
,"metric  metric: str, DistanceMetric object or callable, default='minkowski' Metric to use for distance computation. Default is ""minkowski"", which results in the standard Euclidean distance when p = 2. See the documentation of `scipy.spatial.distance `_ and the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric values. If metric is ""precomputed"", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only ""nonzero"" elements may be considered neighbors. If metric is a callable function, it takes two arrays representing 1D vectors as inputs and must return one value indicating the distance between those vectors. This works for Scipy's metrics, but is less efficient than passing the metric name as a string. If metric is a DistanceMetric object, it will be passed directly to the underlying computation routines.",'minkowski'
,"metric_params  metric_params: dict, default=None Additional keyword arguments for the metric function.",
,"n_jobs  n_jobs: int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. Doesn't affect :meth:`fit` method.",


### 4. Predicting

In [38]:
# Predict prices for the standardised test features with the KNN model.
y_pred_knn = model.predict(scaled_X_test)
# Preview the first ten predictions.
y_pred_knn[:10]

array([ 43.69698,  33.3364 , 108.82104,  30.82946,  46.91994,  56.39806,
        55.47476,  34.15338,  29.47202,  36.29126])

### 5. Evaluation

In [39]:
# Test-set error of the KNN model: MAE, MSE, and RMSE (square root of the MSE).
mae_knn = mean_absolute_error(y_test, y_pred_knn)
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)

mae_knn, mse_knn, rmse_knn

(4.0392148066298335, 26.194641149290604, np.float64(5.1180700609986385))

### <mark> KNN gave us RMSE = 5.12 </mark>

# Conclusion

- LinearRegression gave us RMSE = 6.97
- RandomForestRegressor gave us RMSE = 4.50
- KNN gave us RMSE = 5.12

RandomForestRegressor gave the best result (lowest RMSE on the test set), so it is the model we train on the full dataset below.


# Training model

In [40]:
# Refit the random forest on all labelled rows (train + test).
# NOTE: this refits the same `rf` object in place, so after this cell `rf` is no
# longer the model that was evaluated above. Re-running the prediction or
# feature-importance cells afterwards will give different (test-leaked) results.
rf.fit(X, y)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""squared_error"", ""absolute_error"", ""friedman_mse"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in Poisson deviance to find splits. Training using ""absolute_error"" is significantly slower than when using ""squared_error"". .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 1.0  Poisson criterion.",'squared_error'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None or 1.0, then `max_features=n_features`. .. note::  The default of 1.0 is equivalent to bagged trees and more  randomness can be achieved by setting smaller values, e.g. 0.3. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to 1.0. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",1.0
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [41]:
# Saving the model is currently disabled. Uncomment the line below to write `rf`
# to "taxi_price_prediction.joblib" in the working directory
# (xz compression at level 3, pickle protocol 5).
# joblib.dump(rf, "taxi_price_prediction.joblib", compress=("xz", 3), protocol=5)