In [1]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
import statsmodels.api as sm
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import  train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

In [2]:
#Import LGBM
import lightgbm as lgb

In [3]:
from xgboost import XGBRegressor
import xgboost as xgb

## Config

In [4]:
# Matplotlib Config
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Pandas and numpy config
pd.set_option('display.float_format', lambda x: '%.3f' % x)
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Helper Functions

In [5]:
def generate_metrics(Y_test,Y_predicted):
    """
    Computes regression error metrics for a set of predictions.

    Parameters
    ----------
    Y_test: array-like
        Observed target values
    Y_predicted: array-like
        Predicted target values, in the same order as Y_test

    Returns
    -------
    tuple
        (mse, rmse, mae, r2_score), each rounded to 2 decimal places

    """
    mse = mean_squared_error(Y_test, Y_predicted)
    # RMSE is derived from the MSE above instead of computing the MSE a second time
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(Y_test, Y_predicted)
    rsquare_score = r2_score(Y_test, Y_predicted)
    return round(mse,2), round(rmse,2), round(mae,2) , round(rsquare_score,2)

In [6]:
def missing_values_table(input_df):
    """
    Summarises missing data per column: the number and percentage of missing values,
    listing only the columns that have at least one missing value.

    Parameters
    ----------
    input_df: pd.DataFrame
        The dataframe whose missing data information is required

    Returns
    -------
    mis_val_table_ren_columns: pd.DataFrame
        One row per column with missing data (indexed by column name) holding
        'Missing Values' and '% of Values Missing' (rounded to 1 decimal),
        sorted by percentage in descending order

    """
    # Count missing values once and reuse the result for the percentage
    mis_val = input_df.isnull().sum()

    # A frame without rows has nothing missing. Skipping the division avoids 0/0 = NaN,
    # which would otherwise pass a "!= 0" filter and report every column as missing.
    row_count = len(input_df)
    if row_count:
        mis_val_percent = 100 * mis_val / row_count
    else:
        mis_val_percent = mis_val.astype(float)

    # Build the table directly with its final column names
    mis_val_table = pd.DataFrame({'Missing Values': mis_val,
                                  '% of Values Missing': mis_val_percent})

    # Keep only columns with missing data, sorted by percentage of missing descending
    has_missing = mis_val_table['Missing Values'] != 0
    mis_val_table_ren_columns = mis_val_table[has_missing].sort_values(
        '% of Values Missing', ascending=False).round(1)

    # Print some summary information
    print ("Your selected dataframe has " + str(input_df.shape[1]) + " columns.\n"
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

## Load Data

In [7]:
pip install gdown

In [8]:
!gdown https://drive.google.com/uc?id=1NGPdXsplAbbGf1Na7Kn3x9netwZXVyGw
!gdown https://drive.google.com/uc?id=1M_qVhIVMqgsCs5OS-bGCxIZB8jTPaeqr

In [14]:
# Load the London 2019 extract; the file is expected in the current working directory
raw_data = pd.read_csv('./london_2019_v3.csv')

In [15]:
# Load the São Paulo 2019 extract; the file is expected in the current working directory
raw_data_sao_paulo = pd.read_csv('./sao_paulo_2019_v6.csv')

In [11]:
# Discard the 'Unnamed: 0' column (presumably an index written out when the CSV was saved — confirm).
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
raw_data_sao_paulo = raw_data_sao_paulo.drop(columns=['Unnamed: 0'], errors='ignore')

In [13]:
# Discard the 'Unnamed: 0' column (presumably an index written out when the CSV was saved — confirm).
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
raw_data = raw_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Stack the London and São Paulo frames into one dataset.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of the two inputs would be kept and could repeat across cities.
frames = [raw_data, raw_data_sao_paulo]
main_data = pd.concat(frames, ignore_index=True)

In [None]:
# Preview the first rows of the combined dataset
main_data.head()

In [None]:
# NOTE(review): same preview as the preceding cell — one of the two can probably be removed
main_data.head()

In [None]:
# List the columns of the combined dataset that contain missing values, with counts and percentages
missing_values_table(main_data)

## Exploratory Data Analysis

In [None]:
# Number of distinct osm_way_id values in the combined dataset
main_data.osm_way_id.nunique()

## Data Preparation

In [None]:
# Show every available column before choosing the model features
main_data.columns

In [None]:
# Summary statistics of speed_mph_mean (used as the target column further down)
main_data.speed_mph_mean.describe()

In [None]:
# Average population density (start and end side) over the rows whose Country_Name is 'Brazil'
is_brazil = main_data['Country_Name'] == 'Brazil'
sao_paulo_pop_mean = main_data.loc[is_brazil, 'population_density_start'].mean()
sao_paulo_pop_mean_end = main_data.loc[is_brazil, 'population_density_end'].mean()

In [None]:
# Replacement value for missing entries, per column:
#   - school / hospital columns -> 0
#   - population densities      -> the Brazil means computed earlier
#   - area codes and names      -> the literal 'missing'
fill_values = {
    'School_Name_start': 0,
    'School_Name_end': 0,
    'hospital_address_start': 0,
    'hospital_address_end': 0,
    'population_density_start': sao_paulo_pop_mean,
    'population_density_end': sao_paulo_pop_mean_end,
    'start_area_code': 'missing',
    'area_names_start': 'missing',
    'end_area_code': 'missing',
    'area_names_end': 'missing',
}

for column_name, fill_value in fill_values.items():
    main_data[column_name] = main_data[column_name].fillna(fill_value)

In [None]:
## Column definitions
numerical_columns = ['day', 'hour', 'start_lat', 'start_lon', 'end_lat', 'end_lon',\
                   'TEMP_in_C', 'VISIB_in_miles', 'year', 'month', 'Country_Name', 'School_Name_start',\
                     'School_Name_end', 'hospital_address_start', 'hospital_address_end', 'population_density_start', 'population_density_end']
  
category_columns = ['start_area_code', 'end_area_code']


target_column = 'speed_mph_mean'

In [None]:
# Independent copy holding only the model features followed by the target column
selected_columns = [*numerical_columns, *category_columns, target_column]
filtered_data = main_data[selected_columns].copy()

In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old labels instead of keeping them as a column
filtered_data = filtered_data.reset_index(drop = True)

In [None]:
# Preview the selected feature and target columns
filtered_data.head()

In [None]:
# Distribution of the target variable
sns.histplot(data=filtered_data, x='speed_mph_mean')

In [None]:
# Shared encoder instance; each fit_transform call below refits it on the column it is given
labelencoder = LabelEncoder()


In [None]:
# Label-encode the start area code. Country_Name is encoded here as well: it holds text
# (it is compared with 'Brazil' when the population means are computed) and the tree
# models fitted later only accept numeric features.
# fit_transform refits the shared encoder on every call, so reusing it per column is safe.
for text_column in ['start_area_code', 'Country_Name']:
    filtered_data[text_column] = labelencoder.fit_transform(filtered_data[text_column])

In [None]:
# Label-encode the end area code. The encoder is refit on this column alone, so the integer
# assigned to an area here need not match the one it gets in start_area_code.
filtered_data['end_area_code'] = labelencoder.fit_transform(filtered_data['end_area_code'])

In [None]:
# Check dtypes and non-null counts after encoding
filtered_data.info()

In [None]:
# Preview the data after label encoding
filtered_data.head()

## Split Test Train

In [None]:
# Generating a 80%-20% split between train/test datasets

# Features: every column except the target
X = filtered_data.drop(columns=[target_column])
# Target as a 1-D Series; scikit-learn estimators warn when y is passed as a single-column frame
Y = filtered_data[target_column]
# random_state fixes the shuffle so the split (and every metric below) is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, shuffle = True, random_state = 42)

print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

## Modeling

## Model Evaluation

In [None]:
# Instantiate a seeded random forest only to list the hyperparameters it exposes and their defaults
rf = RandomForestRegressor(random_state = 42)
print('Parameters currently in use:\n')
pprint(rf.get_params())

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 40, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features}
pprint(random_grid)

In [None]:
# Base estimator is seeded so repeated searches build the same forests
rf = RandomForestRegressor(random_state = 42)
# Random search of parameters, using 3 fold cross validation,
# sampling 2 parameter combinations (n_iter) and running on 6 parallel workers (n_jobs)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 2, cv = 3, verbose=2, random_state=42, n_jobs = 6)
# Fit the random search model; np.ravel hands the target over as a 1-D array,
# which is the shape the regressor expects (works for a Series or a single-column frame)
rf_random.fit(X_train, np.ravel(y_train))

In [None]:
# Parameter combination that scored best in the randomized search
rf_random.best_params_

In [None]:
# NOTE(review): same dtype / non-null summary as shown in the data preparation section — probably removable
filtered_data.info()

In [None]:
# Fit one forest per candidate tree count and record its error on the test set.
# After the loop, regression_model / mae / rsquare_score refer to the LAST tree count only.
# NOTE(review): scoring on X_test while comparing tree counts means the test set is no
# longer a clean hold-out — consider a validation split.
rmse_list = []; mse_list = []
for estimator in n_estimators:
    # Seeded so differences between runs come from the tree count, not from random noise
    regression_model = RandomForestRegressor(n_estimators = estimator, random_state = 42)
    # np.ravel passes the target as 1-D (works for a Series or a single-column frame)
    regression_model.fit(X_train, np.ravel(y_train))
    model_predictions = regression_model.predict(X_test)
    mse, rmse, mae, rsquare_score = generate_metrics(y_test, model_predictions)
    rmse_list.append(rmse)
    mse_list.append(mse)

    
    

In [None]:
# Test RMSE for each tree count, in the same order as n_estimators
print(rmse_list)

In [None]:
# Test MSE for each tree count, in the same order as n_estimators
print(mse_list)

In [None]:
# Test MAE of the last model fitted in the sweep only (mae is overwritten on every iteration)
print(mae)

In [None]:
# Test R² of the last model fitted in the sweep only (rsquare_score is overwritten on every iteration)
print(rsquare_score)

In [None]:
# 'sans-serif' is a valid generic font family; 'normal' is not a family name and only
# makes matplotlib emit "font family not found" warnings before falling back to a default
font = {'family' : 'sans-serif',
        'size'   : 12}

plt.rc('font', **font)

# Test RMSE as a function of the number of trees
plt.plot(n_estimators, rmse_list)
plt.xlabel('Value of estimator', fontsize=18)
plt.ylabel('Root mean square error', fontsize=18)
plt.show()

In [None]:


# Importances of the most recently fitted forest; defined here so the cell runs on a
# fresh kernel instead of relying on a later cell having been executed first
feature_imp = regression_model.feature_importances_

for index, coefficient_value in enumerate(feature_imp):
    print('feature_index: %0d, weight: %.5f' % (index,coefficient_value))


In [None]:
# Feature names in the order they were selected into filtered_data (numerical first, then categorical)
all_columns = numerical_columns + category_columns

In [None]:
# Read the importances first, then derive the sort order from them
# (the order must be computed from the values that are actually plotted)
feature_imp = regression_model.feature_importances_
indices = np.argsort(feature_imp)

# Horizontal bars sorted ascending, so the most important feature ends up at the top
plt.title('Feature Importances')
plt.barh(range(len(indices)), feature_imp[indices], color='g', align='center')
plt.yticks(range(len(indices)), [all_columns[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

In [None]:
# Unsorted importances, one bar per feature index
bar_positions = list(range(len(feature_imp)))
plt.barh(bar_positions, feature_imp)
plt.show()

In [None]:
# Predict on the training rows with the last forest fitted in the sweep, to compare train and test error
model_predictions_train = regression_model.predict(X_train)

In [None]:
# Training-set metrics; this overwrites the test-set values held in mse / rmse / mae / rsquare_score
mse, rmse, mae, rsquare_score = generate_metrics(y_train, model_predictions_train)

In [None]:
# Report the metrics currently held in mse / rmse / mae / rsquare_score
print(f"Mean square error is:{mse}")
print(f"Root mean square error is :{rmse}")
print(f"Mean absolute error is :{mae}")
print(f"R Square Score is :{rsquare_score}")

## Baseline Model

In [None]:
# Naive baseline: predict one constant speed for every test row.
# The constant is the mean of the TRAINING target only, so no test-set information
# leaks into the baseline (np.ravel accepts a Series or a single-column frame).
baseline_prediction = float(np.mean(np.ravel(y_train)))
baseline_mse, baseline_rmse, baseline_mae, baseline_rsquare_score = generate_metrics(y_test,len(y_test)*[baseline_prediction])

In [None]:
# Report the baseline metrics for comparison with the model
print(f"Mean square error is:{baseline_mse}")
print(f"Root mean square error is :{baseline_rmse}")
print(f"Mean absolute error is :{baseline_mae}")
print(f"R Squared Score is :{baseline_rsquare_score}")