In [2]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
import statsmodels.api as sm
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import  train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold

In [3]:
#Import LGBM
import lightgbm as lgb

In [4]:
from xgboost import XGBRegressor
import xgboost as xgb

## Config

In [122]:
# Matplotlib Config
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 18}

plt.rc('font', **font)

# Pandas and numpy config
pd.set_option('display.float_format', lambda x: '%.3f' % x)
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Helper Functions

In [5]:
def generate_metrics(Y_test,Y_predicted):
    """
    Scores a set of predictions against the true target values.

    Parameters
    ----------
    Y_test:
        True target values, passed straight to the sklearn metric functions
    Y_predicted:
        Predicted target values, passed straight to the sklearn metric functions

    Returns
    -------
    tuple
        (mse, rmse, mae, r2) in that order, each rounded to 2 decimals

    """
    squared_error = mean_squared_error(Y_test, Y_predicted)
    raw_scores = (
        squared_error,
        # RMSE is the square root of the MSE computed above
        math.sqrt(squared_error),
        mean_absolute_error(Y_test, Y_predicted),
        r2_score(Y_test, Y_predicted),
    )
    return tuple(round(score, 2) for score in raw_scores)

In [6]:
def missing_values_table(input_df):
    """
    Returns the number and percentage of missing values for every column that
    has at least one missing value, and prints a one-line summary.

    Parameters
    ----------
    input_df: pd.DataFrame
        The dataframe whose missing data information is required

    Returns
    -------
    mis_val_table_ren_columns: pd.DataFrame
        One row per column with missing data (indexed by column name), with the
        columns 'Missing Values' and '% of Values Missing' (rounded to one
        decimal), sorted by percentage in descending order. Empty when no
        column has missing data, including when input_df has no rows.

    """
    # Total missing values (counted once and reused for the percentage)
    mis_val = input_df.isnull().sum()

    # Percentage of missing values. A dataframe without rows would give 0 / 0 =
    # NaN here, and NaN would then be reported as "missing data" for every
    # column, so that case is mapped to 0 % explicitly.
    n_rows = len(input_df)
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        mis_val_percent = mis_val.astype(float)

    # Make a table with the results, naming the columns directly
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent], axis=1,
        keys=['Missing Values', '% of Values Missing'])

    # Keep only the columns that have missing data, sorted by percentage descending
    has_missing = mis_val_table['Missing Values'] > 0
    mis_val_table_ren_columns = mis_val_table[has_missing].sort_values(
        '% of Values Missing', ascending=False).round(1)

    # Print some summary information
    print ("Your selected dataframe has " + str(input_df.shape[1]) + " columns.\n"
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

## Load Data

In [7]:
pip install gdown

In [8]:
!gdown https://drive.google.com/uc?id=1NGPdXsplAbbGf1Na7Kn3x9netwZXVyGw
!gdown https://drive.google.com/uc?id=1M_qVhIVMqgsCs5OS-bGCxIZB8jTPaeqr

In [9]:
# Read one CSV per city from the working directory
# (presumably the files fetched by the gdown cell -- confirm the file names).
raw_data_london, raw_data_sao_paulo = (
    pd.read_csv(csv_path)
    for csv_path in ('./london_2019_v3.csv', './sao_paulo_2019_v6.csv')
)

In [10]:
# Stack the two city frames row-wise. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each frame keeps its own row labels and the
# combined frame carries duplicated labels.
raw_data = pd.concat([raw_data_london, raw_data_sao_paulo], ignore_index=True)

In [11]:
# Preview the first rows of the combined London + Sao Paulo frame.
raw_data.head()

In [12]:
# Remove the 'Unnamed: 0' column read from the CSVs. errors='ignore' makes the
# cell safe to re-run: a second run no longer raises KeyError because the
# column is already gone.
raw_data = raw_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [16]:
# Tabulate the columns of raw_data that contain missing values.
# NOTE(review): the execution count (16) is higher than that of the fillna
# cells below (14, 15), so any saved output may reflect the imputed frame
# rather than the raw one -- confirm by re-running in order.
missing_values_table(raw_data)

## Exploratory Data Analysis

## Data Preparation

In [14]:
# Mean population density (start and end points) over the rows whose
# Country_Name is 'Brazil'; used below to impute missing densities.
brazil_rows = raw_data.loc[raw_data['Country_Name'] == 'Brazil']
sao_paulo_pop_mean = brazil_rows['population_density_start'].mean()
sao_paulo_pop_mean_end = brazil_rows['population_density_end'].mean()

In [15]:
# Replacement value for the missing entries of each column, applied in order.
# NOTE(review): the density columns are filled with the Brazil mean for every
# missing row, whatever its country -- confirm only Brazil rows are missing.
fill_values = {
    'School_Name_start': 0,
    'School_Name_end': 0,
    'hospital_address_start': 0,
    'hospital_address_end': 0,
    'population_density_start': sao_paulo_pop_mean,
    'population_density_end': sao_paulo_pop_mean_end,
    'start_area_code': 'missing',
    'area_names_start': 'missing',
    'end_area_code': 'missing',
    'area_names_end': 'missing',
}
for column_name, fill_value in fill_values.items():
    raw_data[column_name] = raw_data[column_name].fillna(fill_value)

In [17]:
## Column definitions
numerical_columns = ['month', 'day', 'hour', 'start_lat', 'start_lon', 'end_lat', 'end_lon',\
                   'TEMP_in_C', 'VISIB_in_miles', 'Total_precipitation_in_inches', 'Fog', 'Rain',\
                   'Snow', 'Hail', 'Thunder', 'Tornado', 'Quality_of_roads', 'Road_connectivity_index', 'Roads']
  
category_columns = ['start_area_code', 'end_area_code']


target_column = 'speed_mph_mean'

In [18]:
# Keep only the model inputs and the target, as an independent copy of raw_data
selected_columns = [*numerical_columns, *category_columns, target_column]
filtered_data = raw_data[selected_columns].copy()

In [19]:
# Replace the index inherited from raw_data with a fresh 0..n-1 index so that
# row labels and row positions coincide for the fold-based row selection
# further down.
filtered_data = filtered_data.reset_index(drop = True)

In [20]:
# Preview the first rows of the modelling frame (inputs + target).
filtered_data.head()

In [21]:
# Distribution of the target column
sns.histplot(data=filtered_data, x='speed_mph_mean')

In [22]:
# Cast every categorical input to pandas' 'category' dtype in a single call
one_hot_columns = category_columns
filtered_data = filtered_data.astype({col: 'category' for col in one_hot_columns})

## Split Test Train

In [24]:
# Integer-encode the two area-code columns. Each column gets its own fitted
# encoder, stored in area_code_encoders, so both label -> integer mappings
# stay available (e.g. for inverse_transform). Re-fitting one shared encoder
# discarded the start_area_code mapping as soon as end_area_code was encoded.
# NOTE: the columns are fitted independently, so the same area code can map to
# different integers in the two columns.
area_code_encoders = {}
for area_column in ['start_area_code', 'end_area_code']:
    # `labelencoder` keeps pointing at the most recently fitted encoder
    # (end_area_code after the loop), as before.
    labelencoder = LabelEncoder()
    filtered_data[area_column] = labelencoder.fit_transform(filtered_data[area_column])
    area_code_encoders[area_column] = labelencoder

In [25]:
# Separate the model inputs (X) from the target (Y). No train/test hold-out is
# created in this cell, despite what the previous description claimed.
# Y is kept as a one-column DataFrame.
X = filtered_data.drop(columns=[target_column])
Y = filtered_data[[target_column]]
print(X.shape, Y.shape)

In [26]:
# Disabled one-hot encoding experiment, kept for reference.
# NOTE(review): OneHotEncoder is not imported in the import cell visible in
# this notebook, so these lines need that import before being re-enabled.
# one_hot_pipeline = ColumnTransformer([('categorical', OneHotEncoder(handle_unknown='ignore'), one_hot_columns)], remainder='passthrough')
# encoder = one_hot_pipeline.fit(X)
# X = encoder.transform(X)
# Display the start_area_code column of the feature matrix.
X['start_area_code']

In [27]:
# Sanity check: dimensions of the inputs and the target, and the container type of X.
print(X.shape,Y.shape)
print(type(X))

## Modeling

## Model Evaluation

In [113]:
# 5-fold cross-validation of a Lasso model over a grid of C values.
# For every C, the held-out scores of the five folds are averaged and appended
# to MSE / RMSE / MAE / RSQUARE (one entry per value in Ci_range).
MSE=[]; RMSE=[]; MAE=[]; RSQUARE=[]
Ci_range = [0.1, 0.5, 1, 5, 10, 50]
for Ci in Ci_range:
    # The Lasso penalty is derived from C as alpha = 1 / (2C)
    alpha = 1/(2*Ci)
    model = Lasso(alpha=alpha)
    tempMSE=[]; tempRMSE=[]; tempMAE=[]; tempRSQUARE=[]
    # NOTE(review): KFold is used without shuffling, so the folds are
    # contiguous row blocks; if the rows are ordered by city, individual folds
    # are likely dominated by one city -- consider shuffle=True with a seed.
    kf = KFold(n_splits=5)
    for index, (train, test) in enumerate(kf.split(X)):
        print(index)
        # KFold yields row positions, so select rows positionally with .iloc
        # instead of relying on the index labels matching the positions.
        X_tr, Y_tr = X.iloc[train], Y.iloc[train]
        X_te, Y_te = X.iloc[test], Y.iloc[test]
        model.fit(X_tr, Y_tr)
        ypred = model.predict(X_te)
        # Compute the MSE once and reuse it for the RMSE
        fold_mse = mean_squared_error(Y_te, ypred)
        tempMSE.append(fold_mse)
        tempRMSE.append(math.sqrt(fold_mse))
        tempMAE.append(mean_absolute_error(Y_te, ypred))
        tempRSQUARE.append(r2_score(Y_te, ypred))
    MSE.append(np.array(tempMSE).mean())
    RMSE.append(np.array(tempRMSE).mean())
    MAE.append(np.array(tempMAE).mean())
    RSQUARE.append(np.array(tempRSQUARE).mean())
    print('C:',Ci)

### Train Scores

In [None]:
# Training-set scores for the same C grid and 5-fold splits: each model is
# scored on the rows it was fitted on.
# The results go into TRAIN_* lists (and fold_* temporaries) so that this cell
# no longer overwrites MSE / RMSE / MAE / RSQUARE and tempMSE, which the
# plotting and summary cells below read as the held-out scores.
TRAIN_MSE=[]; TRAIN_RMSE=[]; TRAIN_MAE=[]; TRAIN_RSQUARE=[]
Ci_range = [0.1, 0.5, 1, 5, 10, 50]
for Ci in Ci_range:
    # The Lasso penalty is derived from C as alpha = 1 / (2C)
    alpha = 1/(2*Ci)
    model = Lasso(alpha=alpha)
    fold_mse_scores=[]; fold_rmse_scores=[]; fold_mae_scores=[]; fold_r2_scores=[]
    kf = KFold(n_splits=5)
    for index, (train, test) in enumerate(kf.split(X)):
        print(index)
        # KFold yields row positions, so select rows positionally with .iloc
        X_tr, Y_tr = X.iloc[train], Y.iloc[train]
        model.fit(X_tr, Y_tr)
        ypred = model.predict(X_tr)
        # Compute the MSE once and reuse it for the RMSE
        fold_mse = mean_squared_error(Y_tr, ypred)
        fold_mse_scores.append(fold_mse)
        fold_rmse_scores.append(math.sqrt(fold_mse))
        fold_mae_scores.append(mean_absolute_error(Y_tr, ypred))
        fold_r2_scores.append(r2_score(Y_tr, ypred))
    TRAIN_MSE.append(np.array(fold_mse_scores).mean())
    TRAIN_RMSE.append(np.array(fold_rmse_scores).mean())
    TRAIN_MAE.append(np.array(fold_mae_scores).mean())
    TRAIN_RSQUARE.append(np.array(fold_r2_scores).mean())
    print('C:',Ci)

# Show the averaged training scores, one row per value of C
pd.DataFrame({'C': Ci_range, 'MSE': TRAIN_MSE, 'RMSE': TRAIN_RMSE,
              'MAE': TRAIN_MAE, 'R2': TRAIN_RSQUARE})

In [123]:
# Fold-averaged MSE for each value of C. The y-axis label now names the
# quantity actually plotted (MSE, not RMSE).
plt.plot(Ci_range, MSE)
# Fixed y-range chosen by hand; adjust if the scores fall outside it
plt.ylim([55, 57])
plt.xlabel('Value of C', fontsize = 18)
plt.ylabel('Mean square error', fontsize = 18)
plt.show()

In [56]:
# Sanity check: the fitted model must have one coefficient per feature column.
# The feature names come from X; filtered_data also contains the target
# column, which has no coefficient and made the list one entry too long.
features = X.columns.to_list()
importances = model.coef_
print(features)
print(len(importances))
print(len(X.columns.array))

In [126]:
# Display the per-fold MSE list left over from the most recently run
# cross-validation loop (it is reset for every value of C, so it holds the
# folds of the last C only).
tempMSE

In [124]:
# Horizontal bar chart of the non-zero coefficients of the most recently
# fitted `model`, labelled with the matching feature names.
features=X.columns.array
importances = model.coef_
# Positions of the coefficients that are exactly zero
popIndices = [position for position, coefficient in enumerate(importances)
              if coefficient == 0.0]
tunedImportances = np.delete(importances, popIndices)
tunedFeatures = np.delete(features, popIndices)
# Ascending order, so the largest coefficient is drawn at the top
indices = np.argsort(tunedImportances)

bar_positions = range(len(indices))
plt.title('Feature Importances', fontsize = 18)
plt.barh(bar_positions, tunedImportances[indices], color='b', align='center')
plt.yticks(bar_positions, [tunedFeatures[i] for i in indices])
plt.xlabel('Relative Importance', fontsize = 18)
plt.show()

In [None]:
# Unpack the four rounded metrics returned by generate_metrics.
# NOTE(review): neither `y_test` nor `predict` is assigned in any cell visible
# in this notebook, so this cell raises NameError on a fresh Restart & Run All
# unless they are defined elsewhere -- confirm or remove.
mse, rmse, mae, rsquare_score = generate_metrics(y_test, predict)

## Baseline Model

In [125]:
# Report each metric averaged over every value of C in the grid
for message, per_c_scores in (
    ("Mean square error is:{}", MSE),
    ("Root mean square error is :{}", RMSE),
    ("Mean absolute error is :{}", MAE),
    ("R Squared Score is :{}", RSQUARE),
):
    print(message.format(np.array(per_c_scores).mean()))