<a href="https://colab.research.google.com/github/KaydeeJR/logistics-optimization-causal-inference/blob/Main/notebooks/gokada_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%cd /content/drive/MyDrive/Datasets/GokadaData/
driver_locations = "driver_locations_during_request.csv"
feat_eng_file = "gokada_feat_eng.csv"

/content/drive/MyDrive/Datasets/GokadaData


In [2]:
import pandas as pd
df_feat_eng = pd.read_csv(feat_eng_file)

## Missing values

In [3]:
df_feat_eng.isnull().sum() * 100 / len(df_feat_eng)

Unnamed: 0            0.000000
Trip ID               0.000000
Trip Origin           0.000000
Trip Destination      0.000000
Trip Start Time       0.696779
Trip End Time         0.000000
order_id              0.000000
driver_id             0.000000
driver_action         0.000000
lat                   0.000000
lng                   0.000000
created_at          100.000000
updated_at          100.000000
start_hour            0.696779
end_hour              0.000000
trip_distance_km      0.000000
trip_time             0.696779
Start Date            0.696779
rainy                 0.000000
holiday               0.000000
weekend               0.000000
dtype: float64

*Empty values resulted from trip start time*

In [4]:
col_to_drop = ['created_at','updated_at','Unnamed: 0', 'Start Date', 'holiday']
new_df = df_feat_eng.drop(col_to_drop, axis=1)

In [5]:
df_missing_val = new_df.dropna()

In [6]:
df_missing_val.isnull().sum()

Trip ID             0
Trip Origin         0
Trip Destination    0
Trip Start Time     0
Trip End Time       0
order_id            0
driver_id           0
driver_action       0
lat                 0
lng                 0
start_hour          0
end_hour            0
trip_distance_km    0
trip_time           0
rainy               0
weekend             0
dtype: int64

In [7]:
df_missing_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1546886 entries, 0 to 1557739
Data columns (total 16 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Trip ID           1546886 non-null  int64  
 1   Trip Origin       1546886 non-null  object 
 2   Trip Destination  1546886 non-null  object 
 3   Trip Start Time   1546886 non-null  object 
 4   Trip End Time     1546886 non-null  object 
 5   order_id          1546886 non-null  int64  
 6   driver_id         1546886 non-null  int64  
 7   driver_action     1546886 non-null  object 
 8   lat               1546886 non-null  float64
 9   lng               1546886 non-null  float64
 10  start_hour        1546886 non-null  float64
 11  end_hour          1546886 non-null  int64  
 12  trip_distance_km  1546886 non-null  float64
 13  trip_time         1546886 non-null  object 
 14  rainy             1546886 non-null  bool   
 15  weekend           1546886 non-null  bool   
dtype

In [8]:
df_missing_val.describe()

Unnamed: 0,Trip ID,order_id,driver_id,lat,lng,start_hour,end_hour,trip_distance_km
count,1546886.0,1546886.0,1546886.0,1546886.0,1546886.0,1546886.0,1546886.0,1546886.0
mean,1319561.0,419228.4,234153.0,6.536097,3.378828,13.45592,14.40968,11.82662
std,31956.11,14423.46,22704.16,0.05991502,0.05328545,3.30516,3.371022,8.410192
min,1259089.0,392001.0,121981.0,6.409333,3.076561,0.0,0.0,0.0
25%,1293251.0,406954.0,242997.0,6.498711,3.348846,11.0,12.0,5.03102
50%,1324634.0,421735.0,243589.0,6.54425,3.363504,14.0,15.0,10.14846
75%,1340384.0,429313.0,244056.0,6.593231,3.385136,16.0,17.0,17.07343
max,1570716.0,517948.0,247877.0,7.702536,8.515414,23.0,23.0,84.99244


## Outliers

In [9]:
!pip install seaborn
import seaborn as sns

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
numeric_cols = df_missing_val.select_dtypes(include=['float64','int64']) 
column_list = list(numeric_cols.columns)

In [11]:
!pip install sklearn
import sklearn
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1310 sha256=c8402b99cd8b26572e5c50a30380027bb6f89815778836b556fd0924ef99311f
  Stored in directory: /root/.cache/pip/wheels/46/ef/c3/157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [12]:
# Detecting and removing outliers using IQR
Q1 = df_missing_val[column_list].quantile(0.25)
Q3 = df_missing_val[column_list].quantile(0.75)
IQR = Q3 - Q1

# Return a boolean array of the rows with (any) non-outlier column values
condition = ~((df_missing_val[column_list]< (Q1 - 1.5 * IQR)) | (df_missing_val[column_list] > (Q3 + 1.5 * IQR))).any(axis=1)

# Filter our dataframe based on condition
clean_df = df_missing_val[condition]

In [13]:
clean_df.describe()

Unnamed: 0,Trip ID,order_id,driver_id,lat,lng,start_hour,end_hour,trip_distance_km
count,980747.0,980747.0,980747.0,980747.0,980747.0,980747.0,980747.0,980747.0
mean,1318879.0,418942.430456,243882.206206,6.54953,3.363414,13.476043,14.439124,11.651529
std,31500.89,14241.921532,758.286486,0.050183,0.023268,3.309323,3.357869,7.934069
min,1259098.0,392005.0,241760.0,6.409333,3.294447,5.0,5.0,0.0
25%,1292260.0,406510.0,243395.0,6.517622,3.347414,11.0,12.0,5.050384
50%,1324692.0,421766.0,243767.0,6.544484,3.363415,14.0,15.0,10.148461
75%,1339620.0,428892.0,244108.0,6.594491,3.375737,16.0,17.0,16.990991
max,1386520.0,448616.0,245644.0,6.728332,3.439562,23.0,23.0,35.128115


## One Hot Encoding

In [14]:
cat_columns = ['rainy','weekend']

encoded_df = pd.get_dummies(clean_df, columns = cat_columns)

In [15]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 980747 entries, 10 to 1557591
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Trip ID           980747 non-null  int64  
 1   Trip Origin       980747 non-null  object 
 2   Trip Destination  980747 non-null  object 
 3   Trip Start Time   980747 non-null  object 
 4   Trip End Time     980747 non-null  object 
 5   order_id          980747 non-null  int64  
 6   driver_id         980747 non-null  int64  
 7   driver_action     980747 non-null  object 
 8   lat               980747 non-null  float64
 9   lng               980747 non-null  float64
 10  start_hour        980747 non-null  float64
 11  end_hour          980747 non-null  int64  
 12  trip_distance_km  980747 non-null  float64
 13  trip_time         980747 non-null  object 
 14  rainy_False       980747 non-null  uint8  
 15  rainy_True        980747 non-null  uint8  
 16  weekend_False     

## Splitting dataset

In [None]:
!pip install sklearn
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size=0.20, shuffle=False)