In [1]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
import time
# Nominatim (OpenStreetMap) geocoder client; queried further down for the "India" place record.
geolocator = Nominatim(user_agent="OLABikes")

In [2]:
# Shell escape: list the files in the notebook's working directory.
!ls

1. Data Cleaning (Basic).ipynb
2. Data Analysis and Cleaning (Advance).ipynb
3. Data Prep.ipynb
4. Model_Training.ipynb
data_analysis_ride_request.html


#### Reading Data from previous checkpoint

In [3]:
# Load the gzip-compressed checkpoint produced by the previous cleaning stage.
df = pd.read_csv(
    filepath_or_buffer='../Data/data_checkpoint/preprocessed_1.csv',
    compression='gzip',
)

## Data Cleaning with Business Understanding

### There can be cases where a user requests a ride and the booking request is logged in our database, but the user then re-books the ride because of a long wait time, because the driver refused the booking, or because the user entered the wrong pickup or drop location by mistake. 

<hr>

### `Handle Case1 Rebooking Again to Same Location`: Keep only one request of same user to same pickup latitude longitude in 1hour time frame of first ride request.

<hr>

### `Handle Case2 Location entry mistake`: Keep only last request of user within 8 minutes of first booking request.
#### A person booking a ride would generally book a ride that would take 8mins of bike ride time. 
#### Also, calculate the distance b/w pickup and drop. Based on the distance and the request time difference, remove bad data entries.

#### `Handle Case2.1`: Pick Up and Drop Lat-Long Distance less than 50meters = 0.05 kms; No user would like to ride for just 50meters trip. 

<hr>

### `Handle Case3`: Booking Location Outside operation zone of OLABikes
#### Check lat-long bounding box coordinates

In [4]:
# Parse the request timestamp, then order the rows per user chronologically.
# reset_index() without drop keeps the pre-sort row labels in an 'index' column.
df['ts'] = pd.to_datetime(df['ts'])
df = df.sort_values(by=['number', 'ts']).reset_index()

In [5]:
# Epoch seconds of each booking request.
# Cast the datetime values to second resolution first and only then to int64: this yields
# whole seconds whatever resolution the column is stored in (ns, us, ms, ...). Dividing the
# raw int64 view by 10 ** 9 is only correct when the data is nanosecond-resolution.

df['booking_timestamp'] = df.ts.values.astype('datetime64[s]').astype(np.int64)

In [6]:
# Preview the first 50 rows (rows are ordered by user number, then request time).
df.head(50)

Unnamed: 0,index,ts,number,pick_lat,pick_lng,drop_lat,drop_lng,hour,mins,day,month,year,dayofweek,booking_timestamp
0,2374378,2020-10-10 07:34:16,-1,12.975773,77.57107,12.878468,77.44533,7,34,10,10,2020,5,1602315256
1,2405894,2020-10-11 08:23:42,-1,12.930813,77.60953,12.96032,77.58721,8,23,11,10,2020,6,1602404622
2,2405895,2020-10-11 08:23:50,-1,12.930813,77.60953,12.96032,77.58721,8,23,11,10,2020,6,1602404630
3,2405896,2020-10-11 08:23:51,-1,12.930813,77.60953,12.96032,77.58721,8,23,11,10,2020,6,1602404631
4,2405897,2020-10-11 08:23:54,-1,12.930813,77.60953,12.96032,77.58721,8,23,11,10,2020,6,1602404634
5,2405898,2020-10-11 08:23:56,-1,12.930813,77.60953,12.96032,77.58721,8,23,11,10,2020,6,1602404636
6,2406076,2020-10-11 11:57:17,-1,12.960213,77.58746,12.930824,77.60961,11,57,11,10,2020,6,1602417437
7,2406077,2020-10-11 11:57:31,-1,12.960213,77.58746,12.930824,77.60961,11,57,11,10,2020,6,1602417451
8,2500477,2020-10-16 17:51:07,-1,12.924353,77.54941,12.932216,77.581825,17,51,16,10,2020,4,1602870667
9,2500478,2020-10-16 17:51:25,-1,12.924353,77.54941,12.932216,77.581825,17,51,16,10,2020,4,1602870685


In [7]:
# Epoch seconds of the same user's previous request. The frame is sorted by
# ['number', 'ts'], so shift(1) inside each user group yields the preceding request.
# A user's first request has no predecessor and gets 0 as a sentinel.
# The filled result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas copy-on-write applies to a
# temporary object, leaving NaN in the frame.
df['shift_booking_ts'] = (
    df.groupby('number')['booking_timestamp']
      .shift(1)
      .fillna(0)
)

In [8]:
# Store the previous-request epoch seconds as 64-bit integers.
df['shift_booking_ts'] = df['shift_booking_ts'].astype(np.int64)

In [9]:
# Gap between each request and the same user's previous request, in whole hours and minutes.
# Floor division truncates: a gap of 1h59m is stored as 1 hour, 7m59s as 7 minutes.
# For a user's first request the previous timestamp is the 0 sentinel, so the gap is the
# epoch time itself (a very large value).
df['booking_time_diff_hr'] = df['booking_timestamp'].sub(df['shift_booking_ts']).floordiv(3600)
df['booking_time_diff_min'] = df['booking_timestamp'].sub(df['shift_booking_ts']).floordiv(60)

In [10]:
## Booking time difference in minutes, as {floored minutes: number of rows}
df['booking_time_diff_min'].value_counts().to_dict()

{0: 2801749,
 1: 799993,
 2: 307831,
 3: 167954,
 4: 109141,
 5: 77918,
 6: 60416,
 7: 48342,
 8: 40404,
 9: 34407,
 10: 29977,
 11: 26091,
 12: 23336,
 13: 20839,
 14: 18945,
 15: 16872,
 16: 15467,
 17: 13885,
 18: 12804,
 19: 11870,
 20: 10869,
 21: 10105,
 22: 9397,
 23: 8810,
 24: 8185,
 25: 7752,
 26: 7255,
 27: 6898,
 28: 6610,
 29: 6263,
 30: 6034,
 31: 5773,
 1439: 5725,
 1440: 5712,
 32: 5702,
 1438: 5489,
 1437: 5402,
 1441: 5391,
 33: 5349,
 1442: 5303,
 1443: 5178,
 34: 5094,
 1436: 5076,
 35: 4965,
 1435: 4962,
 36: 4907,
 1444: 4861,
 38: 4780,
 37: 4757,
 1434: 4724,
 1433: 4662,
 1445: 4581,
 39: 4542,
 40: 4464,
 41: 4416,
 1432: 4407,
 1446: 4401,
 42: 4292,
 43: 4213,
 1431: 4170,
 44: 4133,
 1447: 4123,
 1430: 4038,
 45: 4016,
 1448: 3993,
 47: 3989,
 46: 3977,
 48: 3931,
 50: 3872,
 1449: 3838,
 1429: 3836,
 51: 3817,
 49: 3807,
 53: 3693,
 52: 3659,
 1428: 3644,
 54: 3643,
 55: 3624,
 1427: 3611,
 56: 3577,
 59: 3563,
 1450: 3552,
 1426: 3521,
 63: 3449,
 57: 344

In [11]:
## Booking time difference in hours, as {floored hours: number of rows}
df['booking_time_diff_hr'].value_counts().to_dict()

{0: 4849856,
 1: 164914,
 23: 152962,
 24: 132454,
 2: 118199,
 9: 104854,
 3: 90789,
 10: 86794,
 8: 84454,
 14: 83715,
 13: 80005,
 4: 71379,
 11: 69332,
 12: 69267,
 7: 64610,
 15: 61892,
 5: 58059,
 22: 56709,
 6: 55287,
 16: 45075,
 21: 42289,
 47: 41525,
 25: 38067,
 48: 36781,
 20: 36559,
 17: 36124,
 19: 33701,
 18: 32710,
 71: 27538,
 72: 24658,
 26: 23932,
 46: 20902,
 27: 17593,
 45: 17142,
 49: 16426,
 38: 16319,
 95: 15844,
 37: 15186,
 39: 15048,
 44: 14897,
 96: 14258,
 43: 14004,
 28: 13903,
 40: 13586,
 70: 13131,
 36: 13019,
 42: 12926,
 41: 12516,
 50: 11945,
 35: 11789,
 29: 11770,
 34: 11479,
 33: 11356,
 119: 10863,
 62: 10769,
 73: 10731,
 32: 10583,
 30: 10279,
 69: 10150,
 31: 9997,
 120: 9641,
 63: 9561,
 61: 9401,
 51: 9313,
 68: 8957,
 94: 8381,
 143: 8335,
 167: 8252,
 64: 8089,
 60: 7865,
 67: 7802,
 52: 7755,
 144: 7645,
 74: 7590,
 168: 7571,
 66: 7370,
 65: 7293,
 97: 7220,
 93: 6805,
 59: 6594,
 53: 6488,
 58: 6294,
 118: 6171,
 75: 6029,
 57: 5957,
 5

### We observe that 8315382 - 4335828 = 39,79,554 bookings happen in less than 1 hour of a previous request by the same user (this is the number of rows removed by the Case 1 filter below)

In [12]:
# Row count before the Case 1 filter is applied.
len(df)

8315382

In [13]:
### Handling Case 1: Re-booking Again to Same Location within 1hour by same user
# A row is dropped when BOTH conditions hold:
#   * its (number, pick_lat, pick_lng) combination occurs more than once in the frame, and
#   * its gap to the same user's previous request is <= 1 in floored hours.
# NOTE(review): the hour gap is floor-divided, so "<= 1" admits gaps up to 1h59m59s rather
# than strictly one hour -- confirm this is intended.
# NOTE(review): duplicated(keep=False) looks at the whole frame, so the matching pickup may
# belong to a request made at any other time, not necessarily the previous one.

df = df.loc[
    ~(
        df.duplicated(subset=['number', 'pick_lat', 'pick_lng'], keep=False)
        & df['booking_time_diff_hr'].le(1)
    )
]

In [14]:
## Row count after removing the Case 1 rows
len(df)

4335828

###  Removed 3979554 rows in `Case1`; we now have 4335828 rows

In [15]:
# Checkpoint after Case 1. The file is gzip-compressed even though its name ends in .csv.
df.to_csv(
    path_or_buf='./../Data/data_checkpoint/preprocessed_2.csv',
    index=False,
    compression='gzip',
)

### Handling Case2: One user books rides at different lat-long within 8mins time (ride time + driver arrival time)
#### Fraud User
#### Human error booking

In [16]:
# Reload the Case 1 checkpoint (gzip-compressed CSV).
df = pd.read_csv(
    filepath_or_buffer='./../Data/data_checkpoint/preprocessed_2.csv',
    compression='gzip',
)

In [17]:
# Count, then drop, requests made less than 8 (floored) minutes after the same user's
# previous request.
# NOTE(review): this removes the later request of such a pair and keeps the earlier one --
# confirm against the Case 2 description, which talks about keeping the last request.
print("Number of rides booked by same customer within 8mins time: {}".format(int(df['booking_time_diff_min'].lt(8).sum())))
df = df.loc[df['booking_time_diff_min'].ge(8)]

Number of rides booked by same customer within 8mins time: 611891


### Assuming the earth is an ellipsoid, calculate the geodesic distance b/w pickup and drop latitude and longitude

The geodesic distance is the shortest distance on the surface of an ellipsoidal model of the earth. The default algorithm uses the method given by [Karney (2013)](https://link.springer.com/article/10.1007/s00190-012-0578-z) (geodesic); this is accurate to round-off and always converges.

`Check Learning Resources Folder in Documentation Folder`

In [18]:
%%time
def geodestic_distance(pick_lat, pick_lng, drop_lat, drop_lng):
    """Return the geodesic distance between a pickup and a drop point in kilometres.

    Each point is given as a latitude/longitude pair; geopy expects (latitude, longitude)
    order. The result is rounded to 2 decimal places.
    """
    # Read kilometres straight from geopy instead of converting miles with a truncated
    # factor (1.60934), which slightly under-reports long distances.
    return round(geodesic((pick_lat, pick_lng), (drop_lat, drop_lng)).km, 2)

# Apply the function element-wise over the four coordinate columns. otypes is given so that
# np.vectorize needs no trial call to infer the output type and also accepts an empty frame.
df['geodesic_distance'] = np.vectorize(geodestic_distance, otypes=[float])(df['pick_lat'],df['pick_lng'],df['drop_lat'],df['drop_lng'])

CPU times: user 22min 31s, sys: 3.05 s, total: 22min 34s
Wall time: 22min 36s


##### Number of rides booked by same customer within 8mins time: 611891

In [19]:
# Frequency of each distance value among requests of at most 0.5 km.
df.loc[df['geodesic_distance'].le(0.5), 'geodesic_distance'].value_counts()

0.00    6619
0.01    2492
0.02    2011
0.03    1371
0.50    1210
0.04    1090
0.47    1076
0.49    1067
0.48    1064
0.46    1056
0.44     895
0.45     881
0.05     877
0.43     845
0.41     764
0.39     758
0.06     752
0.42     748
0.40     746
0.38     682
0.37     675
0.07     655
0.36     641
0.33     594
0.08     592
0.34     585
0.35     582
0.09     549
0.32     542
0.31     530
0.10     498
0.11     492
0.30     490
0.29     476
0.12     457
0.28     447
0.26     439
0.27     432
0.14     410
0.17     408
0.21     404
0.20     397
0.24     391
0.22     390
0.13     384
0.18     369
0.23     368
0.15     364
0.19     355
0.25     342
0.16     339
Name: geodesic_distance, dtype: int64

### Handle Case 2.1: Removing ride requests of 0.05 km (= 50 meters) or less

In [20]:
# Requests whose pickup-to-drop distance is at most 0.05 km (the threshold itself is included).
print("Number of Rides Requests less than 50meters: {}".format(int(df['geodesic_distance'].le(0.05).sum())))

Number of Rides Requests less than 50meters: 14460


In [21]:
# Keep only requests whose pickup-to-drop distance exceeds 0.05 km, then display the frame.
df = df.loc[df['geodesic_distance'].gt(0.05)]
df

Unnamed: 0,index,ts,number,pick_lat,pick_lng,drop_lat,drop_lng,hour,mins,day,month,year,dayofweek,booking_timestamp,shift_booking_ts,booking_time_diff_hr,booking_time_diff_min,geodesic_distance
0,2374378,2020-10-10 07:34:16,-1,12.975773,77.571070,12.878468,77.445330,7,34,10,10,2020,5,1602315256,0,445087,26705254,17.38
1,2405894,2020-10-11 08:23:42,-1,12.930813,77.609530,12.960320,77.587210,8,23,11,10,2020,6,1602404622,1602315256,24,1489,4.06
2,2406076,2020-10-11 11:57:17,-1,12.960213,77.587460,12.930824,77.609610,11,57,11,10,2020,6,1602417437,1602404636,3,213,4.04
3,2500477,2020-10-16 17:51:07,-1,12.924353,77.549410,12.932216,77.581825,17,51,16,10,2020,4,1602870667,1602417451,125,7553,3.62
4,2694503,2020-10-30 09:00:44,-1,12.945731,77.622500,12.973030,77.616840,9,0,30,10,2020,4,1604048444,1602870685,327,19629,3.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335823,5768115,2021-02-12 19:37:11,99999,13.029848,77.593400,13.063751,77.589850,19,37,12,2,2021,4,1613158631,1613068718,24,1498,3.77
4335824,6102760,2021-02-19 20:43:25,99999,13.029296,77.592580,12.927923,77.627106,20,43,19,2,2021,4,1613767405,1613158631,169,10146,11.82
4335825,6137206,2021-02-20 17:34:45,99999,12.907576,77.600685,12.925874,77.607620,17,34,20,2,2021,5,1613842485,1613767405,20,1251,2.16
4335826,6555089,2021-02-27 08:26:23,99999,12.956665,77.521870,12.948099,77.562990,8,26,27,2,2021,5,1614414383,1613842485,158,9531,4.56


In [22]:
# Row count after the Case 2 and Case 2.1 filters.
len(df)

3709477

In [23]:
# Checkpoint after Case 2 / 2.1. The file is gzip-compressed even though its name ends in .csv.
df.to_csv(
    path_or_buf='./../Data/data_checkpoint/preprocessed_3.csv',
    index=False,
    compression='gzip',
)

### Handle Case3: Rides request in non-operational regions
OLA Bikes OPERATION CITY (Bangalore)

### Ride requests due to some bug or crash in app.
<hr>

#### India: 'boundingbox': ['6.2325274', '35.6745457', '68.1113787', '97.395561']
#### Bangalore:'boundingbox': ['12.8340125', '13.1436649', '77.4601025', '77.7840515']
#### Karnataka: 'boundingbox': ['11.5945587', '18.4767308', '74.0543908', '78.588083']
Source: openstreetmap

In [24]:
# Reload the Case 2 checkpoint (gzip-compressed CSV).
df = pd.read_csv(
    filepath_or_buffer='./../Data/data_checkpoint/preprocessed_3.csv',
    compression='gzip',
)
# Look up "India" on Nominatim and show the raw record, which includes a 'boundingbox'.
# NOTE(review): this is a live lookup; the filters further down use hard-coded bounds,
# so the values returned here may not match them exactly.
location = geolocator.geocode("India")
location.raw

{'place_id': 310001725,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'relation',
 'osm_id': 304716,
 'boundingbox': ['6.2325274', '35.5232328', '68.1113787', '97.395561'],
 'lat': '22.3511148',
 'lon': '78.6677428',
 'display_name': 'India',
 'class': 'boundary',
 'type': 'administrative',
 'importance': 0.957689135880987,
 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons//poi_boundary_administrative.p.20.png'}

In [25]:
# Display the reloaded frame (pandas shows a truncated view).
df

Unnamed: 0,index,ts,number,pick_lat,pick_lng,drop_lat,drop_lng,hour,mins,day,month,year,dayofweek,booking_timestamp,shift_booking_ts,booking_time_diff_hr,booking_time_diff_min,geodesic_distance
0,2374378,2020-10-10 07:34:16,-1,12.975773,77.571070,12.878468,77.445330,7,34,10,10,2020,5,1602315256,0,445087,26705254,17.38
1,2405894,2020-10-11 08:23:42,-1,12.930813,77.609530,12.960320,77.587210,8,23,11,10,2020,6,1602404622,1602315256,24,1489,4.06
2,2406076,2020-10-11 11:57:17,-1,12.960213,77.587460,12.930824,77.609610,11,57,11,10,2020,6,1602417437,1602404636,3,213,4.04
3,2500477,2020-10-16 17:51:07,-1,12.924353,77.549410,12.932216,77.581825,17,51,16,10,2020,4,1602870667,1602417451,125,7553,3.62
4,2694503,2020-10-30 09:00:44,-1,12.945731,77.622500,12.973030,77.616840,9,0,30,10,2020,4,1604048444,1602870685,327,19629,3.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3709472,5768115,2021-02-12 19:37:11,99999,13.029848,77.593400,13.063751,77.589850,19,37,12,2,2021,4,1613158631,1613068718,24,1498,3.77
3709473,6102760,2021-02-19 20:43:25,99999,13.029296,77.592580,12.927923,77.627106,20,43,19,2,2021,4,1613767405,1613158631,169,10146,11.82
3709474,6137206,2021-02-20 17:34:45,99999,12.907576,77.600685,12.925874,77.607620,17,34,20,2,2021,5,1613842485,1613767405,20,1251,2.16
3709475,6555089,2021-02-27 08:26:23,99999,12.956665,77.521870,12.948099,77.562990,8,26,27,2,2021,5,1614414383,1613842485,158,9531,4.56


In [26]:
## How many rides outside india?
# A request counts as outside when its pickup or drop coordinate lies on or beyond the
# bounding box lat 6.2325274..35.6745457, lng 68.1113787..97.395561.
df.loc[
    df['pick_lat'].le(6.2325274) | df['pick_lat'].ge(35.6745457)
    | df['pick_lng'].le(68.1113787) | df['pick_lng'].ge(97.395561)
    | df['drop_lat'].le(6.2325274) | df['drop_lat'].ge(35.6745457)
    | df['drop_lng'].le(68.1113787) | df['drop_lng'].ge(97.395561)
]

Unnamed: 0,index,ts,number,pick_lat,pick_lng,drop_lat,drop_lng,hour,mins,day,month,year,dayofweek,booking_timestamp,shift_booking_ts,booking_time_diff_hr,booking_time_diff_min,geodesic_distance
4266,742031,2020-06-22 07:19:06,80,12.926255,77.616400,0.000000e+00,0.000000e+00,7,19,22,6,2020,0,1592810346,1590934080,521,31271,8674.57
13118,2502259,2020-10-16 21:52:28,297,12.958342,77.517876,3.890229e+00,-3.344596e+01,21,52,16,10,2020,4,1602885148,1602884658,0,8,12181.79
14555,4344697,2021-01-07 13:17:11,345,12.824208,77.684840,5.347187e+01,-1.021332e+02,13,17,7,1,2021,3,1610025431,1609843045,50,3039,12658.48
20669,8297478,2021-03-26 20:16:01,485,12.932975,77.536230,2.819774e+01,1.129942e+02,20,16,26,3,2021,4,1616789761,1616697903,25,1530,4043.95
26884,1786595,2020-09-09 09:36:43,641,12.896367,77.623800,0.000000e+00,0.000000e+00,9,36,9,9,2020,2,1599644203,0,444345,26660736,8675.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3664741,1985134,2020-09-21 14:40:51,98929,12.852040,77.675490,1.400000e-45,1.400000e-45,14,40,21,9,2020,0,1600699251,1600355752,95,5724,8680.58
3670580,834555,2020-07-02 08:40:16,99078,12.924925,77.606125,3.772864e+01,-8.920558e+01,8,40,2,7,2020,3,1593679216,1591804249,520,31249,14231.82
3671696,2331458,2020-10-06 18:12:35,99104,12.928082,77.603350,1.376042e+01,1.004980e+02,18,12,6,10,2020,1,1602007955,1601834214,48,2895,2481.04
3681558,2747830,2020-11-02 18:57:02,99379,12.984081,77.593330,0.000000e+00,0.000000e+00,18,57,2,11,2020,0,1604343422,1604309746,9,561,8672.37


### OLA Bikes is only operational in India
### Removing all rides for which pickup or drop is outside INDIA.
#### Number of such cases: 642

In [27]:
# Renumber rows 0..n-1 so the index labels used for the removal below are unique.
df = df.reset_index(drop=True)
# Requests with a pickup or drop coordinate on or beyond the India bounding box.
outside_India = df.loc[
    df['pick_lat'].le(6.2325274) | df['pick_lat'].ge(35.6745457)
    | df['pick_lng'].le(68.1113787) | df['pick_lng'].ge(97.395561)
    | df['drop_lat'].le(6.2325274) | df['drop_lat'].ge(35.6745457)
    | df['drop_lng'].le(68.1113787) | df['drop_lng'].ge(97.395561)
]
# Remove those requests and renumber again.
df = df.loc[~df.index.isin(outside_India.index)].reset_index(drop=True)

In [28]:
# Display the frame after removing requests outside the India bounding box.
df

Unnamed: 0,index,ts,number,pick_lat,pick_lng,drop_lat,drop_lng,hour,mins,day,month,year,dayofweek,booking_timestamp,shift_booking_ts,booking_time_diff_hr,booking_time_diff_min,geodesic_distance
0,2374378,2020-10-10 07:34:16,-1,12.975773,77.571070,12.878468,77.445330,7,34,10,10,2020,5,1602315256,0,445087,26705254,17.38
1,2405894,2020-10-11 08:23:42,-1,12.930813,77.609530,12.960320,77.587210,8,23,11,10,2020,6,1602404622,1602315256,24,1489,4.06
2,2406076,2020-10-11 11:57:17,-1,12.960213,77.587460,12.930824,77.609610,11,57,11,10,2020,6,1602417437,1602404636,3,213,4.04
3,2500477,2020-10-16 17:51:07,-1,12.924353,77.549410,12.932216,77.581825,17,51,16,10,2020,4,1602870667,1602417451,125,7553,3.62
4,2694503,2020-10-30 09:00:44,-1,12.945731,77.622500,12.973030,77.616840,9,0,30,10,2020,4,1604048444,1602870685,327,19629,3.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3708830,5768115,2021-02-12 19:37:11,99999,13.029848,77.593400,13.063751,77.589850,19,37,12,2,2021,4,1613158631,1613068718,24,1498,3.77
3708831,6102760,2021-02-19 20:43:25,99999,13.029296,77.592580,12.927923,77.627106,20,43,19,2,2021,4,1613767405,1613158631,169,10146,11.82
3708832,6137206,2021-02-20 17:34:45,99999,12.907576,77.600685,12.925874,77.607620,17,34,20,2,2021,5,1613842485,1613767405,20,1251,2.16
3708833,6555089,2021-02-27 08:26:23,99999,12.956665,77.521870,12.948099,77.562990,8,26,27,2,2021,5,1614414383,1613842485,158,9531,4.56


In [29]:
# Rows remaining after the India bounding-box filter.
print("Number of Good Ride Requests: {}".format(len(df.index)))

Number of Good Ride Requests: 3708835


In [30]:
## How many pickups and drops are outside bangalore?
# Bangalore bounding box: lat 12.8340125..13.1436649, lng 77.4601025..77.7840515.
# Points lying exactly on the boundary are counted as outside.
pck_outside_bng = df.loc[
    df['pick_lat'].le(12.8340125) | df['pick_lat'].ge(13.1436649)
    | df['pick_lng'].le(77.4601025) | df['pick_lng'].ge(77.7840515)
]
drp_outside_bng = df.loc[
    df['drop_lat'].le(12.8340125) | df['drop_lat'].ge(13.1436649)
    | df['drop_lng'].le(77.4601025) | df['drop_lng'].ge(77.7840515)
]
print("Number of Pickup Requests Outside Bangalore: ", len(pck_outside_bng.index))
print("Number of Customers pickup outside Bangalore: ", np.unique(pck_outside_bng['number'].values).size)

print("Number of Drops Requests Outside Bangalore: ", len(drp_outside_bng.index))
print("Number of Customers Drop outside Bangalore: ", np.unique(drp_outside_bng['number'].values).size)

Number of Pickup Requests Outside Bangalore:  155908
Number of Customers pickup outside Bangalore:  20473
Number of Drops Requests Outside Bangalore:  167338
Number of Customers Drop outside Bangalore:  26878


In [31]:
### Pickups and drops outside the Karnataka bounding box
# ['11.5945587', '18.4767308', '74.0543908', '78.588083'] = [lat min, lat max, lng min, lng max]
# Points lying exactly on the boundary are counted as outside.
pck_outside_KA = df.loc[
    df['pick_lat'].le(11.5945587) | df['pick_lat'].ge(18.4767308)
    | df['pick_lng'].le(74.0543908) | df['pick_lng'].ge(78.588083)
]
drp_outside_KA = df.loc[
    df['drop_lat'].le(11.5945587) | df['drop_lat'].ge(18.4767308)
    | df['drop_lng'].le(74.0543908) | df['drop_lng'].ge(78.588083)
]
print("Pickups Outisde KA: {} \nDrop outside KA: {}".format(len(pck_outside_KA.index), len(drp_outside_KA.index)))
print("Number of Customers Drop outside KA: ", np.unique(drp_outside_KA['number'].values).size)
print("Number of Customers pickup outside KA: ", np.unique(pck_outside_KA['number'].values).size)

Pickups Outisde KA: 38807 
Drop outside KA: 39585
Number of Customers Drop outside KA:  6917
Number of Customers pickup outside KA:  6302


In [32]:
# Requests whose pickup OR drop coordinate lies on or beyond the Karnataka bounding box.
total_ride_outside_KA = df.loc[
    df['pick_lat'].le(11.5945587) | df['pick_lat'].ge(18.4767308)
    | df['pick_lng'].le(74.0543908) | df['pick_lng'].ge(78.588083)
    | df['drop_lat'].le(11.5945587) | df['drop_lat'].ge(18.4767308)
    | df['drop_lng'].le(74.0543908) | df['drop_lng'].ge(78.588083)
]

In [33]:
# Number of requests with a pickup or drop outside the Karnataka bounding box.
print("Total Ride Outside Karnataka: {}".format(len(total_ride_outside_KA.index)))

Total Ride Outside Karnataka: 39632


### Total Ride Outside Karnataka: 39632
### OLA Bikes does not provide intercity rides. Considering these as system error requests

Source: https://www.olacabs.com/

In [34]:
## Rides for which geodesic distance > 500kms
## among requests whose pickup or drop is outside KA (state where we have maximum booking requests and user base)
suspected_bad_rides = total_ride_outside_KA.loc[total_ride_outside_KA['geodesic_distance'].gt(500)]
suspected_bad_rides

Unnamed: 0,index,ts,number,pick_lat,pick_lng,drop_lat,drop_lng,hour,mins,day,month,year,dayofweek,booking_timestamp,shift_booking_ts,booking_time_diff_hr,booking_time_diff_min,geodesic_distance
105,4765341,2021-01-21 23:18:35,0,12.958837,77.644485,21.149794,82.782000,23,18,21,1,2021,3,1611271115,1611219970,14,852,1058.39
7368,1756248,2020-09-08 14:49:08,154,12.922410,77.568270,25.549080,84.655800,14,49,8,9,2020,1,1599576548,1599496883,22,1327,1583.00
38148,586293,2020-06-04 18:13:54,878,12.970724,77.582420,21.504763,80.115160,18,13,4,6,2020,3,1591294434,1591266878,7,459,982.06
40544,122197,2020-04-07 11:09:24,913,13.004375,77.555210,19.075983,72.877655,11,9,7,4,2020,1,1586257764,1586102620,43,2585,837.66
40560,288777,2020-04-29 23:11:41,913,13.013597,77.545740,22.837053,69.724560,23,11,29,4,2020,2,1588201901,1588159989,11,698,1366.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3681746,770076,2020-06-25 11:32:12,99395,13.037086,77.515854,28.979435,77.689580,11,32,25,6,2020,3,1593084732,1593016475,18,1137,1765.26
3681747,1086664,2020-07-29 15:29:45,99395,12.975827,77.605644,28.979435,77.689580,15,29,29,7,2020,2,1596036585,1593084811,819,49196,1771.97
3685647,4377242,2021-01-09 14:10:14,99499,12.959066,77.656815,28.479555,77.079950,14,10,9,1,2021,5,1610201414,1610187082,3,238,1719.44
3685648,4377307,2021-01-09 14:57:02,99499,12.959036,77.656770,28.479555,77.079950,14,57,9,1,2021,5,1610204222,1610201414,0,46,1719.45


### There are 506 rides with a geodesic distance >500kms whose pickup or drop is outside KA; these are suspected rides. 

In [35]:
# Remove the suspected rides by index label, then renumber the remaining rows.
df = df.loc[
    ~df.index.isin(suspected_bad_rides.index)
].reset_index(drop=True)

In [36]:
# Rows remaining after removing the suspected long-distance rides.
print("Number of Good Ride Requests: {}".format(len(df.index)))

Number of Good Ride Requests: 3708329


In [37]:
# Columns kept in the cleaned dataset; 'index' and 'shift_booking_ts' are left out.
dataset = df.loc[:, [
    'ts', 'number',
    'pick_lat', 'pick_lng', 'drop_lat', 'drop_lng',
    'geodesic_distance',
    'hour', 'mins', 'day', 'month', 'year', 'dayofweek',
    'booking_timestamp', 'booking_time_diff_hr', 'booking_time_diff_min',
]]


In [38]:
# Write the cleaned dataset. The file is gzip-compressed even though its name ends in
# .csv, so it has to be read back with compression='gzip'.
dataset.to_csv(
    path_or_buf='./../Data/clean_data.csv',
    index=False,
    compression='gzip',
)

## Some Stats: Of 1year of Ride Requests Data at OLA Bikes
### Ride request of same user with same timestamp: 113540
### None user_id: 116
### Number of requests to same pickup lat-long by a user within 1hour: 39,79,554
### Number of rides by a user within 8mins of booking to different pickup lat-long: 611891
### Number of Rides Requests less than 50meters of pickup and drop: 14460
### Number of Rides pickup or drop lat-long outside India: 642

### Our majority ride state is Karnataka
#### Total Ride Outside Karnataka (pickup or drop): 39632

### Rides which are outside KA and pickup to drop distance is >500kms: 506

## Number of Good Ride Requests: 3708329