In [1]:
import pandas as pd
import requests
import time
from tqdm.auto import tqdm

import numpy as np
from sklearn.neighbors import BallTree



In [2]:
main_df = pd.read_csv('datasets/ncr_ride_bookings.csv')

Get list of unique locations

In [3]:
location_list = main_df['Pickup Location'].unique().tolist() +  main_df['Drop Location'].unique().tolist()

location_list = list(set(location_list))

location_df = pd.DataFrame(location_list)


Go to https://openrouteservice.org/ to create an account and generate an API key to get the location information.

In [4]:
api_key = 'generateyourAPIkey'

In [5]:
#input params
country_code ='IN'
return_size = 5

Retrieve pickup/dropoff location meta info through API

In [6]:
try:
    loc_meta_df = pd.read_csv('datasets/potential_loc_info.csv')
except:
    json_data = []
    success_loc = []
    fail_loc = []
    for loc in tqdm(location_list):
        r = requests.get(f'https://api.openrouteservice.org/geocode/search?api_key={api_key}&text={loc}&boundary.country={country_code}&size={return_size}')
        try:
            n = len(r.json()['features']) 
            if n >= 1:  # return at least 1 result
                json_data.extend(r.json()['features'])
                success_loc += [loc]*n
            else:
                fail_loc.append(loc)
        except:
            fail_loc.append(loc)
        time.sleep(0.5)  # too much API call will break the connection

    loc_meta_df = pd.json_normalize(json_data)[['properties.source_id','geometry.coordinates','properties.label','properties.region','properties.county','properties.locality']]
    loc_meta_df[['longitude','latitude']] = loc_meta_df['geometry.coordinates'].apply(pd.Series)
    loc_meta_df['uber_loc'] = success_loc

    loc_meta_df.rename(columns = {
        'properties.source_id':'source_id',
        'properties.label':'address',
        'properties.region':'region',
        'properties.county':'county',
        'properties.locality':'locality'
    },inplace=True)
    loc_meta_df = loc_meta_df[['source_id','longitude','latitude','uber_loc','address','region','county','locality']]
    loc_meta_df.to_csv('datasets/potential_loc_info.csv',index=False)

loc_meta_df.head(2)
    

Unnamed: 0,source_id,longitude,latitude,uber_loc,address,region,county,locality
0,way/701576004,77.321312,28.675872,Dilshad Garden,"Dilshad Garden, Delhi, India",Delhi,South West Delhi,Delhi
1,way/764108576,77.313283,28.680287,Dilshad Garden,"DGD Dilshad Garden, Delhi, India",Delhi,South West Delhi,Delhi


In [7]:
def estimate_loc(record,loc_data=None):
    pickup = record['Pickup Location']
    dropoff = record['Drop Location']
    ride_distance = record['Ride Distance']
    avail_locs = loc_data['uber_loc'].unique()
    if (pickup not in avail_locs) or (dropoff not in avail_locs):
        return np.full(10,np.nan)
    
    start_df = loc_data.loc[loc_data['uber_loc']==pickup]
    end_df = loc_data.loc[loc_data['uber_loc']==dropoff]
    start_coords = np.radians(start_df[['longitude','latitude']])
    end_coords = np.radians(end_df[['longitude','latitude']])

    # match pair and find distance between 2 locations
    # read this paper if necessary https://towardsdatascience.com/using-scikit-learns-binary-trees-to-efficiently-find-latitude-and-longitude-neighbors-909979bd929b/
    tree = BallTree(end_coords, metric='haversine')
    distances, indices = tree.query(start_coords, k=1)
    if np.isnan(ride_distance):
        start_idx = np.argmin(distances)
    else:
        start_idx = np.argmin(np.abs(distances - ride_distance))
    end_idx = indices[start_idx][0]

    start_info = start_df[['longitude','latitude',
                          'address','region','locality']].iloc[start_idx].to_numpy()
    end_info = end_df[['longitude','latitude',
                          'address','region','locality']].iloc[end_idx].to_numpy()
    
    return np.concatenate((start_info,end_info))


Combine location data to main dataset

In [8]:
try:
    main_loc_df = pd.read_csv('datasets/ncr_ride_bookings(with_loc).csv')
except:
    tqdm.pandas()
    main_df[['pick_longitude','pick_latitude','pick_address','pick_region','pick_locality',
            'drop_longitude','drop_latitude','drop_address','drop_region','drop_locality']] = main_df.progress_apply(estimate_loc,axis=1, result_type='expand',loc_data = loc_meta_df)
    main_df.to_csv('datasets/ncr_ride_bookings(with_loc).csv',index=False)
    main_loc_df = main_df.copy()
    del main_df
    
main_loc_df.head(2)

Unnamed: 0,Date,Time,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Avg VTAT,Avg CTAT,...,pick_longitude,pick_latitude,pick_address,pick_region,pick_locality,drop_longitude,drop_latitude,drop_address,drop_region,drop_locality
0,2024-03-23,12:29:38,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,,,...,77.164401,28.567593,"Palam Marg, Delhi, India",Delhi,Delhi,77.311751,28.670789,"DGD Jhilmil, Delhi, India",Delhi,Delhi
1,2024-11-29,18:01:39,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,14.0,...,73.89728,18.552377,"Shastri Nagar, Pune, MH, India",Maharashtra,Pune,77.011193,28.489101,"Gurgaon, Gurugram, HR, India",Haryana,Gurugram


Retrieve weather data

The weather data is gather from https://open-meteo.com/en/docs/historical-weather-api for all pickup and dropoff latitude/longitude.

There is no API key required but there is a limit per day/month

In [9]:
try:
    weather_df = pd.read_csv('datasets/weather_data.csv')
except:
    pickup_df = main_loc_df[['pick_latitude','pick_longitude','pick_address']]
    pickup_df.columns = ['latitude','longitude','address']
    dropof_df = main_loc_df[['drop_latitude','drop_longitude' ,'drop_address']]
    dropof_df.columns = ['latitude','longitude','address']
    loc_map_df = pd.concat([pickup_df,dropof_df],axis=0).drop_duplicates().reset_index(drop=True)

    weather_df = pd.DataFrame()
    for lat,long,address in tqdm(loc_map_df.to_numpy()):
        r = requests.get(f'https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={long}&start_date=2024-01-01&end_date=2024-12-31&hourly=temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,snowfall,wind_speed_10m')
        try:
            combine_keys = list(r.json()['hourly'].keys())

            data_df = pd.DataFrame()
            for key in combine_keys:
                add_df = pd.json_normalize(r.json(),record_path=['hourly',key])
                data_df = pd.concat([data_df,add_df],axis=1)
            data_df.columns = combine_keys
            data_df.insert(1,'address',address)

            weather_df = pd.concat([weather_df,data_df],axis=0)
            weather_df['time'] = pd.to_datetime(weather_df['time'])
        except:
            print(f'Fail to retrieve data for: {address}')
            continue

        time.sleep(2)
        
    weather_df.to_csv('datasets/weather_data.csv',index=False)

    