In [1]:
import os

import pandas as pd
from tqdm.auto import tqdm

from project_package import data_collection as dc
from project_package import utility

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Load the raw ride-bookings CSV; later cells read the
# 'Pickup Location' and 'Drop Location' columns from this frame.
main_df = pd.read_csv(filepath_or_buffer='datasets/raw/ncr_ride_bookings.csv')

Get the list of unique pickup and drop locations

In [3]:
# Unique location names across both the pickup and the drop columns.
# Series.unique() keeps first-seen order, so the list (and therefore the
# order of downstream API requests and cached rows) is the same on every
# run; list(set(...)) ordering varies between kernel sessions for strings.
location_list = (
    pd.concat([main_df['Pickup Location'], main_df['Drop Location']], ignore_index=True)
    .unique()
    .tolist()
)

Create an account at https://openrouteservice.org/ and generate an API key; the key is needed to retrieve the location information.

In [None]:
# openrouteservice API key. Read from the ORS_API_KEY environment variable
# so the real key is never committed with the notebook; the placeholder is
# kept as the fallback when the variable is not set.
api_key = os.environ.get('ORS_API_KEY', 'generateyourAPIkey')

# Input params passed to dc.generate_address
country_code = 'IN'
return_size = 5

Retrieve pickup/drop-off location metadata through the API

In [5]:
# Use the cached location metadata when it can be read; otherwise build it
# with dc.generate_address and cache the result for later runs.
try:
    loc_meta_df = utility.read_data('potential_loc_info')
except Exception:
    # `except Exception` instead of a bare `except:` so a kernel interrupt
    # (KeyboardInterrupt) or SystemExit is not treated as a cache miss and
    # does not kick off the API calls.
    # NOTE(review): narrow this to the error utility.read_data raises on a
    # missing file (presumably FileNotFoundError) once confirmed.
    loc_meta_df = dc.generate_address(location_list, api_key, country_code, return_size)
    utility.save_dataframe(loc_meta_df, file_name='potential_loc_info')
loc_meta_df.head(2)
    

Unnamed: 0,source_id,longitude,latitude,uber_loc,address,region,county,locality
0,way/701576004,77.321312,28.675872,Dilshad Garden,"Dilshad Garden, Delhi, India",Delhi,South West Delhi,Delhi
1,way/764108576,77.313283,28.680287,Dilshad Garden,"DGD Dilshad Garden, Delhi, India",Delhi,South West Delhi,Delhi


Combine the location data with the main dataset

In [6]:
# Bookings enriched with pickup/drop coordinates and address fields.
# Read from the cache when possible; otherwise compute row by row with
# dc.find_best_address and cache the result.
try:
    main_loc_df = utility.read_data('ncr_ride_bookings(with_loc)')
except Exception:
    # `except Exception` instead of a bare `except:` so interrupting the
    # cache read does not start the (slow) row-wise computation.
    # NOTE(review): narrow to the error utility.read_data raises on a
    # missing file (presumably FileNotFoundError) once confirmed.
    tqdm.pandas()  # registers DataFrame.progress_apply
    loc_columns = [
        'pick_longitude', 'pick_latitude', 'pick_address', 'pick_region', 'pick_locality',
        'drop_longitude', 'drop_latitude', 'drop_address', 'drop_region', 'drop_locality',
    ]
    # Write the new columns to a copy and keep `main_df` as the untouched raw
    # frame. The original mutated `main_df` and then deleted it, so the two
    # branches left different variables defined and the cell was not safely
    # re-runnable.
    main_loc_df = main_df.copy()
    main_loc_df[loc_columns] = main_df.progress_apply(
        dc.find_best_address, axis=1, result_type='expand', loc_data=loc_meta_df
    )
    utility.save_dataframe(main_loc_df, file_name='ncr_ride_bookings(with_loc)')

main_loc_df.head(2)

Unnamed: 0,Date,Time,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Avg VTAT,Avg CTAT,...,pick_longitude,pick_latitude,pick_address,pick_region,pick_locality,drop_longitude,drop_latitude,drop_address,drop_region,drop_locality
0,2024-03-23,12:29:38,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,,,...,77.164401,28.567593,"Palam Marg, Delhi, India",Delhi,Delhi,77.311751,28.670789,"DGD Jhilmil, Delhi, India",Delhi,Delhi
1,2024-11-29,18:01:39,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,14.0,...,73.89728,18.552377,"Shastri Nagar, Pune, MH, India",Maharashtra,Pune,77.011193,28.489101,"Gurgaon, Gurugram, HR, India",Haryana,Gurugram


Get feature data from Data Commons

In [None]:
# Data Commons API key. Read from the DC_API_KEY environment variable so the
# real key is never committed with the notebook; the placeholder is kept as
# the fallback when the variable is not set.
dc_api_key = os.environ.get('DC_API_KEY', 'generateyourAPIkey')

In [8]:
# Get city DCID
# Use the cached city metadata when it can be read; otherwise collect it
# with dc.collect_DataCommons_city_id and cache the result.
try:
    city_df = utility.read_data('dc_city_metadata')
except Exception:
    # `except Exception` instead of a bare `except:` so a kernel interrupt
    # is not treated as a cache miss.
    # NOTE(review): narrow to the error utility.read_data raises on a
    # missing file (presumably FileNotFoundError) once confirmed.
    city_df = dc.collect_DataCommons_city_id(dc_api_key)
    utility.save_dataframe(city_df, file_name='dc_city_metadata')
city_df.head(2)

Unnamed: 0,state_dcid,state_name,city_dcid,city_name
0,wikidataId/Q1061,Gujarat,wikidataId/Q1023682,Wadhwan
1,wikidataId/Q1061,Gujarat,wikidataId/Q1070,Ahmedabad


In [9]:
# Keys are the variable names requested from dc.collect_DataCommons_features_data;
# values are presumably the column names used in the resulting frame
# (the cached output has a `population_density` column).
feature_name_map = {
    # Demographic
    "Count_Person_PerArea": "population_density"
}

# Get demographic data using API (or the cached copy when it can be read)
try:
    feature_df = utility.read_data('demographic_feature')
except Exception:
    # `except Exception` instead of a bare `except:` so a kernel interrupt
    # is not treated as a cache miss.
    # NOTE(review): narrow to the error utility.read_data raises on a
    # missing file (presumably FileNotFoundError) once confirmed.
    state_list = city_df.state_dcid.unique().tolist()

    demographic_data_to_get = list(feature_name_map.keys())
    feature_df = dc.collect_DataCommons_features_data(
        dc_api_key, state_list, demographic_data_to_get, feature_name_map
    )

    # Attach the readable state name; one row per state_dcid on the right
    # side so the merge does not multiply feature rows.
    state_names = city_df[['state_dcid', 'state_name']].drop_duplicates()
    feature_df = pd.merge(feature_df, state_names, on='state_dcid')
    utility.save_dataframe(feature_df, file_name='demographic_feature')

feature_df

Unnamed: 0,state_dcid,year,population_density,state_name
0,wikidataId/Q1061,1991,210.74,Gujarat
1,wikidataId/Q1061,2001,258.49,Gujarat
2,wikidataId/Q1061,2002,263.81,Gujarat
3,wikidataId/Q1061,2003,269.13,Gujarat
4,wikidataId/Q1061,2004,274.45,Gujarat
...,...,...,...,...
667,wikidataId/Q66743,2018,3753.65,Puducherry
668,wikidataId/Q66743,2019,3899.79,Puducherry
669,wikidataId/Q66743,2020,4058.46,Puducherry
670,wikidataId/Q66743,2021,4233.82,Puducherry


Retrieve weather data

The weather data is gathered from https://open-meteo.com/en/docs/historical-weather-api for all pickup and drop-off latitude/longitude pairs.

No API key is required, but there is a request limit per day and per month.

In [10]:
# Weather data for every distinct pickup/drop location.
# Read from the cache when possible; otherwise fetch it with
# dc.generate_weather_data and cache the result.
try:
    weather_df = utility.read_data('weather_data', directory='datasets/raw/weather')
except Exception:
    # `except Exception` instead of a bare `except:` so a kernel interrupt
    # is not treated as a cache miss and does not spend the API quota.
    # NOTE(review): narrow to the error utility.read_data raises on a
    # missing file (presumably FileNotFoundError) once confirmed.

    # Stack pickup and drop coordinates under common column names, then keep
    # each (latitude, longitude, address) combination once.
    pickup_df = main_loc_df[['pick_latitude', 'pick_longitude', 'pick_address']]
    pickup_df.columns = ['latitude', 'longitude', 'address']
    dropof_df = main_loc_df[['drop_latitude', 'drop_longitude', 'drop_address']]
    dropof_df.columns = ['latitude', 'longitude', 'address']
    loc_map_df = pd.concat([pickup_df, dropof_df], axis=0).drop_duplicates().reset_index(drop=True)

    weather_df = dc.generate_weather_data(loc_map_df)
    # Fix: cache the weather frame itself. The original saved `main_df` here,
    # which wrote the bookings table under the weather cache name (and raised
    # NameError whenever `main_df` had been deleted earlier in the session).
    utility.save_dataframe(weather_df, file_name='weather_data', directory='datasets/raw/weather')

weather_df.head(4)
    

Unnamed: 0,time,address,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,snowfall,wind_speed_10m
0,2024-01-01 00:00:00,"kuruva palam, KL, India",24.2,82,20.9,27.8,0.0,0.0,0.0,5.2
1,2024-01-01 01:00:00,"kuruva palam, KL, India",24.8,75,20.1,27.7,0.0,0.0,0.0,7.1
2,2024-01-01 02:00:00,"kuruva palam, KL, India",25.2,69,19.0,27.1,0.0,0.0,0.0,10.3
3,2024-01-01 03:00:00,"kuruva palam, KL, India",26.9,60,18.4,28.1,0.0,0.0,0.0,13.1
