In [96]:
import pandas as pd
import h3
from keplergl import KeplerGl
from datetime import datetime, timedelta
from meteostat import Point, Daily, Hourly

In [32]:
# Trip data comes from https://www.kaggle.com/c/nyc-taxi-trip-duration/data.
# This cell keeps only trips that start AND end inside a bounding box; the rest of
# the cell converts longitude/latitude into H3 indices and saves the result to CSV
# to speed up later computation.

# Bounding box limits. The order inside each pair does not matter: the helper
# below takes min()/max() of the pair.
long_range = [-73.903035, -74.025710]
lat_range = [40.700372, 40.881485]


def _within_bbox(trips, long_range, lat_range):
    """Return the trips whose pickup and dropoff both lie inside the bounding box.

    Parameters
    ----------
    trips : pandas.DataFrame
        Must contain ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude`` columns.
    long_range, lat_range : sequence of two floats
        The two longitude / latitude limits (inclusive), in either order.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, so that columns can be added
        to it later without pandas raising ``SettingWithCopyWarning``.
    """
    lon_lo, lon_hi = min(long_range), max(long_range)
    lat_lo, lat_hi = min(lat_range), max(lat_range)

    # Series.between is inclusive on both ends by default, which matches the
    # original <= / >= comparisons.
    inside = (
        trips.pickup_longitude.between(lon_lo, lon_hi)
        & trips.pickup_latitude.between(lat_lo, lat_hi)
        & trips.dropoff_longitude.between(lon_lo, lon_hi)
        & trips.dropoff_latitude.between(lat_lo, lat_hi)
    )
    return trips[inside].copy()


df_train = _within_bbox(pd.read_csv('../00_data/train.csv'), long_range, lat_range)
df_test = _within_bbox(pd.read_csv('../00_data/test.csv'), long_range, lat_range)


# Annotate every trip with the H3 cell of its pickup and dropoff point and persist
# the result, once per requested resolution.
# NOTE: the output file name does not include the resolution, so with more than one
# resolution each pass rewrites the same file (with the columns accumulated so far).
for h3_resolution in [7]:

    for mode, df in (('train', df_train), ('test', df_test)):

        column_pickup_id = 'pickup_h3_%s' % h3_resolution
        column_dropoff_id = 'dropoff_h3_%s' % h3_resolution

        # Walk the coordinate columns directly instead of DataFrame.iterrows():
        # iterrows builds a Series per row and was previously run twice per frame.
        df[column_pickup_id] = [
            h3.geo_to_h3(lat=lat, lng=lng, resolution=h3_resolution)
            for lat, lng in zip(df['pickup_latitude'], df['pickup_longitude'])
        ]
        df[column_dropoff_id] = [
            h3.geo_to_h3(lat=lat, lng=lng, resolution=h3_resolution)
            for lat, lng in zip(df['dropoff_latitude'], df['dropoff_longitude'])
        ]

        # to_csv also writes the frame's index as the first (unnamed) column.
        df.to_csv('../00_data/%s_h3.csv' % mode)

In [115]:
# Reload the H3-annotated trips from the CSV files.
df_train = pd.read_csv('../00_data/train_h3.csv')
df_test = pd.read_csv('../00_data/test_h3.csv')

# Every resolution-7 cell that occurs as a pickup or a dropoff in the training trips.
zone_ids = set(df_train['pickup_h3_7'].tolist() + df_train['dropoff_h3_7'].tolist())

# Parse the pickup timestamps of both frames with one explicit format.
# dropoff_datetime is deliberately left unparsed: the original note says the
# column is missing in the data.
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
for trips in (df_train, df_test):
    trips['pickup_datetime'] = pd.to_datetime(trips['pickup_datetime'], format=DATETIME_FORMAT)

# Observation period of the training trips, truncated to the full hour.
to_full_hour = dict(minute=0, second=0, microsecond=0)
first_pickup = df_train['pickup_datetime'].min()
last_pickup = df_train['pickup_datetime'].max()
start = first_pickup.replace(**to_full_hour)
end = last_pickup.replace(**to_full_hour)

In [131]:
# Build one row per 4-hour window containing
#   * the window bounds ('start' inclusive, 'end' exclusive),
#   * the number of training pickups per H3 zone inside the window,
#   * the mean of every hourly weather attribute inside the window.

# Hourly weather for a single reference point.
# NOTE(review): the third Point argument is presumably the altitude - confirm
# against the meteostat documentation. Also confirm that the weather timestamps
# use the same timezone as pickup_datetime.
Manhattan = Point(40.754932, -73.984016, 2)
weather_data = Hourly(Manhattan, start, end).fetch()

weather_attributes = ['temp', 'dwpt', 'rhum', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'coco']
# Freeze one ordering of the zone ids so column headers and counts always line up.
zone_columns = list(zone_ids)
columns = ['start', 'end'] + zone_columns + weather_attributes

# Every window has the same width. The previous version set the last window's end
# to "next midnight" through chained assignment, which only equals start + 4h when
# the last window starts at 20:00 and does not write through under copy-on-write.
interval_width = pd.Timedelta(hours=4)
interval_df = pd.DataFrame({'start': pd.date_range(start, end, freq=interval_width)})
interval_df['end'] = interval_df['start'] + interval_width

# Collect plain rows and build the frame once; DataFrame.append inside a loop is
# quadratic and no longer exists in pandas >= 2.0.
records = []
for window_start, window_end in zip(interval_df['start'], interval_df['end']):
    in_window = (df_train.pickup_datetime >= window_start) & (df_train.pickup_datetime < window_end)

    # One value_counts pass instead of one filter per zone; zones without a
    # pickup in this window get 0.
    pickups_per_zone = df_train.loc[in_window, 'pickup_h3_7'].value_counts()
    counts = pickups_per_zone.reindex(zone_columns, fill_value=0).tolist()

    # Half-open selection [start, end). The previous label slice followed by [:-1]
    # dropped a valid observation whenever the row at `end` was absent, e.g. in the
    # final window, because the weather data was only fetched up to `end`.
    weather_in_window = weather_data[
        (weather_data.index >= window_start) & (weather_data.index < window_end)
    ]
    # Align by attribute name rather than by column position.
    weather_entries = (
        weather_in_window.mean(axis=0, skipna=True).reindex(weather_attributes).tolist()
    )

    records.append([window_start, window_end] + counts + weather_entries)

df_zone_filter_train = pd.DataFrame(records, columns=columns)

In [132]:
# Display the per-interval table (window bounds, pickup count per H3 zone, weather
# attributes); pandas abbreviates the rendering of large frames with "...".
df_zone_filter_train

Unnamed: 0,start,end,872a100abffffff,872a100b3ffffff,872a1009dffffff,872a100aeffffff,872a10088ffffff,872a10725ffffff,872a100d2ffffff,872a100deffffff,...,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
0,2016-01-01 00:00:00,2016-01-01 04:00:00,6,0,0,8,82,92,397,30,...,-1.025,55.600000,0.0,,282.500000,18.525000,,1019.075000,,
1,2016-01-01 04:00:00,2016-01-01 08:00:00,4,0,0,2,17,47,114,5,...,-2.450,54.850000,0.0,,297.500000,16.900000,,1018.475000,,
2,2016-01-01 08:00:00,2016-01-01 12:00:00,2,0,0,0,22,42,127,1,...,-2.525,58.200000,0.0,,257.500000,17.350000,,1017.850000,,
3,2016-01-01 12:00:00,2016-01-01 16:00:00,1,0,0,0,47,63,254,0,...,-3.350,57.425000,0.0,,262.500000,15.400000,,1018.175000,,
4,2016-01-01 16:00:00,2016-01-01 20:00:00,2,0,0,0,42,65,280,3,...,-3.850,52.625000,0.0,,267.500000,23.950000,,1016.375000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1087,2016-06-30 04:00:00,2016-06-30 08:00:00,1,0,0,0,25,49,157,0,...,13.425,57.250000,0.0,,310.000000,9.975000,,1013.950000,,
1088,2016-06-30 08:00:00,2016-06-30 12:00:00,2,0,0,1,58,62,330,1,...,14.025,66.175000,0.0,,300.000000,7.725000,,1015.275000,,
1089,2016-06-30 12:00:00,2016-06-30 16:00:00,0,0,0,0,56,53,280,1,...,12.850,47.325000,0.0,,102.500000,9.475000,,1016.350000,,
1090,2016-06-30 16:00:00,2016-06-30 20:00:00,1,0,0,0,48,74,300,2,...,11.625,35.375000,0.0,,270.000000,13.750000,,1016.075000,,
