In [None]:
import h3
import pandas
from datetime import datetime, timedelta

# Columns taken from the driver cancel table (the original NRT features).
original_features = [
    'supply_surge',
    'surge_diff',
    'demand_surge',
    'eta',
    'rounded_eta',
    'eyeball_eta',
    'trip_distance_haversine',
    'fd_eta',
    'rating',
    'forward_dispatched',
]

# Columns taken from the fact_trip table (trip features).
trip_features = [
    'driver_surge_multiplier',
    'est_rider_fare_distance_miles',
    'fare_distance_miles',
    'fare_duration_minutes',
    'trip_distance_miles',
    'trip_duration_seconds',
]

def ConvertHexter(data: pandas.DataFrame, resolution: int):
    """Add request and supply hexter columns to ``data``.

    For each of the ``request_location_`` and ``supply_location_`` prefixes,
    the ``<prefix>latitude`` / ``<prefix>longitude`` pair of every row is
    passed to ``h3.geo_to_h3`` and the result is stored in a column named
    ``<prefix>hexter_<resolution>``.

    Args:
        data: Frame holding the four latitude/longitude columns. It is
            modified in place.
        resolution: Resolution forwarded to ``h3.geo_to_h3``; also used as
            the suffix of the new column names.

    Returns:
        The same ``data`` object with the two hexter columns added (or
        overwritten if they already existed).
    """
    KEYWORDS = ['request_location_', 'supply_location_']
    hexter_tag = 'hexter_' + str(resolution)

    for key in KEYWORDS:
        # NOTE(review): h3-py 4.x renamed geo_to_h3 to latlng_to_cell --
        # confirm which h3 version this notebook is pinned to.
        hexters = [
            h3.geo_to_h3(latitude, longitude, resolution)
            for latitude, longitude in zip(data[key + 'latitude'], data[key + 'longitude'])
        ]
        # Assign the whole column at once. A plain list is written by
        # position, so rows that share an index label cannot overwrite each
        # other the way per-row ``data.loc[label, ...]`` writes would.
        data[key + hexter_tag] = hexters

    return data

def GenerateNRTFeatures(data: pandas.DataFrame, country_id: int, city_id: str, features: list):
    """Add previous-hour average features for the supply and request hexters.

    For every row the "previous hour" bucket is ``hour_of_day - 1`` on the
    same ``datestr``; for ``hour_of_day == 0`` it is hour 23 of the previous
    calendar day. Each feature is averaged over the accepted trips
    (``canceled == False``) of that bucket:

    * ``supply_average_<feature>``: trips whose ``supply_location_hexter_9``
      equals the row's supply hexter.
    * ``request_average_<feature>``: trips whose ``request_location_hexter_9``
      equals the row's request hexter.

    Rows without any matching trip get NaN.

    Args:
        data: Trips with ``datestr`` ('%Y-%m-%d' strings), ``hour_of_day``,
            ``canceled``, both ``*_location_hexter_9`` columns and every
            column named in ``features``. Modified in place.
        country_id: Not used by this function; kept for interface
            compatibility.
        city_id: Not used by this function; kept for interface compatibility.
        features: Names of the columns to average.

    Returns:
        The same ``data`` object with the average columns added/overwritten.
    """
    print(data.shape)

    features = list(features)

    # Make sure the output columns exist even when nothing is computed below.
    for feature in features:
        for side in ('request', 'supply'):
            if side + '_average_' + feature not in data.columns:
                data[side + '_average_' + feature] = None
    if data.empty or not features:
        return data

    def _previous_day(datestr):
        day = datetime.strptime(datestr, '%Y-%m-%d') - timedelta(days=1)
        return day.strftime('%Y-%m-%d')

    # (date, hour) bucket each row looks back to, in row order.
    past_buckets = []
    for datestr, hour in zip(data['datestr'].tolist(), data['hour_of_day'].tolist()):
        if hour == 0:
            past_buckets.append((_previous_day(datestr), 23))
        else:
            past_buckets.append((datestr, hour - 1))

    # ``== False`` (rather than ``~``) is kept on purpose: it also behaves
    # correctly if ``canceled`` was loaded as 0/1 integers.
    accepted = data[(data['canceled'] == False).to_numpy()]
    missing = [float('nan')] * len(features)

    for side in ('supply', 'request'):
        hexter_column = side + '_location_hexter_9'

        # One pass over the accepted trips instead of re-filtering the whole
        # frame for every row.
        means = accepted.groupby(['datestr', 'hour_of_day', hexter_column])[features].mean()
        table = dict(zip(means.index, means.to_numpy().tolist()))

        rows = [
            table.get((past_date, past_hour, hexter), missing)
            for (past_date, past_hour), hexter in zip(past_buckets, data[hexter_column].tolist())
        ]

        # Lists are assigned by position, so duplicated index labels (e.g.
        # after ``pandas.concat``) cannot make rows overwrite each other.
        for position, feature in enumerate(features):
            data[side + '_average_' + feature] = [row[position] for row in rows]

    return data
        
def GenerateAcceptRateNRT(data: pandas.DataFrame, country_id: int, city_id: str):
    """Add previous-hour cancel-rate features for the supply and request hexters.

    The "previous hour" bucket of a row is ``hour_of_day - 1`` on the same
    ``datestr``; for ``hour_of_day == 0`` it is hour 23 of the previous
    calendar day. Unlike ``GenerateNRTFeatures`` all trips of the bucket are
    used, not only the accepted ones, and the value stored is the mean of the
    ``canceled`` column (i.e. the cancel rate; an accept rate would be one
    minus this value):

    * ``supply_average_cancel_rate``: trips whose ``supply_location_hexter_9``
      equals the row's supply hexter.
    * ``request_average_cancel_rate``: trips whose
      ``request_location_hexter_9`` equals the row's request hexter.

    Rows without any matching trip get NaN.

    Args:
        data: Trips with ``datestr`` ('%Y-%m-%d' strings), ``hour_of_day``,
            ``canceled`` and both ``*_location_hexter_9`` columns. Modified
            in place.
        country_id: Not used by this function; kept for interface
            compatibility.
        city_id: Not used by this function; kept for interface compatibility.

    Returns:
        The same ``data`` object with the two cancel-rate columns set.
    """
    print(data.shape)

    data['supply_average_cancel_rate'] = None
    data['request_average_cancel_rate'] = None
    if data.empty:
        return data

    def _previous_day(datestr):
        day = datetime.strptime(datestr, '%Y-%m-%d') - timedelta(days=1)
        return day.strftime('%Y-%m-%d')

    # (date, hour) bucket each row looks back to, in row order.
    past_buckets = []
    for datestr, hour in zip(data['datestr'].tolist(), data['hour_of_day'].tolist()):
        if hour == 0:
            past_buckets.append((_previous_day(datestr), 23))
        else:
            past_buckets.append((datestr, hour - 1))

    missing = float('nan')
    for side in ('supply', 'request'):
        hexter_column = side + '_location_hexter_9'

        # Cancel rate of every (date, hour, hexter) bucket, computed once.
        rates = data.groupby(['datestr', 'hour_of_day', hexter_column])['canceled'].mean()
        table = dict(zip(rates.index, rates.tolist()))

        # A list is assigned by position, so duplicated index labels cannot
        # make rows overwrite each other.
        data[side + '_average_cancel_rate'] = [
            table.get((past_date, past_hour, hexter), missing)
            for (past_date, past_hour), hexter in zip(past_buckets, data[hexter_column].tolist())
        ]

    return data

def GenerateNRTFeaturesByTimestamp(data: pandas.DataFrame, country_id: int, city_id: str, features: list, NRT_period: int):
    """Add trailing-window average features for the supply and request hexters.

    For every row, each feature is averaged over the accepted trips
    (``canceled == False``) whose ``timestamp`` lies in
    ``[row timestamp - NRT_period minutes, row timestamp)``:

    * ``supply_average_<feature>_<NRT_period>``: trips whose
      ``supply_location_hexter_9`` equals the row's supply hexter.
    * ``request_average_<feature>_<NRT_period>``: trips whose
      ``request_location_hexter_9`` equals the row's request hexter.

    Rows without any matching trip get NaN.

    Args:
        data: Trips with ``timestamp``, ``canceled``, both
            ``*_location_hexter_9`` columns and every column named in
            ``features``. Modified in place. ``timestamp`` is treated as
            epoch seconds, which is what the previous
            ``datetime.fromtimestamp`` based implementation assumed.
        country_id: Not used by this function; kept for interface
            compatibility.
        city_id: Not used by this function; kept for interface compatibility.
        features: Names of the columns to average.
        NRT_period: Length of the look-back window in minutes.

    Returns:
        The same ``data`` object with the average columns added/overwritten.
    """
    print(data.shape)

    features = list(features)
    suffix = '_' + str(NRT_period)

    # Create the output columns under the names that are actually written
    # below (including the period suffix).
    for feature in features:
        for side in ('request', 'supply'):
            if side + '_average_' + feature + suffix not in data.columns:
                data[side + '_average_' + feature + suffix] = None
    if data.empty or not features:
        return data

    # Plain arithmetic on the epoch value: no local-time round trip, so the
    # window length does not depend on the machine's timezone/DST rules.
    window_seconds = NRT_period * 60

    accepted = data[(data['canceled'] == False).to_numpy()]
    timestamps = data['timestamp'].tolist()
    missing = [float('nan')] * len(features)

    for side in ('supply', 'request'):
        hexter_column = side + '_location_hexter_9'

        # Split the accepted trips per hexter once, so each row only scans
        # the trips of its own hexter.
        trips_by_hexter = {hexter: group for hexter, group in accepted.groupby(hexter_column)}

        rows = []
        for current_timestamp, hexter in zip(timestamps, data[hexter_column].tolist()):
            candidates = trips_by_hexter.get(hexter)
            if candidates is None:
                rows.append(missing)
                continue
            trip_times = candidates['timestamp']
            in_window = (trip_times >= current_timestamp - window_seconds) & (trip_times < current_timestamp)
            window = candidates[in_window.to_numpy()]
            rows.append([window[feature].mean() for feature in features])

        # Lists are assigned by position, so duplicated index labels cannot
        # make rows overwrite each other.
        for position, feature in enumerate(features):
            data[side + '_average_' + feature + suffix] = [row[position] for row in rows]

    return data

def AddNewFeatures(data: pandas.DataFrame, trips: dict, features: list):
    """Copy trip features into ``data`` by matching on ``job_uuid``.

    Every feature that is not yet a column of ``data`` is created and filled
    with ``None``. Then, for each row whose ``job_uuid`` is a key of
    ``trips``, ``trips[job_uuid][feature]`` is written into the row for every
    feature. Rows whose ``job_uuid`` is not in ``trips`` keep their current
    values.

    Args:
        data: Frame with a ``job_uuid`` column. Modified in place.
        trips: Mapping ``job_uuid -> record``; each record is indexed by
            feature name.
        features: Names of the features to copy.

    Returns:
        The same ``data`` object.

    Raises:
        KeyError: If a matched trip record lacks one of ``features``.
    """
    # Initialize trip features
    for feature in features:
        if feature not in data.columns:
            data[feature] = None
    print(data.shape)

    # Row positions that have a trip record, paired with that record.
    matched = [
        (position, trips[job_uuid])
        for position, job_uuid in enumerate(data['job_uuid'].tolist())
        if job_uuid in trips
    ]
    if not matched:
        return data

    positions = [position for position, _ in matched]
    for feature in features:
        # Positional write: rows sharing an index label are not touched,
        # which a label-based ``data.loc[label, feature] = ...`` would do.
        column_position = data.columns.get_loc(feature)
        data.iloc[positions, column_position] = [record[feature] for _, record in matched]
    return data

# Printed once all the definitions above have been executed.
print('Done')        


In [None]:
# Below is an example with SF dataset, country_id = 1, city_id = '1', date: 2022-06-30 to 2022-07-15
# The preprocessing is not included in this file
# The input dataset is from the driver cancel table, or combined with the fact_trip table

# Step 1: convert to hexter

data = pandas.read_csv('report-2022-0630-0715-1-1.csv', header=0)
print(data.shape)

# Slice the dataset by day first, then convert the hexters and write one
# CSV per day.
START = datetime(2022,6,30)
END = datetime(2022,7,15)
cursor = START

while cursor <= END:
    print(cursor)
    datestr = cursor.strftime('%Y-%m-%d')
    # ConvertHexter adds columns in place; take an explicit copy so it does
    # not write into a slice of ``data`` (pandas SettingWithCopyWarning).
    filtered = data[data['datestr']==datestr].copy()
    print(filtered.shape)
    result = ConvertHexter(filtered, 9)
    result.to_csv('Sample-1-1/report-hexter-1-1-' + datestr + '.csv', header=True, index=False)
    cursor = cursor + timedelta(days=1)

print('Done')    


In [None]:
# Step 2.1: generate NRT features
# Reads the per-day 'report-trip-hexter' CSVs and writes one
# 'report-trip-hexter-NRT' CSV per day.
START = datetime(2022, 7, 1)
END = datetime(2022,7, 15)


cursor = START
while cursor <= END:
    print(cursor)
    previous = cursor - timedelta(days=1)
    current_data = pandas.read_csv('Sample-1-1/report-trip-hexter-1-1-'+cursor.strftime('%Y-%m-%d')+'.csv', header=0)
    previous_data = pandas.read_csv('Sample-1-1/report-trip-hexter-1-1-'+previous.strftime('%Y-%m-%d')+'.csv', header=0)
    previous_data = previous_data[previous_data['hour_of_day']==23] # Only need the last hour of the previous day

    # Process one hour each time: the hour itself plus the hour it looks
    # back to.
    hourly_results = []
    for i in range(24):
        print(i)
        filtered = current_data[current_data['hour_of_day']==i]
        if i == 0:
            lookback = previous_data
        else:
            lookback = current_data[current_data['hour_of_day']==(i-1)]
        # ignore_index: rows coming from two different files can share index
        # labels, and GenerateNRTFeatures must see one label per row.
        filtered = pandas.concat([lookback, filtered], ignore_index=True)
        temp_result = GenerateNRTFeatures(filtered, 1, '1', original_features + trip_features)
        hourly_results.append(temp_result[temp_result['hour_of_day']==i])
    # Concatenate once instead of growing a frame inside the loop.
    result = pandas.concat(hourly_results, ignore_index=True)
    result = result[result['datestr']==cursor.strftime('%Y-%m-%d')]
    result.to_csv('Sample-1-1/report-trip-hexter-NRT-1-1-'+cursor.strftime('%Y-%m-%d')+'.csv', header=True, index=False)
    cursor = cursor + timedelta(days=1)

print('Done')


In [None]:
# Step 2.2: generate accept rate NRT features
# Reads each day's 'report-trip-hexter-NRT' CSV, adds the cancel-rate
# columns and overwrites the same file.

START = datetime(2022, 7, 1)
END = datetime(2022,7, 15)

cursor = START
while cursor <= END:
    print(cursor)
    previous = cursor - timedelta(days=1)
    current_data = pandas.read_csv('Sample-1-1/report-trip-hexter-NRT-1-1-'+cursor.strftime('%Y-%m-%d')+'.csv', header=0)
    # NOTE(review): ``cursor.day == 1`` presumably stands for "first day of
    # the range, whose previous day has no NRT file" -- confirm before
    # changing START or spanning another month boundary.
    if cursor.day == 1 :
        previous_data = pandas.read_csv('Sample-1-1/report-hexter-1-1-'+previous.strftime('%Y-%m-%d')+'.csv', header=0)
    else:
        previous_data = pandas.read_csv('Sample-1-1/report-trip-hexter-NRT-1-1-'+previous.strftime('%Y-%m-%d')+'.csv', header=0)
    previous_data = previous_data[previous_data['hour_of_day']==23] # Only need the last hour of the previous day

    # Process one hour each time: the hour itself plus the hour it looks
    # back to.
    hourly_results = []
    for i in range(24):
        print(i)
        filtered = current_data[current_data['hour_of_day']==i]
        if i == 0:
            lookback = previous_data
        else:
            lookback = current_data[current_data['hour_of_day']==(i-1)]
        # ignore_index: rows coming from two different files can share index
        # labels, and GenerateAcceptRateNRT must see one label per row.
        filtered = pandas.concat([lookback, filtered], ignore_index=True)
        temp_result = GenerateAcceptRateNRT(filtered, 1, '1')
        hourly_results.append(temp_result[temp_result['hour_of_day']==i])
    # Concatenate once instead of growing a frame inside the loop.
    result = pandas.concat(hourly_results, ignore_index=True)
    result = result[result['datestr']==cursor.strftime('%Y-%m-%d')]
    result.to_csv('Sample-1-1/report-trip-hexter-NRT-1-1-'+cursor.strftime('%Y-%m-%d')+'.csv', header=True, index=False)
    cursor = cursor + timedelta(days=1)
print('Done')

In [None]:
# Step 2.3: Generate NRT features by timestamp
# Reads each day's 'report-trip-hexter-NRT' CSV, adds the 30-minute window
# averages and overwrites the same file.

START = datetime(2022, 7, 7)
END = datetime(2022,7, 15)



cursor = START
while cursor <= END:
    print(cursor)
    previous = cursor - timedelta(days=1)
    # 2022-06-30 is the only day read from the non-NRT file.
    if previous == datetime(2022, 6, 30):
        previous_data = pandas.read_csv('Sample-1-1/report-trip-hexter-1-1-'+previous.strftime('%Y-%m-%d')+'.csv', header=0)
    else:
        previous_data = pandas.read_csv('Sample-1-1/report-trip-hexter-NRT-1-1-'+previous.strftime('%Y-%m-%d')+'.csv', header=0)

    current_data = pandas.read_csv('Sample-1-1/report-trip-hexter-NRT-1-1-'+cursor.strftime('%Y-%m-%d')+'.csv', header=0)
    previous_data = previous_data[previous_data['hour_of_day']==23] # Only need the last hour of the previous day

    # ignore_index: rows coming from two different files can share index
    # labels, and GenerateNRTFeaturesByTimestamp must see one label per row.
    filtered = pandas.concat([previous_data, current_data], ignore_index=True)
    result = GenerateNRTFeaturesByTimestamp(filtered, 1, '1', original_features+trip_features, 30)
    result = result[result['datestr']==cursor.strftime('%Y-%m-%d')]
    result.to_csv('Sample-1-1/report-trip-hexter-NRT-1-1-'+cursor.strftime('%Y-%m-%d')+'.csv', header=True, index=False)
    cursor = cursor + timedelta(days=1)
print('Done')