In [2]:
import time
import warnings
from datetime import datetime as dt
from re import M
import ast

import git
import numpy as np
import pandas as pd
import scipy.stats as stats

import matplotlib as plt

In [None]:
# Show every column when a DataFrame is rendered (None disables the column limit).
pd.options.display.max_columns = None

In [3]:
# Resolve the repository root so that all data paths work regardless of the
# directory the notebook is started from.
repo = git.Repo(".", search_parent_directories=True).git.rev_parse(
    "--show-toplevel"
)

# Cleaned historical rides; they are the basis for all sampled distributions below.
# (The stop list `df_stops` is loaded once, in the following cell.)
df = pd.read_csv(f"{repo}/data/cleaning/test_1652280840.csv")

df.head(2)

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,...,cancellation_comment,sheet_name,file_name,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index
0,5727475e-8224-4302-9228-c92b9d4a5220,f8ff0526-887a-4e48-ad96-977e12fd70c1,5483.0,1.0,4.65,4.65,0.0,False,STANDARD,11009,...,,1,b'Rides_2021-07.xlsx',,,,,,,
1,18fec0a6-b7ba-442b-8472-04bdb6ba1b86,51e1a1a8-995c-488c-84ce-3789e46f0417,3575.0,1.0,0.0,2.77,0.0,False,BAHN_CARD,2007,...,,1,b'Rides_2021-07.xlsx',,,,,,,


In [4]:
# Stop master data (sheet "MoDstops"): one row per MoD stop.
df_stops = pd.read_excel(
    io=f"{repo}/data/other/MoDstops+Preismodell.xlsx",
    sheet_name="MoDstops",
)
df_stops.head(n=2)

Unnamed: 0,MoDStop Id,MoDStop Name,MoDStop Lat,MoDStop Long,MoDStop Adresse
0,1001,Mandelgasse,49.35178,8.129,"Seilerbahn 1, 67433 Neustadt"
1,1002,Hauptfeuerwache,49.353733,8.131552,"Lindenstraße 11, 67433 Neustadt"


In [5]:
# Route table (sheet "Liste 2022"): one row per start/end stop pair.
df_routes = pd.read_excel(
    io=f"{repo}/data/other/MoDstops+Preismodell.xlsx",
    sheet_name="Liste 2022",
)
df_routes.head(n=2)


Unnamed: 0,Start #,Start Name,Ende #,Ende Name,Route [m],Luftlinie [m],VRN-eTarif\nohne Bahncard,VRN-eTarif\nmit BahnCard,Qualitätszuschlag,MoD-Fahrpreis \nohne BahnCard,...,price_mod_3pers,price_mod_4pers,price_bahncard,price_bahncard_2pers,price_bahncard_3pers,price_bahncard_4pers,price_vrn_surcharge,price_vrn_surcharge_2pers,price_vrn_surcharge_3pers,price_vrn_surcharge_4pers
0,1001,Mandelgasse,1002,Hauptfeuerwache,366,285,1.65,1.24,0.9,2.55,...,6.0,7.725,2.14,3.865,5.18,6.7,0.9,2.625,3.94,5.46
1,1001,Mandelgasse,1003,Kindergarten St. Marien,994,586,1.65,1.24,0.9,2.55,...,6.0,7.725,2.14,3.865,5.18,6.7,0.9,2.625,3.94,5.46


In [6]:
# TODO: At the end, check whether too many of the generated rides are too short or too unlikely.
# TODO: Check that no generated ride has the same pickup and dropoff address.


def _withTimeFeatures(frame):
    """Return a copy of ``frame`` with 'day', 'hour' and 'workday' derived from 'scheduled_to'."""
    frame = frame.copy()
    frame['scheduled_to'] = pd.to_datetime(frame['scheduled_to'])
    frame['day'] = frame['scheduled_to'].dt.weekday  # 0 = Monday, 6 = Sunday
    frame['hour'] = frame['scheduled_to'].dt.hour
    # Monday to Friday count as workday, except Friday from 14:00 on (hour > 13).
    frame['workday'] = (
        frame['day'].isin([0, 1, 2, 3, 4])
        & ~((frame['day'] == 4) & (frame['hour'] > 13))
    )
    return frame


def _stopProbabilities(history, column, stop_ids):
    """Return one sampling probability per entry of ``stop_ids`` based on ``history[column]``."""
    counts = history[column].value_counts().reindex(stop_ids)
    if counts.notna().any():
        # stops that were never used get the smallest observed count
        counts = counts.fillna(counts.min())
    else:
        # no usable history in this window: fall back to a uniform distribution
        counts = counts.fillna(1.0)
    return (counts / counts.sum()).to_numpy()


def generateRoute(oldRides, newRides, ridestops, routes):
    """Sample pickup and dropoff stops for new rides from time-dependent stop distributions.

    Completed old rides are split into workday rides and weekend rides. For every
    departure hour h (0 and 7-23) the stop frequencies of the old rides within a
    window of hours around h are used as sampling distribution; each new ride
    scheduled in hour h gets its own random draw. New rides scheduled in the hours
    1-6 keep the addresses they already have.

    Parameters:
        oldRides:  DataFrame with columns 'state', 'scheduled_to', 'pickup_address',
                   'dropoff_address'.
        newRides:  DataFrame with columns 'scheduled_to', 'pickup_address',
                   'dropoff_address'.
        ridestops: DataFrame with column 'MoDStop Id'; only these stops are sampled.
        routes:    DataFrame with columns 'Start #', 'Ende #' and 'Route [m]'.

    Returns:
        DataFrame indexed like ``newRides`` with the columns 'pickup_address',
        'dropoff_address', 'distance' and 'shortest_ridetime' (seconds at 30 km/h).
        'distance' is NaN when ``routes`` has no entry for the sampled stop pair.
    """
    average_speed_kmh = 30
    address_columns = ['pickup_address', 'dropoff_address']

    newRideStops = _withTimeFeatures(newRides[['scheduled_to'] + address_columns])
    oldRidestops = _withTimeFeatures(oldRides[['state', 'scheduled_to'] + address_columns])

    completed = oldRidestops[oldRidestops['state'] == 'completed']
    history_by_daytype = {
        True: completed[completed['workday']],    # workday rides
        False: completed[~completed['workday']],  # weekend rides
    }
    stop_ids = pd.Index(ridestops['MoDStop Id'].dropna().unique())

    for h in [0] + list(range(7, 24)):  # rides start between 7:00 and 0:59
        # hours used to build the stop distribution for departure hour h
        if h in (23, 0):
            timeframe = [22, 23, 0]
        elif h == 7:
            timeframe = [7, 8, 9]
        else:
            timeframe = [h - 1, h, h + 1]

        scheduled_in_hour = newRideStops['hour'] == h
        for is_workday, history in history_by_daytype.items():
            selected = (scheduled_in_hour & (newRideStops['workday'] == is_workday)).to_numpy()
            n_selected = int(selected.sum())
            if n_selected == 0:
                continue
            window = history[history['hour'].isin(timeframe)]
            for column in address_columns:
                probabilities = _stopProbabilities(window, column, stop_ids)
                # size=n_selected: every ride gets its own draw instead of one shared stop
                newRideStops.loc[selected, column] = np.random.choice(
                    stop_ids.to_numpy(), p=probabilities, size=n_selected
                )

    for column in address_columns:
        newRideStops[column] = newRideStops[column].infer_objects()

    # Look up the route length per stop pair. Duplicate route rows are dropped and the
    # result is assigned by position so that it cannot be misaligned with newRideStops.
    route_lengths = routes[['Start #', 'Ende #', 'Route [m]']].drop_duplicates(subset=['Start #', 'Ende #'])
    matched = newRideStops[address_columns].merge(
        route_lengths, left_on=address_columns, right_on=['Start #', 'Ende #'], how='left'
    )
    newRideStops['distance'] = matched['Route [m]'].to_numpy()
    # travel time in seconds for the route length (metres) at the average speed
    newRideStops['shortest_ridetime'] = newRideStops['distance'] / 1000 / average_speed_kmh * 3600
    return newRideStops[['pickup_address', 'dropoff_address', 'distance', 'shortest_ridetime']]



In [51]:
def generateRoute_simple(oldRides, newRides, ridestops, routes):
    """Sample pickup and dropoff stops for new rides from the overall stop frequencies.

    For each of the two address columns the frequency of every stop listed in
    ``ridestops`` is counted in ``oldRides``; stops that never occur get the smallest
    observed count so that they can still be drawn. One stop per new ride is then
    sampled from the resulting distribution.

    Parameters:
        oldRides:  DataFrame with columns 'pickup_address' and 'dropoff_address'.
        newRides:  DataFrame whose number of rows and index define the result.
        ridestops: DataFrame with column 'MoDStop Id'.
        routes:    DataFrame with columns 'Start #', 'Ende #' and 'Route [m]'.

    Returns:
        DataFrame indexed like ``newRides`` with the columns 'pickup_address',
        'dropoff_address', 'distance' and 'shortest_ridetime' (seconds at 30 km/h).
        'distance' is NaN when ``routes`` has no entry for the sampled stop pair.
    """
    average_speed_kmh = 30
    address_columns = ['pickup_address', 'dropoff_address']
    n_rides = newRides.shape[0]
    stop_ids = ridestops['MoDStop Id']

    newRideStops = pd.DataFrame(index=newRides.index)
    for column in address_columns:
        # frequency of each known stop in the old rides (the parameter, not a global frame)
        counts = stop_ids.map(oldRides[column].value_counts())
        if counts.notna().any():
            counts = counts.fillna(counts.min())
        else:
            # no old ride uses a known stop: fall back to a uniform distribution
            counts = counts.fillna(1.0)
        probabilities = (counts / counts.sum()).to_numpy()
        newRideStops[column] = np.random.choice(stop_ids.to_numpy(), p=probabilities, size=n_rides)

    # Look up the route length per stop pair. Duplicate route rows are dropped and the
    # result is assigned by position so that it cannot be misaligned with newRideStops.
    route_lengths = routes[['Start #', 'Ende #', 'Route [m]']].drop_duplicates(subset=['Start #', 'Ende #'])
    matched = newRideStops[address_columns].merge(
        route_lengths, left_on=address_columns, right_on=['Start #', 'Ende #'], how='left'
    )
    newRideStops['distance'] = matched['Route [m]'].to_numpy()
    # travel time in seconds for the route length (metres) at the average speed
    newRideStops['shortest_ridetime'] = newRideStops['distance'] / 1000 / average_speed_kmh * 3600
    return newRideStops[['pickup_address', 'dropoff_address', 'distance', 'shortest_ridetime']]

In [8]:
def generateCreatedAt(oldRides, newRides, m, y):
    """Sample a creation timestamp inside month ``m`` of year ``y`` for every new ride.

    The calendar day is drawn according to the weekday frequencies of
    ``oldRides['created_at']``, the hour according to the hour frequencies of that
    weekday, the minute according to the overall minute frequencies, and the second
    uniformly from 0-59.

    Parameters:
        oldRides: DataFrame with a 'created_at' column (anything pd.to_datetime parses).
        newRides: DataFrame whose number of rows is the number of timestamps to draw.
        m:        month (1-12) of the simulated period.
        y:        year of the simulated period.

    Returns:
        Series named 'created_at' with a default integer index, ordered by calendar
        day (the time of day within one day is not sorted).

    Raises:
        ValueError: if ``oldRides`` contains no usable 'created_at' value.
    """
    n_rides = newRides.shape[0]

    # all calendar days of the simulated month
    month_start = pd.Timestamp(year=y, month=m, day=1)
    days = pd.date_range(start=month_start, periods=month_start.days_in_month, freq='D')

    # empirical distributions taken from the old rides passed in
    created = pd.to_datetime(oldRides['created_at']).dropna()
    if created.empty:
        raise ValueError("oldRides contains no usable 'created_at' values")
    weekday_share = created.dt.weekday.value_counts(normalize=True)
    minute_share = created.dt.minute.value_counts(normalize=True).sort_index()
    hour_share = {
        weekday: hours.value_counts(normalize=True).sort_index()
        for weekday, hours in created.dt.hour.groupby(created.dt.weekday)
    }

    # probability of each calendar day = share of its weekday, renormalised to sum to 1;
    # weekdays without any old ride get probability 0
    day_weights = pd.Series(days.weekday, index=days).map(weekday_share).fillna(0.0)
    day_probabilities = (day_weights / day_weights.sum()).to_numpy()

    dates = pd.DatetimeIndex(np.sort(np.random.choice(days.to_numpy(), p=day_probabilities, size=n_rides)))

    # the hour distribution depends on the weekday of the drawn date
    hours = np.zeros(n_rides, dtype=int)
    for weekday, share in hour_share.items():
        on_weekday = np.asarray(dates.weekday == weekday)
        hours[on_weekday] = np.random.choice(share.index.to_numpy(), p=share.to_numpy(), size=int(on_weekday.sum()))
    minutes = np.random.choice(minute_share.index.to_numpy(), p=minute_share.to_numpy(), size=n_rides)
    seconds = np.random.randint(0, 60, size=n_rides)

    created_at = (
        dates
        + pd.to_timedelta(hours, unit='h')
        + pd.to_timedelta(minutes, unit='m')
        + pd.to_timedelta(seconds, unit='s')
    )
    return pd.Series(created_at, name='created_at')


In [9]:
def generateScheduledTo(df, newRides):
    """Derive a scheduled departure time for every new ride from its creation time.

    A ride gets a prebooking offset when it was created between 01:00 and 06:59 or
    when a random draw (with the share of prebooked rides in ``df``) marks it as
    prebooked. The offset in seconds is drawn from a normal distribution fitted to
    the prebooking times in ``df`` and truncated to [1, max observed]; the shifted
    time is rounded to 5 minutes. All other rides are scheduled at their creation
    time. Finally, every scheduled time that falls between 01:00 and 06:59 is moved
    to 07:00 of the same day, because no rides take place in that period.

    Parameters:
        df:       old rides with columns 'created_at' and 'scheduled_to'.
        newRides: new rides with a datetime-like column 'created_at'.

    Returns:
        List of pandas Timestamps, one per row of ``newRides`` in row order.
    """
    blocked_hours = [1, 2, 3, 4, 5, 6]  # we have no rides before 7
    min_prebook_seconds = 1

    created = pd.DatetimeIndex(pd.to_datetime(newRides['created_at']))
    n_rides = len(created)

    # prebooking time of the old rides in seconds (0 = ride was not prebooked)
    prebook_seconds = (
        pd.to_datetime(df['scheduled_to']) - pd.to_datetime(df['created_at'])
    ).dt.total_seconds().dropna()
    prebooked = prebook_seconds[prebook_seconds != 0]
    share_prebooked = len(prebooked) / len(prebook_seconds) if len(prebook_seconds) else 0.0

    # prebooking offsets for the new rides
    if prebooked.empty:
        offsets = np.zeros(n_rides)
    else:
        mean, std, upper = prebooked.mean(), prebooked.std(), prebooked.max()
        if np.isfinite(std) and std > 0 and upper > min_prebook_seconds:
            prebook_dist = stats.truncnorm(
                (min_prebook_seconds - mean) / std, (upper - mean) / std, loc=mean, scale=std
            )
            offsets = prebook_dist.rvs(size=n_rides)
        else:
            # a single (or constant) observation cannot be fitted: use it directly
            offsets = np.full(n_rides, max(float(mean), 0.0))

    drawn_as_prebooked = np.random.random(n_rides) < share_prebooked
    created_in_blocked_hours = np.asarray(created.hour.isin(blocked_hours))
    gets_offset = created_in_blocked_hours | drawn_as_prebooked

    shifted = (created + pd.to_timedelta(offsets, unit='s')).round('5min')
    scheduled = created.where(~gets_offset, shifted)

    # move departures that still fall into the blocked hours to 07:00 of that day
    in_blocked_hours = np.asarray(scheduled.hour.isin(blocked_hours))
    scheduled = scheduled.where(~in_blocked_hours, scheduled.normalize() + pd.Timedelta(hours=7))

    return list(scheduled)

In [10]:
def generateValues(column_name, df, newRides):
    """Draw one value per row of ``newRides`` from the empirical distribution of ``df[column_name]``.

    Each distinct non-missing value of the column is drawn with a probability equal
    to its relative frequency. Returns a numpy array with ``newRides.shape[0]`` entries.
    """
    frequencies = df[column_name].value_counts()
    candidates = frequencies.index.to_numpy()
    weights = (frequencies / frequencies.sum()).to_numpy()
    sample_size = newRides.shape[0]
    return np.random.choice(candidates, size=sample_size, p=weights)


In [52]:
def generateRideSpecs(oldRides, newRides, ridestops, routes, n, month, year):
    """Fill ``newRides`` in place with ``n`` synthetic rides modelled on ``oldRides`` and return it.

    Ids are built from the current Unix time, categorical attributes are sampled with
    generateValues, timestamps with generateCreatedAt / generateScheduledTo and the
    route with generateRoute_simple. Prices are not generated.
    """
    run_stamp = str(round(time.time()))
    ride_numbers = [str(number) for number in range(n)]
    newRides['id'] = [f"{run_stamp}-{number}" for number in ride_numbers]
    # TODO: one customer should be able to have several rides (currently one user per ride)
    newRides['user_id'] = [f"{number}-{run_stamp}" for number in ride_numbers]

    for column in ('number_of_passenger', 'free_ride', 'payment_type'):
        newRides[column] = generateValues(column, oldRides, newRides)
    newRides['state'] = 'completed'
    # Open question for 'rating': insert random ratings for rides that have not been
    # rated so far, or drop the rating entirely?
    for column in ('arrival_indicator', 'rating'):
        newRides[column] = generateValues(column, oldRides, newRides)

    newRides['created_at'] = generateCreatedAt(oldRides, newRides, month, year)
    newRides['scheduled_to'] = generateScheduledTo(oldRides, newRides)

    # The time-dependent generateRoute(oldRides, newRides, ridestops, routes) is the
    # alternative to the simple variant used here.
    route_columns = ['pickup_address', 'dropoff_address', 'distance', 'shortest_ridetime']
    newRides[route_columns] = generateRoute_simple(oldRides, newRides, ridestops, routes)

    return newRides


In [53]:
# Generate 10000 synthetic rides for June 2022, starting from an empty frame that
# has the same columns as the historical rides.
newRides = generateRideSpecs(
    oldRides=df,
    newRides=pd.DataFrame(columns=df.columns),
    ridestops=df_stops,
    routes=df_routes,
    n=10000,
    month=6,
    year=2022,
)
newRides.head(n=3)

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,...,cancellation_comment,sheet_name,file_name,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index
0,1657530195-0,0-1657530195,5128.0,1.0,,,,False,STANDARD,10003,...,,,,,,,,,,
1,1657530195-1,1-1657530195,6744.0,1.0,,,,False,BAHN_CARD,6004,...,,,,,,,,,,
2,1657530195-2,2-1657530195,2700.0,1.0,,,,False,VRN,3001,...,,,,,,,,,,


In [54]:
# Write the generated rides to an Excel file in the current working directory.
newRides.to_excel(excel_writer="test_address_distribution.xlsx")