In [300]:
import time
import warnings
from datetime import datetime as dt
from re import M

import git
import numpy as np
import pandas as pd
import scipy.stats as stats

import matplotlib as plt

In [253]:
# Resolve the top-level directory of the enclosing git repository (searching
# parent directories) so the data paths below are built from the repo root.
repo = git.Repo(".", search_parent_directories=True).git.rev_parse(
    "--show-toplevel"
)
# Ride table read from the CSV under data/cleaning; the generator functions in
# this notebook take this frame as their historical input.
df = pd.read_csv(f"{repo}/data/cleaning/test_1652280840.csv") 
# "MoDstops" sheet of the workbook under data/other.
# NOTE(review): presumably the stop list for route generation — not used in the cells visible here.
df_stops = pd.read_excel(
    f"{repo}/data/other/MoDstops+Preismodell.xlsx", sheet_name="MoDstops"
)

# Preview the first two rows of the ride table.
df.head(2)

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,...,cancellation_comment,sheet_name,file_name,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index
0,5727475e-8224-4302-9228-c92b9d4a5220,f8ff0526-887a-4e48-ad96-977e12fd70c1,5483.0,1.0,4.65,4.65,0.0,False,STANDARD,11009,...,,1,b'Rides_2021-07.xlsx',,,,,,,
1,18fec0a6-b7ba-442b-8472-04bdb6ba1b86,51e1a1a8-995c-488c-84ce-3789e46f0417,3575.0,1.0,0.0,2.77,0.0,False,BAHN_CARD,2007,...,,1,b'Rides_2021-07.xlsx',,,,,,,


In [240]:
def generateCreatedAt(oldRides, newRides, m, y):
    """Sample synthetic ``created_at`` timestamps for one calendar month.

    The weekday, hour-per-weekday and minute frequencies observed in
    ``oldRides['created_at']`` serve as empirical probability distributions;
    the second is drawn uniformly from 0-59. Sampling uses NumPy's global
    random state, so seed it beforehand for reproducible output.

    Parameters
    ----------
    oldRides : pandas.DataFrame
        Historical rides. Only the ``created_at`` column is read; missing
        timestamps are ignored.
    newRides : pandas.DataFrame
        Frame the result is meant for. Only its row count is used.
    m, y : int
        Month (1-12) and year to simulate.

    Returns
    -------
    pandas.Series
        ``newRides.shape[0]`` timestamps named ``created_at`` on a fresh
        0..n-1 index, ordered by calendar day (time of day is random).

    Raises
    ------
    ValueError
        If ``oldRides`` holds no usable ``created_at`` value.
    """
    # Read the history from the argument rather than a notebook global so the
    # function works for whichever frame the caller passes in.
    history = pd.to_datetime(oldRides['created_at']).dropna()
    if history.empty:
        raise ValueError("oldRides contains no usable 'created_at' timestamps")
    created = pd.DataFrame({
        'day': history.dt.weekday,
        'hour': history.dt.hour,
        'minute': history.dt.minute,
    })

    # Weekday distribution of old rides; weekdays never observed get
    # probability 0 instead of breaking the lookup further down.
    weekday_prob = (
        created['day'].value_counts(normalize=True).reindex(range(7), fill_value=0.0)
    )

    # Hour distribution of old rides, separately for every observed weekday.
    hour_prob = {
        int(day): hours.value_counts(normalize=True).sort_index()
        for day, hours in created.groupby('day')['hour']
    }

    # Minute distribution of old rides (shared by all weekdays).
    minute_prob = created['minute'].value_counts(normalize=True).sort_index()

    # Every calendar day of the simulated month, weighted by its weekday's
    # probability. Renormalise because the month does not contain each
    # weekday equally often.
    month_start = pd.Timestamp(year=y, month=m, day=1)
    month_days = pd.date_range(
        start=month_start, periods=month_start.days_in_month, freq='D'
    )
    day_prob = weekday_prob.to_numpy()[np.asarray(month_days.weekday)]
    day_prob = day_prob / day_prob.sum()

    # Draw one calendar day per new ride and order them chronologically.
    sampled_days = np.random.choice(
        month_days.to_numpy(), p=day_prob, size=newRides.shape[0]
    )
    created_at = pd.Series(np.sort(sampled_days), name='created_at')

    def _time_of_day(weekday):
        """Draw a random hour/minute/second offset for a ride on ``weekday``."""
        hours = hour_prob[int(weekday)]
        return pd.Timedelta(
            hours=int(np.random.choice(hours.index.to_numpy(), p=hours.to_numpy())),
            minutes=int(
                np.random.choice(minute_prob.index.to_numpy(), p=minute_prob.to_numpy())
            ),
            seconds=int(np.random.choice(60)),
        )

    offsets = pd.Series(
        [_time_of_day(day) for day in created_at.dt.weekday],
        index=created_at.index,
        dtype='timedelta64[ns]',
    )
    return (created_at + offsets).rename('created_at')


In [321]:

# Exploratory view of prebooking behaviour: flag rides whose raw `scheduled_to`
# differs from `created_at`, parse both columns, and derive the lead time in
# seconds between booking and scheduled pickup.
scheduled = (
    df[['created_at', 'scheduled_to']]
    # The flag is computed on the raw (unparsed) column values.
    .assign(isScheduled=lambda rides: rides['created_at'] != rides['scheduled_to'])
    .assign(
        created_at=lambda rides: pd.to_datetime(rides['created_at']),
        scheduled_to=lambda rides: pd.to_datetime(rides['scheduled_to']),
    )
    .assign(
        prebook_time=lambda rides: (
            rides['scheduled_to'] - rides['created_at']
        ).dt.total_seconds()
    )
)
scheduled

Unnamed: 0,created_at,scheduled_to,isScheduled,prebook_time
0,2021-06-30 23:12:47,2021-07-01 07:30:00,True,29833.0
1,2021-06-30 22:51:53,2021-07-01 08:50:00,True,35887.0
2,2021-07-01 09:21:39,2021-07-01 09:21:39,False,0.0
3,2021-07-01 11:24:01,2021-07-01 11:24:01,False,0.0
4,2021-07-01 13:12:19,2021-07-01 13:12:19,False,0.0
...,...,...,...,...
10329,2022-02-28 22:21:09,2022-02-28 23:00:00,True,2331.0
10330,2021-09-26 17:42:36,2021-09-27 08:20:00,True,52644.0
10331,2021-09-04 22:25:06,2021-09-05 09:30:00,True,39894.0
10332,2021-10-01 17:28:51,2021-10-02 17:40:00,True,87069.0


In [335]:
def generateScheduledTo(df, newRides):
    """Generate a ``scheduled_to`` timestamp for every ride in ``newRides``.

    A ride is prebooked with the share of historical rides whose
    ``scheduled_to`` differs from ``created_at``. Prebooked rides get
    ``created_at`` plus a lead time drawn from a normal distribution fitted
    to the historical lead times (truncated to ``[1 s, historical max]``),
    rounded to the nearest 5 minutes. All other rides keep ``created_at``.
    Sampling uses NumPy's global random state.

    Parameters
    ----------
    df : pandas.DataFrame
        Historical rides with ``created_at`` and ``scheduled_to`` columns.
    newRides : pandas.DataFrame
        New rides; ``created_at`` must already hold timestamps.

    Returns
    -------
    list
        One timestamp per row of ``newRides``, in row order.
    """
    # Compare parsed timestamps so formatting differences in the raw values
    # cannot make an immediate ride look prebooked.
    history_created = pd.to_datetime(df['created_at'])
    history_scheduled = pd.to_datetime(df['scheduled_to'])
    is_scheduled = history_created != history_scheduled
    prebook_seconds = (history_scheduled - history_created).dt.total_seconds()[is_scheduled]

    # Share of prebooked rides in the history (0 when there is no history).
    p_scheduled = float(is_scheduled.mean()) if len(is_scheduled) else 0.0

    created = list(newRides['created_at'])
    # Decide for all rides at once which ones are prebooked.
    prebooked = np.random.random(len(created)) < p_scheduled
    n_prebooked = int(prebooked.sum())

    # Lead-time distribution of the prebooked historical rides.
    mean = prebook_seconds.mean()
    std = prebook_seconds.std()
    lower = 1
    upper = prebook_seconds.max()
    if n_prebooked == 0:
        delays = np.empty(0)
    elif np.isfinite(std) and std > 0:
        # truncnorm takes its bounds in units of standard deviations from loc.
        lead_time = stats.truncnorm(
            (lower - mean) / std, (upper - mean) / std, loc=mean, scale=std
        )
        delays = lead_time.rvs(n_prebooked)
    else:
        # Only one (or identical) historical lead times: no spread can be
        # estimated, so reuse the observed mean.
        delays = np.full(n_prebooked, mean)

    next_delay = iter(delays)
    # '5min' is the non-deprecated spelling of the former '5T' alias.
    return [
        (ts + pd.Timedelta(float(next(next_delay)), unit='seconds')).round(freq='5min')
        if flag
        else ts
        for ts, flag in zip(created, prebooked)
    ]

In [241]:
def generateValues(column_name, df, newRides):
    """Draw one value per row of ``newRides`` from the empirical distribution
    of ``df[column_name]``.

    Each distinct value is drawn with probability equal to its relative
    frequency in ``df[column_name]`` as counted by ``value_counts()``.
    Returns a NumPy array of length ``newRides.shape[0]``.
    """
    frequencies = df[column_name].value_counts()
    categories = frequencies.index.to_numpy()
    weights = (frequencies / frequencies.sum()).to_numpy()
    n_samples = newRides.shape[0]
    return np.random.choice(categories, size=n_samples, p=weights)


In [336]:
def generateRideSpecs(newRides, oldRides, n, m=2, y=2022):
    """Fill ``newRides`` with ``n`` synthetic rides modelled on ``oldRides``.

    Parameters
    ----------
    newRides : pandas.DataFrame
        Frame to populate. It is modified in place and also returned.
    oldRides : pandas.DataFrame
        Historical rides whose column distributions are sampled.
    n : int
        Number of rides to generate.
    m, y : int, optional
        Month and year the rides are created in (default: February 2022).

    Returns
    -------
    pandas.DataFrame
        The same ``newRides`` object with the generated columns set.
    """
    # The generation time makes the ids distinct from those of other runs
    # (at one-second resolution).
    timestamp = str(round(time.time()))
    newRides['id'] = [f"{timestamp}-{i}" for i in range(n)]
    newRides['user_id'] = [f"{i}-{timestamp}" for i in range(n)]

    # Attributes sampled independently from their historical frequencies.
    newRides['number_of_passenger'] = generateValues('number_of_passenger', oldRides, newRides)
    newRides['free_ride'] = generateValues('free_ride', oldRides, newRides)
    newRides['payment_type'] = generateValues('payment_type', oldRides, newRides)
    newRides['state'] = 'completed'
    newRides['arrival_indicator'] = generateValues('arrival_indicator', oldRides, newRides)
    # Open question: also assign random ratings to rides that were not rated so far?
    newRides['rating'] = generateValues('rating', oldRides, newRides)

    # scheduled_to depends on created_at, so the order of these two matters.
    newRides['created_at'] = generateCreatedAt(oldRides, newRides, m, y)
    newRides['scheduled_to'] = generateScheduledTo(oldRides, newRides)

    # TODO: fill pickup_address, dropoff_address, distance, price_operations,
    # price_offer and price_payed (more columns?) via
    # generateRoute(oldRides, newRides) once that helper exists.
    return newRides


In [339]:
# Start from an empty frame that has the same columns as the historical rides
# and let generateRideSpecs populate it with 10 synthetic rides.
newRides = generateRideSpecs(pd.DataFrame(columns=df.columns), df, 10)
newRides

Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,pickup_address,...,cancellation_comment,sheet_name,file_name,bahn_card_number,year_card_type,year_card_number,canceled_at,rating_question_one,rating_question_two,index
0,1653256109-0,0-1653256109,,1.0,,,,False,BAHN_CARD,,...,,,,,,,,,,
1,1653256109-1,1-1653256109,,1.0,,,,True,VRN,,...,,,,,,,,,,
2,1653256109-2,2-1653256109,,1.0,,,,False,STANDARD,,...,,,,,,,,,,
3,1653256109-3,3-1653256109,,1.0,,,,False,VRN,,...,,,,,,,,,,
4,1653256109-4,4-1653256109,,1.0,,,,False,VRN,,...,,,,,,,,,,
5,1653256109-5,5-1653256109,,2.0,,,,False,BAHN_CARD,,...,,,,,,,,,,
6,1653256109-6,6-1653256109,,1.0,,,,False,VRN,,...,,,,,,,,,,
7,1653256109-7,7-1653256109,,1.0,,,,False,STANDARD,,...,,,,,,,,,,
8,1653256109-8,8-1653256109,,1.0,,,,False,STANDARD,,...,,,,,,,,,,
9,1653256109-9,9-1653256109,,2.0,,,,False,STANDARD,,...,,,,,,,,,,
