In [1]:
import pandas as pd
import numpy as np
import git
from datetime import datetime as dt


In [49]:
# Ask git for the top-level directory of the enclosing working tree
# (searching parent directories), so the data path below is built from the
# repository root rather than from the notebook's own location.
git_cli = git.Repo(".", search_parent_directories=True).git
repo = git_cli.rev_parse("--show-toplevel")

# Read the combined rides CSV and show its column labels as the cell output.
rides_csv = f"{repo}/data/rides_combined.csv"
orig_rides_df = pd.read_csv(rides_csv)
orig_rides_df.columns

  exec(code_obj, self.user_global_ns, self.user_ns)


Index(['Unnamed: 0', 'id', 'user_id', 'distance', 'number_of_passenger',
       'price_operations', 'price_offer', 'price_payed', 'free_ride',
       'payment_type', 'pickup_address', 'dropoff_address', 'state',
       'created_from_offer', 'created_at', 'scheduled_to', 'dispatched_at',
       'pickup_arrival_time', 'arriving_push', 'vehicle_arrived_at',
       'earliest_pickup_expectation', 'pickup_first_eta', 'pickup_eta',
       'pickup_at', 'dropoff_first_eta', 'dropoff_eta', 'dropoff_at',
       'updated_at', 'arrival_deviation', 'waiting_time', 'boarding_time',
       'ride_time', 'trip_time', 'shortest_ridetime', 'delay',
       'longer_route_factor', 'arrival_indicator', 'rating',
       'rating_puenktlichkeit', 'rating_sauberkeit', 'rating_fahrer',
       'rating_find_modstop', 'rating_other_comments', 'cancellation_reason',
       'cancellation_comment', 'bahn_card_number', 'year_card_type',
       'year_card_number', 'canceled_at', 'rating_question_one',
       'rating_quest

In [3]:
# Build the three workbook paths under <repo>/data/vehicle_data first ...
external_xlsx = f"{repo}/data/vehicle_data/Autofleet_Rides with External ID_2021+2022-05-15.xlsx"
raw_xlsx = f"{repo}/data/vehicle_data/MoD_Raw Data_2021+2022-05-15.xlsx"
vehicle_usage_xlsx = f"{repo}/data/vehicle_data/MoD_Vehicle Usage_2021+2022-05-15.xlsx"

# ... then read each workbook into its own untouched "orig_" frame.
orig_external_df = pd.read_excel(external_xlsx)
orig_raw_df = pd.read_excel(raw_xlsx)
orig_vehicle_usage_df = pd.read_excel(vehicle_usage_xlsx)


In [50]:
# Print the (rows, columns) shape of every loaded frame, one line per frame.
for frame_name, frame in (
    ("orig_rides_df", orig_rides_df),
    ("orig_external_df", orig_external_df),
    ("orig_raw_df", orig_raw_df),
    ("orig_vehicle_usage_df", orig_vehicle_usage_df),
):
    print(f'Shape {frame_name}: {frame.shape}')

Shape orig_rides_df: (18980, 51)
Shape orig_external_df: (18148, 32)
Shape orig_raw_df: (36393, 36)
Shape orig_vehicle_usage_df: (173041, 12)


In [51]:
# Work on independent copies of the loaded frames. A plain assignment would
# only create a second name for the same object, so any in-place change made
# to a working frame would also change the corresponding "orig_" frame.
rides_df = orig_rides_df.copy()
external_df = orig_external_df.copy()
raw_df = orig_raw_df.copy()
vehicle_usage_df = orig_vehicle_usage_df.copy()

In [6]:
# Pairwise id-overlap checks. Each line prints the distinct truth values of an
# element-wise membership test: `[False]` means no value of the left column
# occurs in the right one, a result containing `True` means at least one does.
# NOTE: `isin` treats NaN as matching NaN, so a `True` can stem purely from
# missing ids on both sides rather than from a real id match.
print(rides_df['id'].isin(external_df['External Id']).unique())
print(rides_df['id'].isin(external_df['Id']).unique())
print(rides_df['id'].isin(vehicle_usage_df['Ride Id']).unique())  # True may be NaN-vs-NaN only
print(vehicle_usage_df['Ride Id'].isin(external_df['External Id']).unique())  # True may be NaN-vs-NaN only
print(vehicle_usage_df['Ride Id'].isin(external_df['Id']).unique())
# Passing the DataFrame itself to `isin` would iterate over its column labels,
# so the test could never succeed. Compare against every cell value instead.
# NOTE(review): assumes the intent was "does the vehicle id occur anywhere in
# external_df" -- confirm, or replace with the specific column that was meant.
print(vehicle_usage_df['Vehicle Id'].isin(external_df.to_numpy().ravel()).unique())

[ True False]
[False]
[False  True]
[False  True]
[False  True]
[False]


In [7]:
def _rows_with_key_in(frame, key_column, candidates):
    """Return the rows of `frame` whose `key_column` value occurs in `candidates`."""
    return frame[frame[key_column].isin(candidates)]


# Rides whose id occurs in each of the other id columns.
filt_rides = _rows_with_key_in(rides_df, 'id', external_df['External Id'])
filt_rides_2 = _rows_with_key_in(rides_df, 'id', vehicle_usage_df['Ride Id'])
filt_rides_3 = _rows_with_key_in(rides_df, 'id', raw_df['Ride External Id'])

# Vehicle-usage rows whose "Ride Id" occurs in each of the other id columns.
filt_vehicle_usage = _rows_with_key_in(vehicle_usage_df, 'Ride Id', external_df['Id'])
filt_vehicle_usage_2 = _rows_with_key_in(vehicle_usage_df, 'Ride Id', external_df['External Id'])
filt_vehicle_usage_3 = _rows_with_key_in(vehicle_usage_df, 'Ride Id', raw_df['Ride Id'])


# `.count()` reports the number of non-missing ids; `.unique()` lists the
# distinct matched values instead.
print(f"Matches between combined rides and autofleet_external id: {filt_rides['id'].count()}")
print(f"Matches between combined rides and raw_id: {filt_rides_3['id'].count()}")
print(f"Match values between combined_rides and vehicle_usage: {filt_rides_2['id'].unique()}")
print(f"Matches between vehicle_usage and autofleet_id: {filt_vehicle_usage['Ride Id'].count()}")
print(f"Matches between vehicle_usage and raw_id: {filt_vehicle_usage_3['Ride Id'].count()}")
print(f"Match values between vehicle_usage_id and autofleet_external id: {filt_vehicle_usage_2['Ride Id'].unique()}")

Matches between combined rides and autofleet_external id: 9496
Matches between combined rides and raw_id: 9496
Match values between combined_rides and vehicle_usage: [nan]
Matches between vehicle_usage and autofleet_id: 103910
Matches between vehicle_usage and raw_id: 103910
Match values between vehicle_usage_id and autofleet_external id: [nan]


In [8]:
# Print the column labels of external_df and vehicle_usage_df, in that order
# (the raw_df columns are not printed).
for frame in (external_df, vehicle_usage_df):
    print(frame.columns)

Index(['Id', 'External Id', 'Ride Type', 'Matching Type', 'Dispatch Type',
       'Schedule Time (UTC)', 'Status', 'Rejection reason',
       'Contact person/passenger name', 'Contact person/passenger phone',
       'Demand Source name', 'Driver name', 'Driver external ID',
       'Vehicle plate', 'Price', 'Currency', 'Pooling', 'Rating',
       'Arrived to Pickup (UTC)', 'Pickup Completed (UTC)', 'Pickup address',
       'Pickup coordinates', 'Arrived to Dropoff (UTC)',
       'Dropoff Completed (UTC)', 'Dropoff address', 'Dropoff coordinates',
       'Actual Duration of Ride (min)', 'Planned Distance (m)',
       'Number Of Passengers', 'Number Of Items', 'Created by',
       'Created at (UTC)'],
      dtype='object')
Index(['Vehicle Id', 'Ride Type', 'Ride Id', 'Stop Point Type',
       'Stop Point Id', 'Stop Point status', 'Stop point completed (UTC)',
       'Vehicle plate', 'Lat', 'Lng', 'Actual Distance of Ride (m)',
       'Odometer Reading (m)'],
      dtype='object')


In [9]:
# vehicle_usage_df preprocessing - filtering on Stop Point type and status + drop remaining duplicates

# Only stop points that are drop-offs and have been completed are kept.
is_completed_dropoff = (
    (vehicle_usage_df["Stop Point Type"] == "dropoff")
    & (vehicle_usage_df["Stop Point status"] == "completed")
)

# Each step returns a new frame instead of mutating a filtered slice in place,
# which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
merge_vehicle_df = (
    vehicle_usage_df[is_completed_dropoff]
    .dropna(subset=["Ride Id"])            # rows without a ride id cannot be joined
    .sort_values(by="Vehicle Id")
    .drop_duplicates(subset=["Ride Id"])   # keep the first row per ride id after sorting
)
print(f"shape merge_vehicle_df: {merge_vehicle_df.shape}")
merge_vehicle_df

shape merge_vehicle_df: (14909, 12)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,Vehicle Id,Ride Type,Ride Id,Stop Point Type,Stop Point Id,Stop Point status,Stop point completed (UTC),Vehicle plate,Lat,Lng,Actual Distance of Ride (m),Odometer Reading (m)
21221,54fc7c8b-940f-4ba6-abda-9237be36e57a,passenger,37e102b9-5f5d-4938-88fd-0e16884c63af,dropoff,57679c3c-5b7b-42b1-b1e3-2751412b941a,completed,2021-12-04 09:09:07,NW-MD-3E,4.934618e+15,81419394493722,2488.0,94246180.0
23503,54fc7c8b-940f-4ba6-abda-9237be36e57a,passenger,50681c5c-4cb8-4d04-b80b-ea06c152f735,dropoff,11a564ac-fcd4-469e-ad4f-cc782efd6df6,completed,2021-11-09 07:55:12,NW-MD-3E,4.933748e+08,81315586,5725.0,94246180.0
23501,54fc7c8b-940f-4ba6-abda-9237be36e57a,passenger,06f0080a-fbc5-4cc7-8120-e92c15e759a6,dropoff,e48c008e-8601-4931-a906-ede47af60787,completed,2021-11-09 08:12:23,NW-MD-3E,4.935402e+08,81351358,3721.0,94246180.0
23499,54fc7c8b-940f-4ba6-abda-9237be36e57a,passenger,94a3310b-7e62-4e38-809d-b0f6103e8f04,dropoff,f7bda699-f311-4279-8950-51554cf39079,completed,2021-11-09 08:43:52,NW-MD-3E,4.934243e+08,81548779,3392.0,94246180.0
23497,54fc7c8b-940f-4ba6-abda-9237be36e57a,passenger,6470270d-975c-4b6a-b098-adf7e0050521,dropoff,e3d8e88a-cb30-4c06-b308-277a305b644e,completed,2021-11-09 10:31:05,NW-MD-3E,4.934129e+08,81611289,2824.0,94246180.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3415,fd0b8f54-e982-42eb-9d66-74d142807d9f,passenger,da4ed9a6-2737-4b5c-977b-8c275139bdb4,dropoff,1dbb476a-e8ec-4e35-97af-d34b148e1d9d,completed,2022-04-29 16:22:31,NW-MD-31E,4.932325e+16,812979355116079,4788.0,
9326,fd0b8f54-e982-42eb-9d66-74d142807d9f,passenger,5dc43879-138f-4180-8fbb-6963a9d72a6d,dropoff,8c0a9882-37f9-4be5-b257-e111eb0b1966,completed,2022-03-26 13:10:13,NW-MD-31E,4.934257e+15,8140528153112360,1581.0,
8048,fd0b8f54-e982-42eb-9d66-74d142807d9f,passenger,c0c95617-964d-4d72-8f48-a1c3a19c036b,dropoff,20c64010-f0bc-49c0-ae09-a6ab450b2c25,completed,2022-04-04 05:58:52,NW-MD-31E,4.935398e+15,8135189388449290,1295.0,
12347,fd0b8f54-e982-42eb-9d66-74d142807d9f,passenger,2dc84931-774d-47ac-8b40-5f2acbf01e07,dropoff,c9c3d479-5028-4651-b1b5-ba683000da73,completed,2022-03-05 12:28:11,NW-MD-31E,4.929883e+15,820998249553163,6589.0,


In [10]:
# external_df preprocessing

# Diagnostic frames: every row whose "Id" occurs more than once, and every row
# whose "Id" is missing.
duplicated_external_df = external_df[external_df.duplicated(subset=["Id"], keep=False)]
nan_external_df = external_df[external_df["Id"].isna()]

# Keep only the external rides whose "Id" also appears as a "Ride Id" in the
# vehicle usage data. (A bare `merge_external_df` expression in the middle of
# the cell had no effect and was removed.)
merge_external_df = external_df[external_df["Id"].isin(vehicle_usage_df['Ride Id'])]
print(f"shape external_df: {external_df.shape}")
print(f"shape merge_external_df: {merge_external_df.shape}")



shape external_df: (18148, 32)
shape merge_external_df: (18104, 32)


In [42]:
# Left Join filtered vehicle df and external df
# Every row of merge_vehicle_df is kept; external columns are attached where
# "Ride Id" equals the external "Id".
vehicle_external_merge = merge_vehicle_df.merge(merge_external_df, how='left', left_on="Ride Id", right_on="Id")
print(f"Shape vehicle_external_merge before filtering: {vehicle_external_merge.shape}")

# Drop rows that have no "External Id" after the join.
vehicle_external_merge = vehicle_external_merge[vehicle_external_merge["External Id"].notna()]
print(f"Shape vehicle_external_merge after External ID isna filtering: {vehicle_external_merge.shape}")

# Keep only rows whose "External Id" is an id present in rides_df.
vehicle_external_merge = vehicle_external_merge[vehicle_external_merge["External Id"].isin(rides_df['id'])]
print(f"Shape vehicle_external_merge after isin rides_df filtering: {vehicle_external_merge.shape}")

# Reassign instead of using inplace=True: the frame is a filtered selection,
# and mutating it in place can raise pandas' SettingWithCopyWarning.
vehicle_external_merge = vehicle_external_merge.drop_duplicates(subset=["External Id"])
print(f"Shape vehicle_external_merge after duplicates filtering: {vehicle_external_merge.shape}")


Shape vehicle_external_merge before filtering: (14909, 44)
Shape vehicle_external_merge after External ID isna filtering: (14618, 44)
Shape vehicle_external_merge after isin rides_df filtering: (8031, 44)
Shape vehicle_external_merge after duplicates filtering: (8031, 44)


In [55]:
print(f"Shape orig_rides_df: {rides_df.shape}")

# True where the ride carries an id at all.
id_present = rides_df["id"].notna()

# Rows with a non-missing id that occurs again further down the frame
# (keep="last" flags every occurrence except the final one).
repeated_before_last = rides_df.duplicated(subset=["id"], keep="last")
duplicated_rides_df = rides_df[repeated_before_last & id_present]
print(f"Shape duplicated_rides_df: {duplicated_rides_df.shape}")

# Keep the first occurrence of every id, plus every row whose id is missing.
# NOTE(review): this uses the default keep="first" while the report above uses
# keep="last", so duplicated_rides_df does not list the rows dropped here --
# confirm this asymmetry is intended.
first_occurrence = ~rides_df.duplicated(subset=["id"])
rides_merge = rides_df[first_occurrence | ~id_present]
print(f"Shape rides_df dropped duplicates in id: {rides_merge.shape}")

Shape orig_rides_df: (18980, 51)
Shape duplicated_rides_df: (4, 51)
Shape rides_df dropped duplicates in id: (18976, 51)


In [57]:
# Left join of the de-duplicated rides with the filtered vehicle/external
# frame: every ride row is kept, and the vehicle/external columns are filled
# only where the ride "id" equals an "External Id".
rides_vehicle_merge_df = pd.merge(
    rides_merge,
    vehicle_external_merge,
    how="left",
    left_on="id",
    right_on="External Id",
)
print(f"Shape rides_vehicle_merge_df: {rides_vehicle_merge_df.shape}")
rides_vehicle_merge_df

Shape rides_vehicle_merge_df: (18976, 95)


Unnamed: 0.1,Unnamed: 0,id,user_id,distance,number_of_passenger,price_operations,price_offer,price_payed,free_ride,payment_type,...,Arrived to Dropoff (UTC),Dropoff Completed (UTC),Dropoff address,Dropoff coordinates,Actual Duration of Ride (min),Planned Distance (m),Number Of Passengers,Number Of Items,Created by,Created at (UTC)
0,0,5727475e-8224-4302-9228-c92b9d4a5220,f8ff0526-887a-4e48-ad96-977e12fd70c1,5483,1.0,4.65,4.65,0.00,0.0,STANDARD,...,2021-07-01 05:44:43,2021-07-01 05:44:44,Globus,"[49.339, 8.16]",11.0,5399,1.0,,Locomotion Service Account,2021-06-30 21:12:47
1,1,18fec0a6-b7ba-442b-8472-04bdb6ba1b86,51e1a1a8-995c-488c-84ce-3789e46f0417,3575,1.0,0.00,2.77,0.00,0.0,BAHN_CARD,...,NaT,NaT,,,,,,,,NaT
2,2,bb916271-0627-4196-8ec1-5324e0e1f71d,f07028da-ca7e-4713-9e45-743c71712e80,3040,1.0,3.45,1.55,1.55,0.0,VRN,...,2021-07-01 07:42:27,2021-07-01 07:42:29,Globus,"[49.339, 8.16]",6.0,3346,1.0,,Locomotion Service Account,2021-07-01 07:21:39
3,3,3cffa0f3-e278-4828-b0a1-f55cb35c1adb,44f61d06-8e79-42c6-9abd-0e85fcaf9d6d,7233,1.0,0.00,5.55,0.00,1.0,STANDARD,...,NaT,NaT,,,,,,,,NaT
4,4,,1a6d2ec4-7e85-4e5b-aed0-1c3693268986,3998,,,,,,STANDARD,...,NaT,NaT,,,,,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18971,18975,bf4e209a-8325-4e93-acd7-dec31f8864a6,61568918-262c-4360-91e0-1e71f4d2af4d,1901,1.0,3.10,1.45,1.45,0.0,VRN,...,NaT,NaT,,,,,,,,NaT
18972,18976,d529e378-3924-411b-8cf7-d09881d008fb,44f61d06-8e79-42c6-9abd-0e85fcaf9d6d,4152,1.0,4.31,2.16,2.16,0.0,VRN,...,NaT,NaT,,,,,,,,NaT
18973,18977,,6a2ade0c-d0a4-4173-a214-9ebe57758ae3,4227,,4.37,,,,BAHN_CARD,...,NaT,NaT,,,,,,,,NaT
18974,18978,,817aaaf0-d5e5-4838-b246-452fad2490ef,3799,,4.09,,,,VRN,...,NaT,NaT,,,,,,,,NaT


In [61]:
# Number of rides that received a "Vehicle Id" through the left join.
# Series.count() skips missing values, so it counts exactly the matched rows.
final_matches = rides_vehicle_merge_df["Vehicle Id"].count()
print(f"Matches between combined rides and autofleet_external id: {filt_rides['id'].count()}")
print(f"Matches between combined rides and autofleet_external id after vehicle usage match: {final_matches}")

Matches between combined rides and autofleet_external id: 9496
Matches between combined rides and autofleet_external id after vehicle usage match: 8031
