In [16]:
import pandas as pd
import plotly.express as px


data = pd.read_excel('../data/get_around_delay_analysis.xlsx')
for index, i in enumerate(data.columns):
    print(f"{data.dtypes.iloc[index]} --> {i} ")
print()
display(data.head())
display(data.shape)

int64 --> rental_id 
int64 --> car_id 
object --> checkin_type 
object --> state 
float64 --> delay_at_checkout_in_minutes 
float64 --> previous_ended_rental_id 
float64 --> time_delta_with_previous_rental_in_minutes 



Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,


(21310, 7)

In [17]:
# Pour comprendre les données, voir une seule voiture
data[data['car_id'] == 359049].sort_values('rental_id')

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
2,508131,359049,connect,ended,70.0,,
9754,537126,359049,connect,ended,-156.0,539408.0,720.0
13600,539408,359049,connect,ended,-6.0,,
8219,544433,359049,connect,ended,-110.0,,
6391,546894,359049,connect,ended,2.0,,
15866,547579,359049,connect,ended,32.0,,
17483,550645,359049,connect,ended,249.0,,
8992,553735,359049,connect,ended,33.0,550645.0,390.0
5697,561550,359049,connect,canceled,,,
14550,563861,359049,connect,canceled,,,


In [18]:
display(data.isna().sum())
display(data["state"].value_counts())


rental_id                                         0
car_id                                            0
checkin_type                                      0
state                                             0
delay_at_checkout_in_minutes                   4964
previous_ended_rental_id                      19469
time_delta_with_previous_rental_in_minutes    19469
dtype: int64

state
ended       18045
canceled     3265
Name: count, dtype: int64

In [19]:
display(data.describe().round(2))

Unnamed: 0,rental_id,car_id,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
count,21310.0,21310.0,16346.0,1841.0,1841.0
mean,549712.88,350030.6,59.7,550127.41,279.29
std,13863.45,58206.25,1002.56,13184.02,254.59
min,504806.0,159250.0,-22433.0,505628.0,0.0
25%,540613.25,317639.0,-36.0,540896.0,60.0
50%,550350.0,368717.0,9.0,550567.0,180.0
75%,560468.5,394928.0,67.0,560823.0,540.0
max,576401.0,417675.0,71084.0,575053.0,720.0


# Description des données : 

## Features 

- ###  delay_at_checkout_in_minutes

Temps réel du checkin de la reservation : 

- Si supérieur à zéro => Checkin en retard
- Si inférieur à zéro => Checkin en avance

- ###  time_delta_with_previous_rental_in_minutes 

Temps estimé entre les deux reservations, c'est à dire le delta entre l'horaire de fin programmé et l'horaire de début programmé de la reservation.

Si NaN, pas d'enchainement de reservation donc pas de problème de retard



In [20]:
print("Distribution des ended et canceled :")
print()
print(f"% de 'ended' : {round((data["state"].value_counts().iloc[0] / data.shape[0])*100,2)} %")
print(f"% de 'canceled' : {round((data["state"].value_counts().iloc[1] / data.shape[0])*100,2)} %")
plot = px.bar(data.value_counts(subset='state').reset_index(),x='state',y='count',title="Distribution ended / canceled",width=400)
plot.show()

Distribution des ended et canceled :

% de 'ended' : 84.68 %
% de 'canceled' : 15.32 %


In [21]:
print(f"% de 'mobile' : {round((data["checkin_type"].value_counts().iloc[0] / data.shape[0])*100,2)} %")
print(f"% de 'connect' : {round((data["checkin_type"].value_counts().iloc[1] / data.shape[0])*100,2)} %")
plot = px.bar(data.value_counts(subset='checkin_type').reset_index(),x='checkin_type',y='count',title="Distribution mobile / connect",width=400)
plot.show()

% de 'mobile' : 79.79 %
% de 'connect' : 20.21 %


In [22]:
# data_encoded = data.copy()
# data_encoded['state'] = data_encoded['state'].apply(lambda x : 1 if x=="ended" else 0)
# data_encoded['checkin_type'] = data_encoded['checkin_type'].apply(lambda x : 1 if x=="mobile" else 0)
# corr = data_encoded[['state','checkin_type','delay_at_checkout_in_minutes','time_delta_with_previous_rental_in_minutes']].corr('pearson')

# corr_df_fig = px.imshow(corr, text_auto=True,
#     color_continuous_scale="RdBu_r",
#     title="Matrice de corrélation (Pearson)",
#     zmin=-1, zmax=1,width=700)
# corr_df_fig.show()

In [23]:
print("Taux de canceled en fonction du device :")
mobile = data[data['checkin_type'] == 'mobile']
connect = data[data['checkin_type'] == 'connect']
print(f"Shape de mobile : {mobile.shape}")
print(f"Shape de connect : {connect.shape}")

print(f"Dans checkin mobile il y a : {((mobile[mobile['state'] == "canceled"]['state'].count() / mobile['rental_id'].count())*100):.2f} % de canceled")
print(f"Dans checkin connect (sans propriétaire présent ) il y a : {((connect[connect['state'] == "canceled"]['state'].count() / connect['rental_id'].count())*100):.2f} % de canceled")

print(f"Cela représente  : {mobile[mobile['state'] == "canceled"]['state'].count()/data.shape[0]*100:.2f} % de canceled par rapport au nombre total de reservations")
print(f"Cela représente : {connect[connect['state'] == "canceled"]['state'].count() / data.shape[0]*100:.2f}  % de canceled par rapport au nombre total de reservations")

Taux de canceled en fonction du device :
Shape de mobile : (17003, 7)
Shape de connect : (4307, 7)
Dans checkin mobile il y a : 14.51 % de canceled
Dans checkin connect (sans propriétaire présent ) il y a : 18.53 % de canceled
Cela représente  : 11.58 % de canceled par rapport au nombre total de reservations
Cela représente : 3.74  % de canceled par rapport au nombre total de reservations


In [24]:
print("On regarde les données quand le delai checkout et supérieur 0 :")
checkout_supp_zero = data[data['delay_at_checkout_in_minutes'].notna() >= 0]
print(f"Nombre de reservations en retard au checkout : {checkout_supp_zero.count()['rental_id']}")
cancel_supp_zero = checkout_supp_zero[checkout_supp_zero['state'] == 'canceled']
print(f"Nombre d'annulation quand en retard : {cancel_supp_zero.count()['rental_id']}") 


On regarde les données quand le delai checkout et supérieur 0 :
Nombre de reservations en retard au checkout : 21310
Nombre d'annulation quand en retard : 3265


In [25]:
print("Moyenne des retards")
checkout_supp_zero = data[(data["delay_at_checkout_in_minutes"] > 0) & (data["delay_at_checkout_in_minutes"].notna())]
print(f"Moyenne des retards (sans inf à zero) : {checkout_supp_zero['delay_at_checkout_in_minutes'].mean()} minutes")
print(f"Médiane des retard (sans inf à zero) : {checkout_supp_zero['delay_at_checkout_in_minutes'].median()} minutes")


Moyenne des retards
Moyenne des retards (sans inf à zero) : 201.79147171416417 minutes
Médiane des retard (sans inf à zero) : 53.0 minutes


In [26]:
print("Moyenne de delai checkin en fonction du mode de checkin")
print(data[data['delay_at_checkout_in_minutes'] > 0].groupby('checkin_type')['delay_at_checkout_in_minutes'].mean())
print("Médiane de delai checkin en fonction du mode de checkin")
print(data[data['delay_at_checkout_in_minutes'] > 0].groupby('checkin_type')['delay_at_checkout_in_minutes'].median())

Moyenne de delai checkin en fonction du mode de checkin
checkin_type
connect     80.109664
mobile     224.136816
Name: delay_at_checkout_in_minutes, dtype: float64
Médiane de delai checkin en fonction du mode de checkin
checkin_type
connect    41.0
mobile     56.0
Name: delay_at_checkout_in_minutes, dtype: float64


In [27]:
print(data.groupby('state')['checkin_type'].value_counts())
print(data[(data["delay_at_checkout_in_minutes"] > 0) | (data["delay_at_checkout_in_minutes"].notna())].shape)

state     checkin_type
canceled  mobile           2467
          connect           798
ended     mobile          14536
          connect          3509
Name: count, dtype: int64
(16346, 7)


In [38]:
data_ended = data[data['state'] == "ended"].copy()
print(f"Shape des ended : {data_ended.shape}")
# on enlève les NaN car sinon pas d'enchainement de reservations
multiple_reservations = data_ended[data_ended['time_delta_with_previous_rental_in_minutes'].notna()].copy()
print(f"Nouvelle shape : {multiple_reservations.shape}")
print(f"Moyenne du delai de checkin : {multiple_reservations['delay_at_checkout_in_minutes'].mean():.2f}")
print(f"Médiane du delai de checkin : {multiple_reservations['delay_at_checkout_in_minutes'].median():.2f}")
print(f"Moyenne des retards (reservations avec delay_at_checkout > 0) : {multiple_reservations[multiple_reservations['delay_at_checkout_in_minutes'] > 0]['delay_at_checkout_in_minutes'].mean():.2f}")
print(f"Moyenne des retards (reservations avec delay_at_checkout > 0) : {multiple_reservations[multiple_reservations['delay_at_checkout_in_minutes'] > 0]['delay_at_checkout_in_minutes'].median():.2f}")

multiple_reservations['se_chevauche'] = multiple_reservations['delay_at_checkout_in_minutes'] - multiple_reservations['time_delta_with_previous_rental_in_minutes']
multiple_reservations['en_retard'] = (multiple_reservations['se_chevauche'] > 0)
print("")
print("Description de la colonne 'se_chevauche' :")
print(f"{multiple_reservations[multiple_reservations['se_chevauche'] > 0]['se_chevauche'].describe()}")


Shape des ended : (18045, 7)
Nouvelle shape : (1612, 7)
Moyenne du delai de checkin : 27.53
Médiane du delai de checkin : 4.00
Moyenne des retards (reservations avec delay_at_checkout > 0) : 159.58
Moyenne des retards (reservations avec delay_at_checkout > 0) : 50.00

Description de la colonne 'se_chevauche' :
count     270.000000
mean      289.203704
std       815.822763
min         1.000000
25%        23.250000
50%        64.500000
75%       186.750000
max      9697.000000
Name: se_chevauche, dtype: float64


Quel est la part des reservation qui n'aurait pas été possible si Threshold = 15 minutes, 30 minutes, 1 heure, etc

In [None]:
thresolds = [15,20,25,30,45,60]

for thresold in thresolds : 
    impact_reservation = multiple_reservations[multiple_reservations['se_chevauche'].between(0,thresold)]
    print("")
    print("")
    print(f"Shape si thresold = {thresold} minutes : {impact_reservation.shape[0]}")
    print("")
    
    # print(impact_reservation.groupby('state')['checkin_type'].value_counts())
    print(pd.crosstab(impact_reservation['state'], impact_reservation['checkin_type']))
    print(f"% de reservation qui aurait été bloqué si thresold = {thresold} : {round(((impact_reservation.shape[0] / multiple_reservations.shape[0])*100),2)} %")



Shape si thresold = 15 minutes : 53

checkin_type  connect  mobile
state                        
ended              13      40


NameError: name 'data_impact' is not defined

Sachant que la moyenne des retard 

In [None]:
# How often are drivers late for the next check-in? How does it impact the next driver?

print(f" Nombre de conducteur en retard au chekin précedent : {data[data["time_delta_with_previous_rental_in_minutes"] > 0].count()['rental_id']}")
print(f" Nombre de conducteur en retard au chekin précedent et en retard au checkin : {data[(data["time_delta_with_previous_rental_in_minutes"] > 0) & (data['delay_at_checkout_in_minutes'].notna() > 0)].count()['rental_id']}")
print(f" Les reservations qui sont en retard au checkin  : {data[(data["time_delta_with_previous_rental_in_minutes"] > 0) & (data['delay_at_checkout_in_minutes'].notna() > 0)]['delay_at_checkout_in_minutes'].mean().round(2)}")

 Nombre de conducteur en retard au chekin précedent : 1562
 Nombre de conducteur en retard au chekin précedent et en retard au checkin : 1288
 Les reservations qui sont en retard au checkin  : 24.23
