# 1-Librerias

In [1]:
import pandas as pd
import numpy as np

# opcion de pandas para ver todas las columnas de un dataframe sin puntos suspensivos
pd.set_option('display.max_columns', None)


# 2-Carga de datos

In [2]:
df = pd.read_csv("../data/finanzas-hotel-bookings.csv")
df.head()  

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,,,0,Transient,0.0,0,0,Check-Out,1/07/15
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,,,0,Transient,0.0,0,0,Check-Out,1/07/15
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,,,0,Transient,75.0,0,0,Check-Out,2/07/15
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,304.0,,0,Transient,75.0,0,0,Check-Out,2/07/15
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,240.0,,0,Transient,98.0,0,1,Check-Out,3/07/15


# 3-Analisis previo

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 31 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

Ajustamos el tipo de datos a la toma de requerimientos

In [4]:
## company registra El DNI de la agencia de viajes que hizo la reserva. Aparece como float, pero es un string
df["company"].value_counts()

40.0     927
223.0    784
67.0     267
45.0     250
153.0    215
        ... 
104.0      1
531.0      1
160.0      1
413.0      1
386.0      1
Name: company, Length: 352, dtype: int64

In [5]:
def transforma_a_entero(col_):
    """ transforma a entero el valor de una columna

    Args:
        col_ (_type_): valor de una columna 

    Returns:
        _type_:devuelve un entero o por el contrario si no es posible devuelve un 0
    """
    try:
        return int(col_)
    except:
        return 0
        

In [6]:
# pasamos los valores de la columna company a string 
df["company"] = df["company"].apply(transforma_a_entero)
# pasamos columna a str 
df["company"] = df["company"].astype(str)


In [7]:
df["company"].value_counts()

0      112593
40        927
223       784
67        267
45        250
        ...  
32          1
11          1
487         1
101         1
376         1
Name: company, Length: 353, dtype: int64

In [8]:
# si el valor de la columna es 0, se reemplazar por "sin identificar"
df["company"] = df["company"].apply(lambda x: "sin identificar" if x == "0" else x)
df["company"].value_counts()

sin identificar    112593
40                    927
223                   784
67                    267
45                    250
                    ...  
32                      1
11                      1
487                     1
101                     1
376                     1
Name: company, Length: 353, dtype: int64

In [9]:
# procedemos de la misma forma con agent
## pasamos los valores de la columna agent a string 
df["agent"] = df["agent"].apply(transforma_a_entero)
## pasamos columna a str 
df["agent"] = df["agent"].astype(str)

## si el valor de la columna es 0, se reemplazar por "sin identificar"
df["agent"] = df["agent"].apply(lambda x: "sin identificar" if x == "0" else x)
df["agent"].value_counts()

9                  31961
sin identificar    16340
240                13922
1                   7191
14                  3640
                   ...  
280                    1
285                    1
289                    1
265                    1
497                    1
Name: agent, Length: 334, dtype: int64

In [10]:
# is_canceled" es una columna booleana por lo que vamos a pasarlo a una columna categórica con dos categorías: "Canceled" y "Not Canceled"
df["is_canceled"] = df["is_canceled"].apply(lambda x: "Canceled" if x == 1  else "Not Canceled")

# Analisis descriptivo 

In [11]:
df_descriptivo = df.describe().T # describe solo para las columnas numericas 
df_descriptivo

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lead_time,119390.0,104.011416,106.863097,0.0,18.0,69.0,160.0,737.0
arrival_date_year,119390.0,2016.156554,0.707476,2015.0,2016.0,2016.0,2017.0,2017.0
arrival_date_week_number,119390.0,27.165173,13.605138,1.0,16.0,28.0,38.0,53.0
arrival_date_day_of_month,119390.0,15.798241,8.780829,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,119390.0,0.927599,0.998613,0.0,0.0,1.0,2.0,19.0
stays_in_week_nights,119390.0,2.500302,1.908286,0.0,1.0,2.0,3.0,50.0
adults,119390.0,1.856403,0.579261,0.0,2.0,2.0,2.0,55.0
children,119386.0,0.10389,0.398561,0.0,0.0,0.0,0.0,10.0
babies,119390.0,0.007949,0.097436,0.0,0.0,0.0,0.0,10.0
is_repeated_guest,119390.0,0.031912,0.175767,0.0,0.0,0.0,0.0,1.0


In [12]:
df_descriptivo = df.describe().T
df_descriptivo

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lead_time,119390.0,104.011416,106.863097,0.0,18.0,69.0,160.0,737.0
arrival_date_year,119390.0,2016.156554,0.707476,2015.0,2016.0,2016.0,2017.0,2017.0
arrival_date_week_number,119390.0,27.165173,13.605138,1.0,16.0,28.0,38.0,53.0
arrival_date_day_of_month,119390.0,15.798241,8.780829,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,119390.0,0.927599,0.998613,0.0,0.0,1.0,2.0,19.0
stays_in_week_nights,119390.0,2.500302,1.908286,0.0,1.0,2.0,3.0,50.0
adults,119390.0,1.856403,0.579261,0.0,2.0,2.0,2.0,55.0
children,119386.0,0.10389,0.398561,0.0,0.0,0.0,0.0,10.0
babies,119390.0,0.007949,0.097436,0.0,0.0,0.0,0.0,10.0
is_repeated_guest,119390.0,0.031912,0.175767,0.0,0.0,0.0,0.0,1.0


## 3.1-Analisis de Nulos 

In [13]:
## calculamos el porcentaje de nulos
df_descriptivo["count"] = ((df_descriptivo["count"]/df.shape[0])*100).round(2)

## ronombramos la columna count por % nulos 
df_descriptivo.rename(columns={"count": "% nulos"}, inplace=True)
df_descriptivo

Unnamed: 0,% nulos,mean,std,min,25%,50%,75%,max
lead_time,100.0,104.011416,106.863097,0.0,18.0,69.0,160.0,737.0
arrival_date_year,100.0,2016.156554,0.707476,2015.0,2016.0,2016.0,2017.0,2017.0
arrival_date_week_number,100.0,27.165173,13.605138,1.0,16.0,28.0,38.0,53.0
arrival_date_day_of_month,100.0,15.798241,8.780829,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,100.0,0.927599,0.998613,0.0,0.0,1.0,2.0,19.0
stays_in_week_nights,100.0,2.500302,1.908286,0.0,1.0,2.0,3.0,50.0
adults,100.0,1.856403,0.579261,0.0,2.0,2.0,2.0,55.0
children,100.0,0.10389,0.398561,0.0,0.0,0.0,0.0,10.0
babies,100.0,0.007949,0.097436,0.0,0.0,0.0,0.0,10.0
is_repeated_guest,100.0,0.031912,0.175767,0.0,0.0,0.0,0.0,1.0


## 3.2-Analisis de Dupliacados

In [14]:
# numero de duplicados
df.duplicated().sum()

31999

In [15]:
# porcentaje de duplicados 
(df.duplicated().sum()/df.shape[0]).round(2)*100

27.0

In [16]:
#  registros duplicados 
df_duplicados = df[df.duplicated()]
df_duplicados

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
5,Resort Hotel,Not Canceled,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,98.00,0,1,Check-Out,3/07/15
22,Resort Hotel,Not Canceled,72,2015,July,27,1,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,A,A,1,250,sin identificar,0,Transient,84.67,0,1,Check-Out,7/07/15
43,Resort Hotel,Not Canceled,70,2015,July,27,2,2,3,2,0.0,0,HB,ROU,Direct,Direct,0,0,0,E,E,0,250,sin identificar,0,Transient,137.00,0,1,Check-Out,7/07/15
138,Resort Hotel,Canceled,5,2015,July,28,5,1,0,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,D,D,0,240,sin identificar,0,Transient,97.00,0,0,Canceled,1/07/15
200,Resort Hotel,Not Canceled,0,2015,July,28,7,0,1,1,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,109.80,0,3,Check-Out,8/07/15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119349,City Hotel,Not Canceled,186,2017,August,35,31,0,3,2,0.0,0,BB,DEU,Online TA,TA/TO,0,0,0,D,D,0,9,sin identificar,0,Transient,126.00,0,2,Check-Out,3/09/17
119352,City Hotel,Not Canceled,63,2017,August,35,31,0,3,3,0.0,0,BB,SWE,Online TA,TA/TO,0,0,0,D,D,0,9,sin identificar,0,Transient-Party,195.33,0,2,Check-Out,3/09/17
119353,City Hotel,Not Canceled,63,2017,August,35,31,0,3,3,0.0,0,BB,SWE,Online TA,TA/TO,0,0,0,D,D,0,9,sin identificar,0,Transient-Party,195.33,0,2,Check-Out,3/09/17
119354,City Hotel,Not Canceled,63,2017,August,35,31,0,3,3,0.0,0,BB,SWE,Online TA,TA/TO,0,0,0,D,D,0,9,sin identificar,0,Transient-Party,195.33,0,2,Check-Out,3/09/17


In [17]:
# eliminar duplicados 
df.drop_duplicates(inplace=True)
# reseteamos el index
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,Not Canceled,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,sin identificar,sin identificar,0,Transient,0.00,0,0,Check-Out,1/07/15
1,Resort Hotel,Not Canceled,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,sin identificar,sin identificar,0,Transient,0.00,0,0,Check-Out,1/07/15
2,Resort Hotel,Not Canceled,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,sin identificar,sin identificar,0,Transient,75.00,0,0,Check-Out,2/07/15
3,Resort Hotel,Not Canceled,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,304,sin identificar,0,Transient,75.00,0,0,Check-Out,2/07/15
4,Resort Hotel,Not Canceled,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,98.00,0,1,Check-Out,3/07/15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87386,City Hotel,Not Canceled,23,2017,August,35,30,2,5,2,0.0,0,BB,BEL,Offline TA/TO,TA/TO,0,0,0,A,A,0,394,sin identificar,0,Transient,96.14,0,0,Check-Out,6/09/17
87387,City Hotel,Not Canceled,102,2017,August,35,31,2,5,3,0.0,0,BB,FRA,Online TA,TA/TO,0,0,0,E,E,0,9,sin identificar,0,Transient,225.43,0,2,Check-Out,7/09/17
87388,City Hotel,Not Canceled,34,2017,August,35,31,2,5,2,0.0,0,BB,DEU,Online TA,TA/TO,0,0,0,D,D,0,9,sin identificar,0,Transient,157.71,0,4,Check-Out,7/09/17
87389,City Hotel,Not Canceled,109,2017,August,35,31,2,5,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,89,sin identificar,0,Transient,104.40,0,0,Check-Out,7/09/17


## 3.3-Analisis de DateTime 

In [18]:

# creamos la columna de datre arrival a partir de las tres columnas de arrival
df["arrival_date"] = df["arrival_date_year"].astype(str) + "-" + df["arrival_date_month"] + "-" + df["arrival_date_day_of_month"].astype(str)
df["arrival_date"] = pd.to_datetime(df["arrival_date"])
df.drop(columns=["arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"], inplace=True)

# cambiamos el formato de las fechas de arrival_date a datetime con formato dia mes año 
df["arrival_date"] = pd.to_datetime(df["arrival_date"], format="%d/%B/%Y")
df["reservation_status_date"] = pd.to_datetime(df["reservation_status_date"])
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date
0,Resort Hotel,Not Canceled,342,27,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,sin identificar,sin identificar,0,Transient,0.00,0,0,Check-Out,2015-01-07,2015-07-01
1,Resort Hotel,Not Canceled,737,27,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,sin identificar,sin identificar,0,Transient,0.00,0,0,Check-Out,2015-01-07,2015-07-01
2,Resort Hotel,Not Canceled,7,27,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,sin identificar,sin identificar,0,Transient,75.00,0,0,Check-Out,2015-02-07,2015-07-01
3,Resort Hotel,Not Canceled,13,27,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,304,sin identificar,0,Transient,75.00,0,0,Check-Out,2015-02-07,2015-07-01
4,Resort Hotel,Not Canceled,14,27,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,98.00,0,1,Check-Out,2015-03-07,2015-07-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87386,City Hotel,Not Canceled,23,35,2,5,2,0.0,0,BB,BEL,Offline TA/TO,TA/TO,0,0,0,A,A,0,394,sin identificar,0,Transient,96.14,0,0,Check-Out,2017-06-09,2017-08-30
87387,City Hotel,Not Canceled,102,35,2,5,3,0.0,0,BB,FRA,Online TA,TA/TO,0,0,0,E,E,0,9,sin identificar,0,Transient,225.43,0,2,Check-Out,2017-07-09,2017-08-31
87388,City Hotel,Not Canceled,34,35,2,5,2,0.0,0,BB,DEU,Online TA,TA/TO,0,0,0,D,D,0,9,sin identificar,0,Transient,157.71,0,4,Check-Out,2017-07-09,2017-08-31
87389,City Hotel,Not Canceled,109,35,2,5,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,89,sin identificar,0,Transient,104.40,0,0,Check-Out,2017-07-09,2017-08-31


In [19]:
# Se apriecian discrepancias sobre las fechas de "reservation_status_date","arrival_date"
df[["reservation_status_date","arrival_date"]].sample(40)

Unnamed: 0,reservation_status_date,arrival_date
83050,2017-06-18,2017-06-15
5158,2016-04-28,2016-07-13
13066,2015-09-13,2015-09-06
68249,2016-02-09,2016-09-01
66762,2016-09-08,2016-08-02
28283,2017-06-03,2017-03-02
87072,2017-08-28,2017-08-26
16057,2016-01-26,2016-01-26
13776,2015-10-19,2015-10-12
46735,2017-07-01,2017-02-12


In [20]:
# Creamos una columna con la diferencia de dias entre arrival_date y reservation_status_date 
df["diferencia_dias"] = (df["reservation_status_date"] - df["arrival_date"]).dt.days
df.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
3940,Resort Hotel,Canceled,92,17,1,2,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,68.0,0,0,Canceled,2016-04-21,2016-04-22,-1
78715,City Hotel,Not Canceled,80,13,2,2,2,0.0,0,BB,ESP,Offline TA/TO,TA/TO,0,0,0,A,A,0,52,sin identificar,0,Transient,100.1,0,0,Check-Out,2017-05-04,2017-04-01,33
70555,City Hotel,Not Canceled,0,42,1,0,2,0.0,0,BB,PRT,Complementary,TA/TO,0,0,0,E,E,0,1,sin identificar,0,Transient,0.0,0,0,Check-Out,2016-11-10,2016-10-10,31
61130,City Hotel,Not Canceled,39,17,2,5,2,0.0,0,BB,POL,Online TA,TA/TO,0,0,0,D,D,0,9,sin identificar,0,Transient,117.98,0,1,Check-Out,2016-04-28,2016-04-21,7
70962,City Hotel,Not Canceled,359,42,2,3,2,0.0,0,BB,DEU,Offline TA/TO,TA/TO,0,0,0,A,B,1,21,sin identificar,0,Transient-Party,78.0,0,3,Check-Out,2016-10-19,2016-10-14,5
27534,Resort Hotel,Not Canceled,5,6,0,2,2,0.0,0,HB,PRT,Direct,Direct,0,0,0,A,D,1,250,sin identificar,0,Transient,80.0,0,1,Check-Out,2017-12-02,2017-02-10,295
52505,City Hotel,Canceled,98,34,1,3,2,0.0,0,SC,GBR,Online TA,TA/TO,0,0,0,A,A,0,9,sin identificar,0,Transient,125.0,0,2,Canceled,2017-05-16,2017-08-21,-97
70722,City Hotel,Not Canceled,36,42,0,1,1,0.0,0,SC,PRT,Online TA,TA/TO,0,0,0,A,A,0,8,sin identificar,0,Transient-Party,102.6,0,2,Check-Out,2016-10-14,2016-10-13,1
48095,City Hotel,Canceled,38,15,1,1,2,0.0,0,BB,FRA,Online TA,TA/TO,0,0,0,A,A,0,9,sin identificar,0,Transient,140.0,0,0,Canceled,2017-08-03,2017-04-10,115
62536,City Hotel,Not Canceled,1,22,1,0,2,0.0,0,SC,GBR,Direct,Direct,0,0,0,A,A,1,sin identificar,sin identificar,0,Transient,149.0,0,0,Check-Out,2016-05-24,2016-05-23,1


In [21]:
# para futuros podria ser interesante analizar este conjunto de datosa en los que los dias de diferencia son negativos. Posible fraude ????
df_dataq = df[df["diferencia_dias"] < 0]
df_dataq


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
0,Resort Hotel,Not Canceled,342,27,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,sin identificar,sin identificar,0,Transient,0.00,0,0,Check-Out,2015-01-07,2015-07-01,-175
1,Resort Hotel,Not Canceled,737,27,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,sin identificar,sin identificar,0,Transient,0.00,0,0,Check-Out,2015-01-07,2015-07-01,-175
2,Resort Hotel,Not Canceled,7,27,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,sin identificar,sin identificar,0,Transient,75.00,0,0,Check-Out,2015-02-07,2015-07-01,-144
3,Resort Hotel,Not Canceled,13,27,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,304,sin identificar,0,Transient,75.00,0,0,Check-Out,2015-02-07,2015-07-01,-144
4,Resort Hotel,Not Canceled,14,27,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,98.00,0,1,Check-Out,2015-03-07,2015-07-01,-116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87386,City Hotel,Not Canceled,23,35,2,5,2,0.0,0,BB,BEL,Offline TA/TO,TA/TO,0,0,0,A,A,0,394,sin identificar,0,Transient,96.14,0,0,Check-Out,2017-06-09,2017-08-30,-82
87387,City Hotel,Not Canceled,102,35,2,5,3,0.0,0,BB,FRA,Online TA,TA/TO,0,0,0,E,E,0,9,sin identificar,0,Transient,225.43,0,2,Check-Out,2017-07-09,2017-08-31,-53
87388,City Hotel,Not Canceled,34,35,2,5,2,0.0,0,BB,DEU,Online TA,TA/TO,0,0,0,D,D,0,9,sin identificar,0,Transient,157.71,0,4,Check-Out,2017-07-09,2017-08-31,-53
87389,City Hotel,Not Canceled,109,35,2,5,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,89,sin identificar,0,Transient,104.40,0,0,Check-Out,2017-07-09,2017-08-31,-53


In [22]:
# eliminamos los registros con diferencia de dias negativa o dicho de otra forma sobre escribimos aquellos registros con diferencia de dias positiva
df_final = df[df["diferencia_dias"] >= 0]
df_final.head()


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
20,Resort Hotel,Not Canceled,72,27,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,A,A,1,250,sin identificar,0,Transient,84.67,0,1,Check-Out,2015-07-07,2015-07-01,6
21,Resort Hotel,Not Canceled,72,27,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,D,D,1,250,sin identificar,0,Transient,99.67,0,1,Check-Out,2015-07-07,2015-07-01,6
23,Resort Hotel,Not Canceled,78,27,2,5,2,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,0,0,D,D,0,5,sin identificar,0,Transient,63.6,1,0,Check-Out,2015-08-07,2015-07-01,37
24,Resort Hotel,Not Canceled,48,27,2,5,2,0.0,0,BB,IRL,Offline TA/TO,TA/TO,0,0,0,D,D,0,8,sin identificar,0,Contract,79.5,0,0,Check-Out,2015-08-07,2015-07-01,37
25,Resort Hotel,Canceled,60,27,2,5,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,E,E,0,240,sin identificar,0,Transient,107.0,0,2,Canceled,2015-11-05,2015-07-01,127


In [23]:
# parece raro que algunos registros de la columnas diferencia_dias sea positiva, indicaria que la fecha de cancelacion seria posterior al de llegada. por lo que agregaremos los registros a df_dataq para analizarlos mas adelante. 
# de la misma manera procederemos a agregar aquellos registros de df_dataq que tengan la fecha de cancelacion anterior a la de llegada.

df_cancelaciones_positivas = df_final[df_final["reservation_status"]== "Canceled"]
df_cancelaciones_positivas.head()


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
25,Resort Hotel,Canceled,60,27,2,5,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,E,E,0,240,sin identificar,0,Transient,107.0,0,2,Canceled,2015-11-05,2015-07-01,127
68,Resort Hotel,Canceled,101,27,0,2,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,73.8,0,1,Canceled,2015-12-06,2015-07-03,156
71,Resort Hotel,Canceled,51,27,0,2,3,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,242,sin identificar,0,Transient,110.3,0,0,Canceled,2015-09-06,2015-07-03,65
80,Resort Hotel,Canceled,81,27,3,7,2,0.0,0,HB,PRT,Direct,Direct,0,0,0,A,A,2,250,sin identificar,0,Transient,124.0,0,1,Canceled,2015-09-06,2015-07-03,65
102,Resort Hotel,Canceled,26,27,2,5,2,2.0,0,BB,PRT,Online TA,TA/TO,0,0,0,H,H,0,240,sin identificar,0,Transient,163.0,0,0,Canceled,2015-09-06,2015-07-04,64


In [24]:
df_cancelaciones_negativas = df_dataq[df_dataq["reservation_status"] == "Canceled"]
df_cancelaciones_negativas.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
7,Resort Hotel,Canceled,85,27,0,3,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,82.0,0,1,Canceled,2015-06-05,2015-07-01,-26
8,Resort Hotel,Canceled,75,27,0,3,2,0.0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,D,D,0,15,sin identificar,0,Transient,105.5,0,0,Canceled,2015-04-22,2015-07-01,-70
9,Resort Hotel,Canceled,23,27,0,4,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,E,E,0,240,sin identificar,0,Transient,123.0,0,0,Canceled,2015-06-23,2015-07-01,-8
30,Resort Hotel,Canceled,96,27,2,8,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,E,E,0,sin identificar,sin identificar,0,Transient,108.3,0,2,Canceled,2015-05-29,2015-07-01,-33
32,Resort Hotel,Canceled,45,27,1,3,3,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,D,D,0,241,sin identificar,0,Transient,108.8,0,1,Canceled,2015-05-19,2015-07-02,-44


In [25]:
# actualiozamos los registros de df_final sin los datos de df_cancelaciones_positivas
df_final = df_final[df_final["reservation_status"]!= "Canceled"]
df_final.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
20,Resort Hotel,Not Canceled,72,27,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,A,A,1,250,sin identificar,0,Transient,84.67,0,1,Check-Out,2015-07-07,2015-07-01,6
21,Resort Hotel,Not Canceled,72,27,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,D,D,1,250,sin identificar,0,Transient,99.67,0,1,Check-Out,2015-07-07,2015-07-01,6
23,Resort Hotel,Not Canceled,78,27,2,5,2,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,0,0,D,D,0,5,sin identificar,0,Transient,63.6,1,0,Check-Out,2015-08-07,2015-07-01,37
24,Resort Hotel,Not Canceled,48,27,2,5,2,0.0,0,BB,IRL,Offline TA/TO,TA/TO,0,0,0,D,D,0,8,sin identificar,0,Contract,79.5,0,0,Check-Out,2015-08-07,2015-07-01,37
26,Resort Hotel,Not Canceled,77,27,2,5,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,94.0,0,0,Check-Out,2015-08-07,2015-07-01,37


In [26]:
df_dataq = df_dataq[df_dataq["reservation_status"] != "Canceled"]
df_dataq.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
0,Resort Hotel,Not Canceled,342,27,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,sin identificar,sin identificar,0,Transient,0.0,0,0,Check-Out,2015-01-07,2015-07-01,-175
1,Resort Hotel,Not Canceled,737,27,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,sin identificar,sin identificar,0,Transient,0.0,0,0,Check-Out,2015-01-07,2015-07-01,-175
2,Resort Hotel,Not Canceled,7,27,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,sin identificar,sin identificar,0,Transient,75.0,0,0,Check-Out,2015-02-07,2015-07-01,-144
3,Resort Hotel,Not Canceled,13,27,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,304,sin identificar,0,Transient,75.0,0,0,Check-Out,2015-02-07,2015-07-01,-144
4,Resort Hotel,Not Canceled,14,27,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,98.0,0,1,Check-Out,2015-03-07,2015-07-01,-116


In [27]:
# concatenamos los datos de df_dataq y df_cancelaciones_positivas
df_dataq = pd.concat([df_dataq, df_cancelaciones_positivas])
df_final = pd.concat([df_final, df_cancelaciones_negativas])

In [28]:
df_final

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
20,Resort Hotel,Not Canceled,72,27,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,A,A,1,250,sin identificar,0,Transient,84.67,0,1,Check-Out,2015-07-07,2015-07-01,6
21,Resort Hotel,Not Canceled,72,27,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,D,D,1,250,sin identificar,0,Transient,99.67,0,1,Check-Out,2015-07-07,2015-07-01,6
23,Resort Hotel,Not Canceled,78,27,2,5,2,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,0,0,D,D,0,5,sin identificar,0,Transient,63.60,1,0,Check-Out,2015-08-07,2015-07-01,37
24,Resort Hotel,Not Canceled,48,27,2,5,2,0.0,0,BB,IRL,Offline TA/TO,TA/TO,0,0,0,D,D,0,8,sin identificar,0,Contract,79.50,0,0,Check-Out,2015-08-07,2015-07-01,37
26,Resort Hotel,Not Canceled,77,27,2,5,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,94.00,0,0,Check-Out,2015-08-07,2015-07-01,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70754,City Hotel,Canceled,1,4,0,2,1,0.0,0,BB,PRT,Corporate,Corporate,1,1,3,A,A,0,sin identificar,40,0,Transient,65.00,0,1,Canceled,2017-01-23,2017-01-24,-1
76842,City Hotel,Canceled,14,21,0,1,1,0.0,0,BB,PRT,Complementary,Direct,1,1,3,A,A,0,sin identificar,45,0,Transient,0.00,0,1,Canceled,2017-05-22,2017-05-24,-2
77439,City Hotel,Canceled,1,24,2,1,1,0.0,0,BB,ITA,Aviation,Corporate,1,1,2,D,D,0,sin identificar,153,0,Transient,110.00,0,0,Canceled,2017-06-16,2017-06-17,-1
77906,City Hotel,Canceled,13,22,1,2,1,0.0,0,BB,SWE,Direct,Direct,1,1,1,A,A,0,sin identificar,sin identificar,0,Transient,95.00,0,0,Canceled,2017-05-17,2017-05-29,-12


## 3.3.1-Analisis de DateTime para reservation_status

- Analisis del ultimo estado de la reserva No-Show - el cliente no se ha registrado y ha informado al hotel del motivo

In [29]:
toto = pd.DataFrame(df[df["reservation_status"] == "No-Show"]["diferencia_dias"])

# importamos seaborn
import seaborn as sns
# hacemos una histograma con plotly express 
import plotly.express as px
fig = px.histogram(toto, x="diferencia_dias", nbins=100)
fig.show()


Tampoco parece tener sentido mantener aquellos registros que reflejen una notificacion de aquellos clientes que no se han registrado pese a que lo notifiquen dias despues de su supesta llegada en el df_final. 

In [30]:
filtro1 = df["reservation_status"]=="No-Show"
filtro2 = df["diferencia_dias"] < 0

df[filtro1 & filtro2]

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
119,Resort Hotel,Canceled,36,27,0,1,1,0.0,0,BB,PRT,Direct,Direct,0,0,0,E,G,1,250,sin identificar,0,Transient,123.00,0,0,No-Show,2015-04-07,2015-07-04,-88
1469,Resort Hotel,Canceled,48,36,2,8,2,2.0,0,BB,PRT,Direct,Direct,0,0,0,G,I,0,250,sin identificar,0,Transient,153.00,0,0,No-Show,2015-03-09,2015-09-03,-178
1512,Resort Hotel,Canceled,94,36,2,5,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,G,G,0,sin identificar,sin identificar,0,Transient,153.00,0,0,No-Show,2015-05-09,2015-09-05,-119
1571,Resort Hotel,Canceled,144,37,3,7,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,75.96,0,0,No-Show,2015-07-09,2015-09-07,-60
2134,Resort Hotel,Canceled,77,41,0,3,2,2.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,0,250,sin identificar,0,Transient,66.00,0,2,No-Show,2015-08-10,2015-10-08,-59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,Canceled,2,50,2,5,1,0.0,0,BB,AGO,Direct,Direct,0,0,0,E,E,0,sin identificar,sin identificar,0,Transient,104.00,0,1,No-Show,2015-11-12,2015-12-11,-29
59793,City Hotel,Canceled,6,46,0,1,1,0.0,0,BB,PRT,Corporate,Corporate,1,0,5,D,D,0,sin identificar,219,0,Transient,94.00,0,1,No-Show,2016-09-11,2016-11-09,-59
61857,City Hotel,Canceled,26,45,0,1,1,0.0,0,BB,PRT,Corporate,Corporate,1,0,10,A,E,0,sin identificar,219,0,Transient,88.00,0,1,No-Show,2016-02-11,2016-11-02,-265
75076,City Hotel,Canceled,1,5,0,0,1,0.0,0,BB,AUT,Aviation,Corporate,1,0,1,A,A,0,sin identificar,153,0,Transient,0.00,0,1,No-Show,2017-01-02,2017-02-01,-30


## 4-Filtrado de datos 

Tras tener claro cuales son los datos del objeto de estudio procedemos a la creacion de filtros para quedarnos solo con aquellos datos de valor. 


In [31]:


# cancelaciones realizadas antes de la fecha de llegada, por lo que se procede a agregarlos a df_final
filtro1 = df["reservation_status"]=="Canceled"
filtro2 = df["diferencia_dias"] < 0

df_canceled = df[filtro1 & filtro2]
df_canceled

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
7,Resort Hotel,Canceled,85,27,0,3,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,82.0,0,1,Canceled,2015-06-05,2015-07-01,-26
8,Resort Hotel,Canceled,75,27,0,3,2,0.0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,D,D,0,15,sin identificar,0,Transient,105.5,0,0,Canceled,2015-04-22,2015-07-01,-70
9,Resort Hotel,Canceled,23,27,0,4,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,E,E,0,240,sin identificar,0,Transient,123.0,0,0,Canceled,2015-06-23,2015-07-01,-8
30,Resort Hotel,Canceled,96,27,2,8,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,E,E,0,sin identificar,sin identificar,0,Transient,108.3,0,2,Canceled,2015-05-29,2015-07-01,-33
32,Resort Hotel,Canceled,45,27,1,3,3,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,D,D,0,241,sin identificar,0,Transient,108.8,0,1,Canceled,2015-05-19,2015-07-02,-44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70754,City Hotel,Canceled,1,4,0,2,1,0.0,0,BB,PRT,Corporate,Corporate,1,1,3,A,A,0,sin identificar,40,0,Transient,65.0,0,1,Canceled,2017-01-23,2017-01-24,-1
76842,City Hotel,Canceled,14,21,0,1,1,0.0,0,BB,PRT,Complementary,Direct,1,1,3,A,A,0,sin identificar,45,0,Transient,0.0,0,1,Canceled,2017-05-22,2017-05-24,-2
77439,City Hotel,Canceled,1,24,2,1,1,0.0,0,BB,ITA,Aviation,Corporate,1,1,2,D,D,0,sin identificar,153,0,Transient,110.0,0,0,Canceled,2017-06-16,2017-06-17,-1
77906,City Hotel,Canceled,13,22,1,2,1,0.0,0,BB,SWE,Direct,Direct,1,1,1,A,A,0,sin identificar,sin identificar,0,Transient,95.0,0,0,Canceled,2017-05-17,2017-05-29,-12


In [32]:

# clientes que hicieron checout y tienen valores positivos en diferencia de dias , por lo que se procede a agregarlos a df_final
filtro1 = df["reservation_status"]=="Check-Out"
filtro2 = df["diferencia_dias"] >= 0

df_checkout = df[filtro1 & filtro2]
df_checkout

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
20,Resort Hotel,Not Canceled,72,27,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,A,A,1,250,sin identificar,0,Transient,84.67,0,1,Check-Out,2015-07-07,2015-07-01,6
21,Resort Hotel,Not Canceled,72,27,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,D,D,1,250,sin identificar,0,Transient,99.67,0,1,Check-Out,2015-07-07,2015-07-01,6
23,Resort Hotel,Not Canceled,78,27,2,5,2,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,0,0,D,D,0,5,sin identificar,0,Transient,63.60,1,0,Check-Out,2015-08-07,2015-07-01,37
24,Resort Hotel,Not Canceled,48,27,2,5,2,0.0,0,BB,IRL,Offline TA/TO,TA/TO,0,0,0,D,D,0,8,sin identificar,0,Contract,79.50,0,0,Check-Out,2015-08-07,2015-07-01,37
26,Resort Hotel,Not Canceled,77,27,2,5,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,94.00,0,0,Check-Out,2015-08-07,2015-07-01,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87242,City Hotel,Not Canceled,95,35,0,2,2,0.0,0,SC,PRT,Online TA,TA/TO,0,0,0,D,E,1,89,sin identificar,0,Transient,95.76,0,0,Check-Out,2017-08-31,2017-08-29,2
87243,City Hotel,Not Canceled,175,35,2,2,1,2.0,0,BB,IRL,Online TA,TA/TO,0,0,0,D,D,1,9,sin identificar,0,Transient,175.50,0,1,Check-Out,2017-08-31,2017-08-27,4
87244,City Hotel,Not Canceled,19,35,0,2,1,0.0,0,SC,DEU,Online TA,TA/TO,0,0,0,A,A,1,9,sin identificar,0,Transient,140.00,0,0,Check-Out,2017-08-31,2017-08-29,2
87245,City Hotel,Not Canceled,5,35,0,2,1,0.0,0,HB,PRT,Groups,TA/TO,0,0,0,A,A,4,19,sin identificar,0,Transient-Party,104.00,0,0,Check-Out,2017-08-31,2017-08-29,2


In [33]:
# clientes que notificaron que no se registrarian en el hotel antes de su llegada teniendo positivos en diferencia de dias , por lo que se procede a agregarlos a df_final
filtro1 = df["reservation_status"]=="No-Show"
filtro2 = df["diferencia_dias"] < 0

df_noshow = df[filtro1 & filtro2]
df_noshow

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
119,Resort Hotel,Canceled,36,27,0,1,1,0.0,0,BB,PRT,Direct,Direct,0,0,0,E,G,1,250,sin identificar,0,Transient,123.00,0,0,No-Show,2015-04-07,2015-07-04,-88
1469,Resort Hotel,Canceled,48,36,2,8,2,2.0,0,BB,PRT,Direct,Direct,0,0,0,G,I,0,250,sin identificar,0,Transient,153.00,0,0,No-Show,2015-03-09,2015-09-03,-178
1512,Resort Hotel,Canceled,94,36,2,5,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,G,G,0,sin identificar,sin identificar,0,Transient,153.00,0,0,No-Show,2015-05-09,2015-09-05,-119
1571,Resort Hotel,Canceled,144,37,3,7,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,75.96,0,0,No-Show,2015-07-09,2015-09-07,-60
2134,Resort Hotel,Canceled,77,41,0,3,2,2.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,0,250,sin identificar,0,Transient,66.00,0,2,No-Show,2015-08-10,2015-10-08,-59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,Canceled,2,50,2,5,1,0.0,0,BB,AGO,Direct,Direct,0,0,0,E,E,0,sin identificar,sin identificar,0,Transient,104.00,0,1,No-Show,2015-11-12,2015-12-11,-29
59793,City Hotel,Canceled,6,46,0,1,1,0.0,0,BB,PRT,Corporate,Corporate,1,0,5,D,D,0,sin identificar,219,0,Transient,94.00,0,1,No-Show,2016-09-11,2016-11-09,-59
61857,City Hotel,Canceled,26,45,0,1,1,0.0,0,BB,PRT,Corporate,Corporate,1,0,10,A,E,0,sin identificar,219,0,Transient,88.00,0,1,No-Show,2016-02-11,2016-11-02,-265
75076,City Hotel,Canceled,1,5,0,0,1,0.0,0,BB,AUT,Aviation,Corporate,1,0,1,A,A,0,sin identificar,153,0,Transient,0.00,0,1,No-Show,2017-01-02,2017-02-01,-30


In [34]:
# concatenacion de dataframes 
df_final = pd.concat([df_checkout, df_canceled, df_noshow])
df_final

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,diferencia_dias
20,Resort Hotel,Not Canceled,72,27,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,A,A,1,250,sin identificar,0,Transient,84.67,0,1,Check-Out,2015-07-07,2015-07-01,6
21,Resort Hotel,Not Canceled,72,27,2,4,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,D,D,1,250,sin identificar,0,Transient,99.67,0,1,Check-Out,2015-07-07,2015-07-01,6
23,Resort Hotel,Not Canceled,78,27,2,5,2,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,0,0,D,D,0,5,sin identificar,0,Transient,63.60,1,0,Check-Out,2015-08-07,2015-07-01,37
24,Resort Hotel,Not Canceled,48,27,2,5,2,0.0,0,BB,IRL,Offline TA/TO,TA/TO,0,0,0,D,D,0,8,sin identificar,0,Contract,79.50,0,0,Check-Out,2015-08-07,2015-07-01,37
26,Resort Hotel,Not Canceled,77,27,2,5,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,240,sin identificar,0,Transient,94.00,0,0,Check-Out,2015-08-07,2015-07-01,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,Canceled,2,50,2,5,1,0.0,0,BB,AGO,Direct,Direct,0,0,0,E,E,0,sin identificar,sin identificar,0,Transient,104.00,0,1,No-Show,2015-11-12,2015-12-11,-29
59793,City Hotel,Canceled,6,46,0,1,1,0.0,0,BB,PRT,Corporate,Corporate,1,0,5,D,D,0,sin identificar,219,0,Transient,94.00,0,1,No-Show,2016-09-11,2016-11-09,-59
61857,City Hotel,Canceled,26,45,0,1,1,0.0,0,BB,PRT,Corporate,Corporate,1,0,10,A,E,0,sin identificar,219,0,Transient,88.00,0,1,No-Show,2016-02-11,2016-11-02,-265
75076,City Hotel,Canceled,1,5,0,0,1,0.0,0,BB,AUT,Aviation,Corporate,1,0,1,A,A,0,sin identificar,153,0,Transient,0.00,0,1,No-Show,2017-01-02,2017-02-01,-30


In [35]:
# reordenamos las columnas 
df_final = df_final[['hotel', 
                         'previous_cancellations',
                         'previous_bookings_not_canceled', 
                         'booking_changes',               
                         'is_canceled', 
                         'reservation_status', 
                         'reservation_status_date', 
                         'arrival_date',
                         'arrival_date_week_number',
                         'diferencia_dias',

                         'stays_in_weekend_nights',
                         'stays_in_week_nights',

                         'country', 
                         'is_repeated_guest', 
                         'adults', 
                         'children',
                         'babies',
                         'customer_type', 
                         'days_in_waiting_list', 
                         'lead_time',    
                         
                         'market_segment', 
                         'distribution_channel',
                         'agent', 
                         'company',

                         'reserved_room_type',
                         'assigned_room_type', 
                         'meal', 
                         'required_car_parking_spaces', 
                         'total_of_special_requests',

                         'adr']]

df_final


Unnamed: 0,hotel,previous_cancellations,previous_bookings_not_canceled,booking_changes,is_canceled,reservation_status,reservation_status_date,arrival_date,arrival_date_week_number,diferencia_dias,stays_in_weekend_nights,stays_in_week_nights,country,is_repeated_guest,adults,children,babies,customer_type,days_in_waiting_list,lead_time,market_segment,distribution_channel,agent,company,reserved_room_type,assigned_room_type,meal,required_car_parking_spaces,total_of_special_requests,adr
20,Resort Hotel,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,2,4,PRT,0,2,0.0,0,Transient,0,72,Direct,Direct,250,sin identificar,A,A,BB,0,1,84.67
21,Resort Hotel,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,2,4,PRT,0,2,0.0,0,Transient,0,72,Direct,Direct,250,sin identificar,D,D,BB,0,1,99.67
23,Resort Hotel,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,2,5,PRT,0,2,0.0,0,Transient,0,78,Offline TA/TO,TA/TO,5,sin identificar,D,D,BB,1,0,63.60
24,Resort Hotel,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,2,5,IRL,0,2,0.0,0,Contract,0,48,Offline TA/TO,TA/TO,8,sin identificar,D,D,BB,0,0,79.50
26,Resort Hotel,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,2,5,PRT,0,2,0.0,0,Transient,0,77,Online TA,TA/TO,240,sin identificar,A,A,BB,0,0,94.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,0,0,0,Canceled,No-Show,2015-11-12,2015-12-11,50,-29,2,5,AGO,0,1,0.0,0,Transient,0,2,Direct,Direct,sin identificar,sin identificar,E,E,BB,0,1,104.00
59793,City Hotel,0,5,0,Canceled,No-Show,2016-09-11,2016-11-09,46,-59,0,1,PRT,1,1,0.0,0,Transient,0,6,Corporate,Corporate,sin identificar,219,D,D,BB,0,1,94.00
61857,City Hotel,0,10,0,Canceled,No-Show,2016-02-11,2016-11-02,45,-265,0,1,PRT,1,1,0.0,0,Transient,0,26,Corporate,Corporate,sin identificar,219,A,E,BB,0,1,88.00
75076,City Hotel,0,1,0,Canceled,No-Show,2017-01-02,2017-02-01,5,-30,0,0,AUT,1,1,0.0,0,Transient,0,1,Aviation,Corporate,sin identificar,153,A,A,BB,0,1,0.00


## 5-Creacion de columna noches de estancia; overnight_stays. 

In [36]:
# creamos la columna overnight stays  sumando  los valores stays_in_weekend_nights y stays_in_week_nights
df_final["overnight_stays"] = df_final["stays_in_weekend_nights"] + df_final["stays_in_week_nights"]
df_final



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,hotel,previous_cancellations,previous_bookings_not_canceled,booking_changes,is_canceled,reservation_status,reservation_status_date,arrival_date,arrival_date_week_number,diferencia_dias,stays_in_weekend_nights,stays_in_week_nights,country,is_repeated_guest,adults,children,babies,customer_type,days_in_waiting_list,lead_time,market_segment,distribution_channel,agent,company,reserved_room_type,assigned_room_type,meal,required_car_parking_spaces,total_of_special_requests,adr,overnight_stays
20,Resort Hotel,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,2,4,PRT,0,2,0.0,0,Transient,0,72,Direct,Direct,250,sin identificar,A,A,BB,0,1,84.67,6
21,Resort Hotel,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,2,4,PRT,0,2,0.0,0,Transient,0,72,Direct,Direct,250,sin identificar,D,D,BB,0,1,99.67,6
23,Resort Hotel,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,2,5,PRT,0,2,0.0,0,Transient,0,78,Offline TA/TO,TA/TO,5,sin identificar,D,D,BB,1,0,63.60,7
24,Resort Hotel,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,2,5,IRL,0,2,0.0,0,Contract,0,48,Offline TA/TO,TA/TO,8,sin identificar,D,D,BB,0,0,79.50,7
26,Resort Hotel,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,2,5,PRT,0,2,0.0,0,Transient,0,77,Online TA,TA/TO,240,sin identificar,A,A,BB,0,0,94.00,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,0,0,0,Canceled,No-Show,2015-11-12,2015-12-11,50,-29,2,5,AGO,0,1,0.0,0,Transient,0,2,Direct,Direct,sin identificar,sin identificar,E,E,BB,0,1,104.00,7
59793,City Hotel,0,5,0,Canceled,No-Show,2016-09-11,2016-11-09,46,-59,0,1,PRT,1,1,0.0,0,Transient,0,6,Corporate,Corporate,sin identificar,219,D,D,BB,0,1,94.00,1
61857,City Hotel,0,10,0,Canceled,No-Show,2016-02-11,2016-11-02,45,-265,0,1,PRT,1,1,0.0,0,Transient,0,26,Corporate,Corporate,sin identificar,219,A,E,BB,0,1,88.00,1
75076,City Hotel,0,1,0,Canceled,No-Show,2017-01-02,2017-02-01,5,-30,0,0,AUT,1,1,0.0,0,Transient,0,1,Aviation,Corporate,sin identificar,153,A,A,BB,0,1,0.00,0


# Creacion de columnas number_of_guests 

In [37]:
# se aprecian valoresnp.nan 
df_final["children"].unique()

array([ 0.,  1.,  2.,  3., nan])

In [38]:
# aplicamos transforma_a_entero a la columna df_final["children"] 
df_final["children"] = df_final["children"].apply(transforma_a_entero)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [39]:
# comprobamos
df_final["children"].unique()

array([0, 1, 2, 3])

In [40]:
# creamnos la columna number_of_guests sumando los valores de adults, children y babies
df_final["number_of_guests"] = df_final["adults"] + df_final["children"] + df_final["babies"]
df_final



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,hotel,previous_cancellations,previous_bookings_not_canceled,booking_changes,is_canceled,reservation_status,reservation_status_date,arrival_date,arrival_date_week_number,diferencia_dias,stays_in_weekend_nights,stays_in_week_nights,country,is_repeated_guest,adults,children,babies,customer_type,days_in_waiting_list,lead_time,market_segment,distribution_channel,agent,company,reserved_room_type,assigned_room_type,meal,required_car_parking_spaces,total_of_special_requests,adr,overnight_stays,number_of_guests
20,Resort Hotel,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,2,4,PRT,0,2,0,0,Transient,0,72,Direct,Direct,250,sin identificar,A,A,BB,0,1,84.67,6,2
21,Resort Hotel,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,2,4,PRT,0,2,0,0,Transient,0,72,Direct,Direct,250,sin identificar,D,D,BB,0,1,99.67,6,2
23,Resort Hotel,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,2,5,PRT,0,2,0,0,Transient,0,78,Offline TA/TO,TA/TO,5,sin identificar,D,D,BB,1,0,63.60,7,2
24,Resort Hotel,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,2,5,IRL,0,2,0,0,Contract,0,48,Offline TA/TO,TA/TO,8,sin identificar,D,D,BB,0,0,79.50,7,2
26,Resort Hotel,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,2,5,PRT,0,2,0,0,Transient,0,77,Online TA,TA/TO,240,sin identificar,A,A,BB,0,0,94.00,7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,0,0,0,Canceled,No-Show,2015-11-12,2015-12-11,50,-29,2,5,AGO,0,1,0,0,Transient,0,2,Direct,Direct,sin identificar,sin identificar,E,E,BB,0,1,104.00,7,1
59793,City Hotel,0,5,0,Canceled,No-Show,2016-09-11,2016-11-09,46,-59,0,1,PRT,1,1,0,0,Transient,0,6,Corporate,Corporate,sin identificar,219,D,D,BB,0,1,94.00,1,1
61857,City Hotel,0,10,0,Canceled,No-Show,2016-02-11,2016-11-02,45,-265,0,1,PRT,1,1,0,0,Transient,0,26,Corporate,Corporate,sin identificar,219,A,E,BB,0,1,88.00,1,1
75076,City Hotel,0,1,0,Canceled,No-Show,2017-01-02,2017-02-01,5,-30,0,0,AUT,1,1,0,0,Transient,0,1,Aviation,Corporate,sin identificar,153,A,A,BB,0,1,0.00,0,1


In [41]:
# reordenamos las columnas 
df_final = df_final[['hotel', 
                        'market_segment', 
                        'distribution_channel',
                        'agent', 
                        'company',
                        'previous_cancellations',
                        'previous_bookings_not_canceled', 
                        'booking_changes',               
                        'is_canceled', 
                        'reservation_status', 
                        'reservation_status_date', 
                        'arrival_date',
                        'arrival_date_week_number',
                        'diferencia_dias',

                        'overnight_stays',
                        'stays_in_weekend_nights',
                        'stays_in_week_nights',
                        
                        'country', 
                        'is_repeated_guest', 
                        'adults', 
                        'children',
                        'babies',
                        'number_of_guests',
                        'customer_type', 
                        'days_in_waiting_list', 
                        'lead_time',    
                        
                        'reserved_room_type',
                        'assigned_room_type', 
                        'meal', 
                        'required_car_parking_spaces', 
                        'total_of_special_requests',
 
                        'adr']]

df_final


Unnamed: 0,hotel,market_segment,distribution_channel,agent,company,previous_cancellations,previous_bookings_not_canceled,booking_changes,is_canceled,reservation_status,reservation_status_date,arrival_date,arrival_date_week_number,diferencia_dias,overnight_stays,stays_in_weekend_nights,stays_in_week_nights,country,is_repeated_guest,adults,children,babies,number_of_guests,customer_type,days_in_waiting_list,lead_time,reserved_room_type,assigned_room_type,meal,required_car_parking_spaces,total_of_special_requests,adr
20,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,PRT,0,2,0,0,2,Transient,0,72,A,A,BB,0,1,84.67
21,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,PRT,0,2,0,0,2,Transient,0,72,D,D,BB,0,1,99.67
23,Resort Hotel,Offline TA/TO,TA/TO,5,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,PRT,0,2,0,0,2,Transient,0,78,D,D,BB,1,0,63.60
24,Resort Hotel,Offline TA/TO,TA/TO,8,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,IRL,0,2,0,0,2,Contract,0,48,D,D,BB,0,0,79.50
26,Resort Hotel,Online TA,TA/TO,240,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,PRT,0,2,0,0,2,Transient,0,77,A,A,BB,0,0,94.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,Direct,Direct,sin identificar,sin identificar,0,0,0,Canceled,No-Show,2015-11-12,2015-12-11,50,-29,7,2,5,AGO,0,1,0,0,1,Transient,0,2,E,E,BB,0,1,104.00
59793,City Hotel,Corporate,Corporate,sin identificar,219,0,5,0,Canceled,No-Show,2016-09-11,2016-11-09,46,-59,1,0,1,PRT,1,1,0,0,1,Transient,0,6,D,D,BB,0,1,94.00
61857,City Hotel,Corporate,Corporate,sin identificar,219,0,10,0,Canceled,No-Show,2016-02-11,2016-11-02,45,-265,1,0,1,PRT,1,1,0,0,1,Transient,0,26,A,E,BB,0,1,88.00
75076,City Hotel,Aviation,Corporate,sin identificar,153,0,1,0,Canceled,No-Show,2017-01-02,2017-02-01,5,-30,0,0,0,AUT,1,1,0,0,1,Transient,0,1,A,A,BB,0,1,0.00


## 6-Creacion de nuevas columnas con paises 

In [42]:
import pycountry

def convert_iso_alpha3_to_country_name(iso_alpha3):
    """ la fucnion convierte el codigo iso alpha 3 a nombre de pais 

    Args:
        iso_alpha3 (str): codigo iso alpha 3 perteneciente a un pais

    Returns:
        str: nombre del pais
    """
    try:
        country = pycountry.countries.get(alpha_3=iso_alpha3)
        return country.name
    except:
        return np.nan

In [43]:
# aplicamos la funcion convert_iso_alpha3_to_country_name a la columna country
df_final["country"] = df_final["country"].apply(lambda x: convert_iso_alpha3_to_country_name(x))
df_final

Unnamed: 0,hotel,market_segment,distribution_channel,agent,company,previous_cancellations,previous_bookings_not_canceled,booking_changes,is_canceled,reservation_status,reservation_status_date,arrival_date,arrival_date_week_number,diferencia_dias,overnight_stays,stays_in_weekend_nights,stays_in_week_nights,country,is_repeated_guest,adults,children,babies,number_of_guests,customer_type,days_in_waiting_list,lead_time,reserved_room_type,assigned_room_type,meal,required_car_parking_spaces,total_of_special_requests,adr
20,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,Portugal,0,2,0,0,2,Transient,0,72,A,A,BB,0,1,84.67
21,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,Portugal,0,2,0,0,2,Transient,0,72,D,D,BB,0,1,99.67
23,Resort Hotel,Offline TA/TO,TA/TO,5,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Portugal,0,2,0,0,2,Transient,0,78,D,D,BB,1,0,63.60
24,Resort Hotel,Offline TA/TO,TA/TO,8,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Ireland,0,2,0,0,2,Contract,0,48,D,D,BB,0,0,79.50
26,Resort Hotel,Online TA,TA/TO,240,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Portugal,0,2,0,0,2,Transient,0,77,A,A,BB,0,0,94.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,Direct,Direct,sin identificar,sin identificar,0,0,0,Canceled,No-Show,2015-11-12,2015-12-11,50,-29,7,2,5,Angola,0,1,0,0,1,Transient,0,2,E,E,BB,0,1,104.00
59793,City Hotel,Corporate,Corporate,sin identificar,219,0,5,0,Canceled,No-Show,2016-09-11,2016-11-09,46,-59,1,0,1,Portugal,1,1,0,0,1,Transient,0,6,D,D,BB,0,1,94.00
61857,City Hotel,Corporate,Corporate,sin identificar,219,0,10,0,Canceled,No-Show,2016-02-11,2016-11-02,45,-265,1,0,1,Portugal,1,1,0,0,1,Transient,0,26,A,E,BB,0,1,88.00
75076,City Hotel,Aviation,Corporate,sin identificar,153,0,1,0,Canceled,No-Show,2017-01-02,2017-02-01,5,-30,0,0,0,Austria,1,1,0,0,1,Transient,0,1,A,A,BB,0,1,0.00


# Creacion de columnas room_modification

In [44]:
# creamos una columna que indique si se modifico la habitacion tras la reserva   
df_final["room_modification"] = np.where(df_final["reserved_room_type"] == df_final["assigned_room_type"], "Si", "No")
df_final


Unnamed: 0,hotel,market_segment,distribution_channel,agent,company,previous_cancellations,previous_bookings_not_canceled,booking_changes,is_canceled,reservation_status,reservation_status_date,arrival_date,arrival_date_week_number,diferencia_dias,overnight_stays,stays_in_weekend_nights,stays_in_week_nights,country,is_repeated_guest,adults,children,babies,number_of_guests,customer_type,days_in_waiting_list,lead_time,reserved_room_type,assigned_room_type,meal,required_car_parking_spaces,total_of_special_requests,adr,room_modification
20,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,Portugal,0,2,0,0,2,Transient,0,72,A,A,BB,0,1,84.67,Si
21,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,Portugal,0,2,0,0,2,Transient,0,72,D,D,BB,0,1,99.67,Si
23,Resort Hotel,Offline TA/TO,TA/TO,5,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Portugal,0,2,0,0,2,Transient,0,78,D,D,BB,1,0,63.60,Si
24,Resort Hotel,Offline TA/TO,TA/TO,8,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Ireland,0,2,0,0,2,Contract,0,48,D,D,BB,0,0,79.50,Si
26,Resort Hotel,Online TA,TA/TO,240,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Portugal,0,2,0,0,2,Transient,0,77,A,A,BB,0,0,94.00,Si
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,Direct,Direct,sin identificar,sin identificar,0,0,0,Canceled,No-Show,2015-11-12,2015-12-11,50,-29,7,2,5,Angola,0,1,0,0,1,Transient,0,2,E,E,BB,0,1,104.00,Si
59793,City Hotel,Corporate,Corporate,sin identificar,219,0,5,0,Canceled,No-Show,2016-09-11,2016-11-09,46,-59,1,0,1,Portugal,1,1,0,0,1,Transient,0,6,D,D,BB,0,1,94.00,Si
61857,City Hotel,Corporate,Corporate,sin identificar,219,0,10,0,Canceled,No-Show,2016-02-11,2016-11-02,45,-265,1,0,1,Portugal,1,1,0,0,1,Transient,0,26,A,E,BB,0,1,88.00,No
75076,City Hotel,Aviation,Corporate,sin identificar,153,0,1,0,Canceled,No-Show,2017-01-02,2017-02-01,5,-30,0,0,0,Austria,1,1,0,0,1,Transient,0,1,A,A,BB,0,1,0.00,Si


In [46]:
# reordenamos las columnas 
df_final = df_final[['hotel', 
                    'market_segment', 
                    'distribution_channel',
                    'agent', 
                    'company',
                    'previous_cancellations',
                    'previous_bookings_not_canceled', 
                    'booking_changes',               
                    'is_canceled', 
                    'reservation_status', 
                    'reservation_status_date', 
                    'arrival_date',
                    'arrival_date_week_number',
                    'diferencia_dias',

                    'overnight_stays',
                    'stays_in_weekend_nights',
                    'stays_in_week_nights',
                    
                    'country', 

                    'is_repeated_guest', 
                    'adults', 
                    'children',
                    'babies',
                    'number_of_guests',
                    'customer_type', 
                    'days_in_waiting_list', 
                    'lead_time',    
                    
                    'reserved_room_type',
                    'assigned_room_type', 
                    'room_modification',
                    'meal', 
                    'required_car_parking_spaces', 
                    'total_of_special_requests',
                    
                    'adr']]

df_final

Unnamed: 0,hotel,market_segment,distribution_channel,agent,company,previous_cancellations,previous_bookings_not_canceled,booking_changes,is_canceled,reservation_status,reservation_status_date,arrival_date,arrival_date_week_number,diferencia_dias,overnight_stays,stays_in_weekend_nights,stays_in_week_nights,country,is_repeated_guest,adults,children,babies,number_of_guests,customer_type,days_in_waiting_list,lead_time,reserved_room_type,assigned_room_type,room_modification,meal,required_car_parking_spaces,total_of_special_requests,adr
20,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,Portugal,0,2,0,0,2,Transient,0,72,A,A,Si,BB,0,1,84.67
21,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,Portugal,0,2,0,0,2,Transient,0,72,D,D,Si,BB,0,1,99.67
23,Resort Hotel,Offline TA/TO,TA/TO,5,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Portugal,0,2,0,0,2,Transient,0,78,D,D,Si,BB,1,0,63.60
24,Resort Hotel,Offline TA/TO,TA/TO,8,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Ireland,0,2,0,0,2,Contract,0,48,D,D,Si,BB,0,0,79.50
26,Resort Hotel,Online TA,TA/TO,240,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Portugal,0,2,0,0,2,Transient,0,77,A,A,Si,BB,0,0,94.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,Direct,Direct,sin identificar,sin identificar,0,0,0,Canceled,No-Show,2015-11-12,2015-12-11,50,-29,7,2,5,Angola,0,1,0,0,1,Transient,0,2,E,E,Si,BB,0,1,104.00
59793,City Hotel,Corporate,Corporate,sin identificar,219,0,5,0,Canceled,No-Show,2016-09-11,2016-11-09,46,-59,1,0,1,Portugal,1,1,0,0,1,Transient,0,6,D,D,Si,BB,0,1,94.00
61857,City Hotel,Corporate,Corporate,sin identificar,219,0,10,0,Canceled,No-Show,2016-02-11,2016-11-02,45,-265,1,0,1,Portugal,1,1,0,0,1,Transient,0,26,A,E,No,BB,0,1,88.00
75076,City Hotel,Aviation,Corporate,sin identificar,153,0,1,0,Canceled,No-Show,2017-01-02,2017-02-01,5,-30,0,0,0,Austria,1,1,0,0,1,Transient,0,1,A,A,Si,BB,0,1,0.00


## 7-Creacion de columna con Geolocalizacion de los paises 


In [47]:
lista_paises = list(df_final["country"].unique())

In [48]:
# importamos geopy 
from geopy.geocoders import Nominatim
import tqdm as tqdm 

In [49]:
# creamos una instancia de Nominatim
geolocator = Nominatim(user_agent="my_app")

# creamos una funcion para obtener la latitud y longitud de un pais
def get_lat_lon(country):
    """ la funcion recibe un pais y devuelve la latitud y longitud


    Args:
        country (str): nombre del pais a buscar
    Returns:
        tuple: devuelve una tuple con la latitud y longitud
    """
    try:
        location = geolocator.geocode(country)
        return location.latitude, location.longitude
    except:
        return np.nan, np.nan
    

In [50]:
# creamos un diccionario con los paises y sus coordenadas 
diccionario_coordenadas = {}
for pais in lista_paises:
    lat, lon = get_lat_lon(pais)
    diccionario_coordenadas[pais] = [lat, lon]
    print(pais, lat, lon)
    

Portugal 39.6621648 -8.1353519
Ireland 52.865196 -7.9794599
nan 46.3144754 11.0480288
United Kingdom 54.7023545 -3.2765753
Romania 45.9852129 24.6859225
Spain 39.3260685 -4.8379791
Norway 61.1529386 8.7876653
Oman 21.0000287 57.0036901
Germany 51.1638175 10.4478313
France 46.603354 1.8883335
Switzerland 46.7985624 8.2319736
United States 39.7837304 -100.445882
Greece 38.9953683 21.9877132
Italy 42.6384261 12.674297
Netherlands 52.2434979 5.6343227
Denmark 55.670249 10.3333283
Argentina -34.9964963 -64.9672817
Russian Federation 64.6863136 97.7453061
Sweden 59.6749712 14.5208584
Poland 52.215933 19.134422
Australia -24.7761086 134.755
Estonia 58.7523778 25.3319078
Czechia 49.7439047 15.3381061
Brazil -10.3333333 -53.2
Belgium 50.6402809 4.6667145
Finland 63.2467777 25.9209164
Mozambique -19.302233 34.9144977
Botswana -23.1681782 24.5928742
Luxembourg 49.8158683 6.1296751
Slovenia 46.1199444 14.8153333
Albania 41.000028 19.9999619
India 22.3511148 78.6677428
China 35.000074 104.999927
Me

In [51]:
df_final["latitud"] = df_final["country"].apply(lambda x: diccionario_coordenadas[x][0])
df_final["longitud"] = df_final["country"].apply(lambda x: diccionario_coordenadas[x][1])




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [52]:
df_final

Unnamed: 0,hotel,market_segment,distribution_channel,agent,company,previous_cancellations,previous_bookings_not_canceled,booking_changes,is_canceled,reservation_status,reservation_status_date,arrival_date,arrival_date_week_number,diferencia_dias,overnight_stays,stays_in_weekend_nights,stays_in_week_nights,country,is_repeated_guest,adults,children,babies,number_of_guests,customer_type,days_in_waiting_list,lead_time,reserved_room_type,assigned_room_type,room_modification,meal,required_car_parking_spaces,total_of_special_requests,adr,latitud,longitud
20,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,Portugal,0,2,0,0,2,Transient,0,72,A,A,Si,BB,0,1,84.67,39.662165,-8.135352
21,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,Portugal,0,2,0,0,2,Transient,0,72,D,D,Si,BB,0,1,99.67,39.662165,-8.135352
23,Resort Hotel,Offline TA/TO,TA/TO,5,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Portugal,0,2,0,0,2,Transient,0,78,D,D,Si,BB,1,0,63.60,39.662165,-8.135352
24,Resort Hotel,Offline TA/TO,TA/TO,8,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Ireland,0,2,0,0,2,Contract,0,48,D,D,Si,BB,0,0,79.50,52.865196,-7.979460
26,Resort Hotel,Online TA,TA/TO,240,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Portugal,0,2,0,0,2,Transient,0,77,A,A,Si,BB,0,0,94.00,39.662165,-8.135352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,Direct,Direct,sin identificar,sin identificar,0,0,0,Canceled,No-Show,2015-11-12,2015-12-11,50,-29,7,2,5,Angola,0,1,0,0,1,Transient,0,2,E,E,Si,BB,0,1,104.00,-11.877577,17.569124
59793,City Hotel,Corporate,Corporate,sin identificar,219,0,5,0,Canceled,No-Show,2016-09-11,2016-11-09,46,-59,1,0,1,Portugal,1,1,0,0,1,Transient,0,6,D,D,Si,BB,0,1,94.00,39.662165,-8.135352
61857,City Hotel,Corporate,Corporate,sin identificar,219,0,10,0,Canceled,No-Show,2016-02-11,2016-11-02,45,-265,1,0,1,Portugal,1,1,0,0,1,Transient,0,26,A,E,No,BB,0,1,88.00,39.662165,-8.135352
75076,City Hotel,Aviation,Corporate,sin identificar,153,0,1,0,Canceled,No-Show,2017-01-02,2017-02-01,5,-30,0,0,0,Austria,1,1,0,0,1,Transient,0,1,A,A,Si,BB,0,1,0.00,47.593970,14.124560


In [53]:
# reordenamos las columnas 
df_final = df_final[['hotel', 
                    'market_segment', 
                    'distribution_channel',
                    'agent', 
                    'company',
                    'previous_cancellations',
                    'previous_bookings_not_canceled', 
                    'booking_changes',               
                    'is_canceled', 
                    'reservation_status', 
                    'reservation_status_date', 
                    'arrival_date',
                    'arrival_date_week_number',
                    'diferencia_dias',

                    'overnight_stays',
                    'stays_in_weekend_nights',
                    'stays_in_week_nights',
                    
                    'country', 
                    'latitud',
                    'longitud',
                    'is_repeated_guest', 
                    'adults', 
                    'children',
                    'babies',
                    'number_of_guests',
                    'customer_type', 
                    'days_in_waiting_list', 
                    'lead_time',    
                    
                    'reserved_room_type',
                    'assigned_room_type', 
                    'meal', 
                    'required_car_parking_spaces', 
                    'total_of_special_requests',
                    
                    'adr']]

df_final

Unnamed: 0,hotel,market_segment,distribution_channel,agent,company,previous_cancellations,previous_bookings_not_canceled,booking_changes,is_canceled,reservation_status,reservation_status_date,arrival_date,arrival_date_week_number,diferencia_dias,overnight_stays,stays_in_weekend_nights,stays_in_week_nights,country,latitud,longitud,is_repeated_guest,adults,children,babies,number_of_guests,customer_type,days_in_waiting_list,lead_time,reserved_room_type,assigned_room_type,meal,required_car_parking_spaces,total_of_special_requests,adr
20,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,Portugal,39.662165,-8.135352,0,2,0,0,2,Transient,0,72,A,A,BB,0,1,84.67
21,Resort Hotel,Direct,Direct,250,sin identificar,0,0,1,Not Canceled,Check-Out,2015-07-07,2015-07-01,27,6,6,2,4,Portugal,39.662165,-8.135352,0,2,0,0,2,Transient,0,72,D,D,BB,0,1,99.67
23,Resort Hotel,Offline TA/TO,TA/TO,5,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Portugal,39.662165,-8.135352,0,2,0,0,2,Transient,0,78,D,D,BB,1,0,63.60
24,Resort Hotel,Offline TA/TO,TA/TO,8,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Ireland,52.865196,-7.979460,0,2,0,0,2,Contract,0,48,D,D,BB,0,0,79.50
26,Resort Hotel,Online TA,TA/TO,240,sin identificar,0,0,0,Not Canceled,Check-Out,2015-08-07,2015-07-01,27,37,7,2,5,Portugal,39.662165,-8.135352,0,2,0,0,2,Transient,0,77,A,A,BB,0,0,94.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56255,City Hotel,Direct,Direct,sin identificar,sin identificar,0,0,0,Canceled,No-Show,2015-11-12,2015-12-11,50,-29,7,2,5,Angola,-11.877577,17.569124,0,1,0,0,1,Transient,0,2,E,E,BB,0,1,104.00
59793,City Hotel,Corporate,Corporate,sin identificar,219,0,5,0,Canceled,No-Show,2016-09-11,2016-11-09,46,-59,1,0,1,Portugal,39.662165,-8.135352,1,1,0,0,1,Transient,0,6,D,D,BB,0,1,94.00
61857,City Hotel,Corporate,Corporate,sin identificar,219,0,10,0,Canceled,No-Show,2016-02-11,2016-11-02,45,-265,1,0,1,Portugal,39.662165,-8.135352,1,1,0,0,1,Transient,0,26,A,E,BB,0,1,88.00
75076,City Hotel,Aviation,Corporate,sin identificar,153,0,1,0,Canceled,No-Show,2017-01-02,2017-02-01,5,-30,0,0,0,Austria,47.593970,14.124560,1,1,0,0,1,Transient,0,1,A,A,BB,0,1,0.00


In [54]:
# guardamos el dataframe en un csv
df_final.to_csv("../data/finanzas-hotel-bookings-limpio.csv", index=False)