In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Librerías para trabajar con fechas
# -----------------------------------------------------------------------
from datetime import datetime

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames
# pd.set_option('display.max_rows', None) # para poder visualizar todas las filas de los DataFrames

In [2]:
df = pd.read_csv('../data/finanzas-hotel-bookings.csv', index_col=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
def exploracion_df(dataframe):
    """
    Realiza un análisis exploratorio básico de un DataFrame, mostrando información sobre duplicados,
    valores nulos, tipos de datos, valores únicos para columnas categóricas y estadísticas descriptivas
    para columnas categóricas y numéricas, agrupadas por la columna de control.

    Parámetros:
    - dataframe (DataFrame): El DataFrame que se va a explorar.
    
    Returns: 
    No devuelve nada directamente, pero imprime en la consola la información exploratoria.
    """
    
    #Echamos un vistazo
    print("Primeras filas:")
    display(dataframe.head(5))
    print("\n......................\n")

    print("Últimas filas:")
    display(dataframe.tail(5))
    print("\n.....................\n")

    print("Filas aleatorias:")
    display(dataframe.sample(5))
    print("\n.....................\n")

    print(f"El dataframe tiene {dataframe.shape[0]} filas y {dataframe.shape[1]} columnas")
    print("\n.....................\n")    

    print("Tipos de datos y nulos:")
    display(dataframe.info())
    print("\n.....................\n")
    
    print("Características columnas númericas:")
    display(dataframe.describe().T)
    print("\n.....................\n")

    print("Características columnas de texto")
    try:
        display(dataframe.describe(include='O').T)  
    except:
        print("No hay columnas categóricas")
    print("\n.....................\n")

    print("Duplicados:")
    display(dataframe.duplicated().sum())
    print("\n.....................\n")

    # generamos un DataFrame para los valores nulos
    print("Los nulos que tenemos en el conjunto de datos son:")
    df_nulos = pd.DataFrame(dataframe.isnull().sum() / dataframe.shape[0] * 100, columns = ["%_nulos"])
    display(df_nulos[df_nulos["%_nulos"] > 0])

    print("\n.....................\n")
    print("Los valores que tenemos para las columnas categóricas son: ")
    try:
        df_categoricas = dataframe.select_dtypes(include = "O")

        for col in df_categoricas.columns:
            print(f"La columna {col.upper()} tiene las siguientes valore únicos:")
            display(pd.DataFrame(dataframe[col].value_counts()/dataframe[col].shape[0])*100) 
    except:
        print("No hay columnas categóricas")
    print("\n.....................\n")  

In [None]:
# exploracion_df(df)

In [None]:
def get_info(df: pd.DataFrame):
    info = df.dtypes.to_frame('dtypes')
    info['non_null'] = df.count()
    info['null'] = 182877 -df.count()
    info['unique_values'] = df.apply(lambda srs: len(srs.unique()))
    return info

df_info=get_info(df)
df_info.to_csv("info.csv")

In [3]:
df.drop_duplicates(inplace = True)

In [4]:
df.dropna(how='all', inplace=True)

In [None]:
def str_to_nan(columna, cadena):
    """
    Convierte un string a un nulo de numpy

    Args:
        columna (Series): columna de un dataframe donde cambiar un string por np.nan

    Returns:
        columna (Series): devuelve la columna del dataframe con los strings especificados cambiados por nulos 
    """  
    return columna.replace(cadena, np.nan)

In [5]:
df['reservation_status_date'][df['reservation_status_date'] > '2020-01-01']

313       2030-02-30
7646      2030-11-31
9139      2029-04-31
9493      2030-06-31
18350     2030-09-31
21580     2025-02-30
22854     2029-11-31
24063     2023-06-31
24395     2029-06-31
34886     2029-02-30
35541     2022-09-31
41367     2024-09-31
44568     2028-04-31
48657     2024-06-31
48916     2025-11-31
50821     2026-09-31
51467     2027-09-31
53791     2023-11-31
55521     2028-09-31
62556     2028-02-30
63204     2026-06-31
65957     2024-04-31
72373     2023-04-31
74397     2025-04-31
84676     2027-04-31
85059     2022-06-31
100411    2024-02-30
101167    2025-09-31
109572    2027-02-30
110258    2030-04-31
121013    2023-09-31
122688    2027-06-31
124058    2026-11-31
126379    2028-11-31
131036    2028-06-31
138971    2026-04-31
147148    2022-11-31
152300    2025-06-31
154884    2029-09-31
159155    2023-02-30
177526    2026-02-30
179002    2022-04-31
180857    2022-02-30
181291    2027-11-31
182004    2024-11-31
Name: reservation_status_date, dtype: object

In [6]:
def a_fecha(cadena):
    try:
        return pd.to_datetime(cadena, format='%Y-%m-%d %H:%M:%S')
    except:
        return np.nan    

In [7]:
df['reservation_status_date'] = df['reservation_status_date'].apply(a_fecha)

In [8]:
df['reservation_status_date'] = df['reservation_status_date'].dt.year
df['reservation_status_date'].apply

<bound method Series.apply of 0         2015.0
1         2015.0
2         2015.0
3         2015.0
4         2015.0
           ...  
182872       NaN
182873       NaN
182874       NaN
182875       NaN
182876       NaN
Name: reservation_status_date, Length: 119836, dtype: float64>

In [9]:
df['arrival_date_year'] = df['arrival_date_year'].fillna(df['reservation_status_date'])
df['arrival_date_year'][df['arrival_date_year'].isna()]

20       NaN
46       NaN
67       NaN
105      NaN
108      NaN
          ..
182872   NaN
182873   NaN
182874   NaN
182875   NaN
182876   NaN
Name: arrival_date_year, Length: 7760, dtype: float64

In [None]:
display(df['arrival_date_year'].isna().sum()/df.shape[0]*100)

In [None]:
df['country'].unique()

In [10]:
codigo_paises = {
    'PRT': 'Portugal',
    np. nan: np.nan,
    'GBR': 'United Kingdom',
    'USA': 'United States',
    'ESP': 'Spain',
    'IRL': 'Ireland',
    'FRA': 'France',
    'ROU': 'Romania',
    'NOR': 'Norway',
    'OMN': 'Oman',
    'ARG': 'Argentina',
    'DEU': 'Germany',
    'CHE': 'Switzerland',
    'GRC': 'Greece',
    'NLD': 'Netherlands',
    'DNK': 'Denmark',
    'RUS': 'Russia',
    'POL': 'Poland',
    'AUS': 'Australia',
    'EST': 'Estonia',
    'CZE': 'Czech Republic',
    'BRA': 'Brazil',
    'BEL': 'Belgium',
    'CN': 'China',
    'SWE': 'Sweden',
    'FIN': 'Finland',
    'MOZ': 'Mozambique',
    'SVN': 'Slovenia',
    'MAR': 'Morocco',
    'ITA': 'Italy',
    'UKR': 'Ukraine',
    'SMR': 'San Marino',
    'LVA': 'Latvia',
    'PRI': 'Puerto Rico',
    'CHL': 'Chile',
    'CHN': 'China',
    'LTU': 'Lithuania',
    'LUX': 'Luxembourg',
    'AUT': 'Austria',
    'TUR': 'Turkey',
    'MEX': 'Mexico',
    'ZAF': 'South Africa',
    'AGO': 'Angola',
    'ISR': 'Israel',
    'IND': 'India',
    'CYM': 'Cayman Islands',
    'ZMB': 'Zambia',
    'CPV': 'Cape Verde',
    'ZWE': 'Zimbabwe',
    'DZA': 'Algeria',
    'KOR': 'South Korea',
    'HUN': 'Hungary',
    'ARE': 'United Arab Emirates',
    'TUN': 'Tunisia',
    'JAM': 'Jamaica',
    'ALB': 'Albania',
    'HRV': 'Croatia',
    'HKG': 'Hong Kong',
    'AND': 'Andorra',
    'GIB': 'Gibraltar',
    'URY': 'Uruguay',
    'BLR': 'Belarus',
    'JEY': 'Jersey',
    'CYP': 'Cyprus',
    'MDV': 'Maldives',
    'FJI': 'Fiji',
    'KAZ': 'Kazakhstan',
    'PAK': 'Pakistan',
    'IDN': 'Indonesia',
    'LBN': 'Lebanon',
    'PHL': 'Philippines',
    'COL': 'Colombia',
    'SEN': 'Senegal',
    'GEO': 'Georgia',
    'AZE': 'Azerbaijan',
    'BHR': 'Bahrain',
    'NZL': 'New Zealand',
    'THA': 'Thailand',
    'DOM': 'Dominican Republic',
    'MYS': 'Malaysia',
    'VEN': 'Venezuela',
    'ARM': 'Armenia',
    'LKA': 'Sri Lanka',
    'CUB': 'Cuba',
    'CMR': 'Cameroon',
    'IRN': 'Iran',
    'BIH': 'Bosnia and Herzegovina',
    'NGA': 'Nigeria',
    'COM': 'Comoros',
    'BGR': 'Bulgaria',
    'CIV': 'Ivory Coast',
    'SRB': 'Serbia',
    'JOR': 'Jordan',
    'SYR': 'Syria',
    'BDI': 'Burundi',
    'SGP': 'Singapore',
    'KWT': 'Kuwait',
    'PLW': 'Palau',
    'QAT': 'Qatar',
    'SVK': 'Slovakia',
    'SUR': 'Suriname',
    'MLT': 'Malta',
    'MWI': 'Malawi',
    'MDG': 'Madagascar',
    'ISL': 'Iceland',
    'JPN': 'Japan',
    'CAF': 'Central African Republic',
    'TGO': 'Togo',
    'TWN': 'Taiwan',
    'DJI': 'Djibouti',
    'VNM': 'Vietnam',
    'PER': 'Peru',
    'EGY': 'Egypt',
    'SAU': 'Saudi Arabia',
    'KNA': 'Saint Kitts and Nevis',
    'ETH': 'Ethiopia',
    'ECU': 'Ecuador',
    'IRQ': 'Iraq',
    'KHM': 'Cambodia',
    'MCO': 'Monaco',
    'BGD': 'Bangladesh',
    'TJK': 'Tajikistan',
    'NIC': 'Nicaragua',
    'GGY': 'Guernsey',
    'BEN': 'Benin',
    'VGB': 'British Virgin Islands',
    'CRI': 'Costa Rica',
    'TZA': 'Tanzania',
    'GAB': 'Gabon',
    'MKD': 'North Macedonia',
    'TMP': 'East Timor',
    'GLP': 'Guadeloupe',
    'LIE': 'Liechtenstein',
    'GNB': 'Guinea-Bissau',
    'MAC': 'Macau',
    'IMN': 'Isle of Man',
    'UMI': 'U.S. Minor Outlying Islands',
    'MYT': 'Mayotte',
    'GHA': 'Ghana',
    'FRO': 'Faroe Islands',
    'MMR': 'Myanmar',
    'PAN': 'Panama',
    'MUS': 'Mauritius',
    'LBY': 'Libya',
    'NAM': 'Namibia',
    'BOL': 'Bolivia',
    'PRY': 'Paraguay',
    'BRB': 'Barbados',
    'ABW': 'Aruba',
    'AIA': 'Anguilla',
    'DMA': 'Dominica',
    'UGA': 'Uganda',
    'MNE': 'Montenegro',
    'GTM': 'Guatemala',
    'ASM': 'American Samoa',
    'KEN': 'Kenya',
    'NCL': 'New Caledonia',
    'STP': 'Sao Tome and Principe',
    'KIR': 'Kiribati',
    'SDN': 'Sudan',
    'ATF': 'French Southern Territories',
    'SLE': 'Sierra Leone',
    'SLV': 'El Salvador',
    'LAO': 'Laos'
}


In [11]:
df["country"]=df["country"].map(codigo_paises)

In [12]:
mapa_repeted = {1 :'Recurrent', 0 : 'First time', np.nan : np.nan}

In [13]:
df["is_repeated_guest"]=df["is_repeated_guest"].map(mapa_repeted)
df['is_repeated_guest'].unique()

array(['First time', nan, 'Recurrent'], dtype=object)

In [14]:
mapa_cancel = {True :'Cancelado', False : 'No Cancelado', np.nan: np.nan}

In [15]:
df["is_canceled"]=df["is_canceled"].map(mapa_cancel)
df['is_canceled'].unique()

array(['No Cancelado', 'Cancelado', nan], dtype=object)

In [16]:
df['arrival_date_month'] = df['arrival_date_month'].str.replace('1', 'January').str.replace('2', 'February').str.replace('3', 'March')
df['arrival_date_month'].unique()

array(['July', 'August', 'September', 'October', 'November', 'December',
       'January', 'February', 'March', 'April', 'May', 'June', nan],
      dtype=object)

In [None]:
df['previous_bookings_not_canceled'].value_counts()/df.shape[0]*100

In [17]:
df_rooms = df[['reserved_room_type', 'assigned_room_type']][df['reserved_room_type'].notna()&df['assigned_room_type'].notna()]
#df_channel = df_channels.groupby(['market_segment', 'distribution_channel']).size().reset_index(name='Freq').sort_values(by='distribution_channel')
df_rooms

Unnamed: 0,reserved_room_type,assigned_room_type
0,C,C
2,A,C
3,A,A
4,A,A
5,A,A
...,...,...
119382,G,G
119384,A,A
119385,A,A
119386,E,E


In [18]:
df['change_room'] = np.where(df['reserved_room_type']== df['assigned_room_type'], 'equal', 'change')
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,0,change_room
0,Resort Hotel,No Cancelado,342.000000,2015.0,July,27.0,1.0,0.000000,0.000000,2.0,,0.0,BB,Portugal,,Direct,First time,,0.0,C,C,3.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015.0,,equal
1,Resort Hotel,No Cancelado,737.000000,2015.0,July,27.0,1.0,0.000000,0.000000,2.0,,0.0,BB,,,Direct,First time,0.0,0.0,,C,4.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015.0,,change
2,Resort Hotel,No Cancelado,7.000000,2015.0,July,27.0,1.0,0.000000,1.000000,1.0,0.0,0.0,BB,United Kingdom,,Direct,First time,0.0,0.0,A,C,0.0,,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015.0,,change
3,Resort Hotel,No Cancelado,13.000000,2015.0,July,27.0,1.0,0.000000,1.000000,1.0,,0.0,BB,United Kingdom,Corporate,Corporate,First time,0.0,0.0,A,A,0.0,304.0,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015.0,,equal
4,Resort Hotel,No Cancelado,14.000000,2015.0,July,,1.0,0.000000,2.000000,2.0,,0.0,BB,,Online TA,TA/TO,First time,0.0,0.0,A,A,0.0,240.0,,0.0,Transient,98.0,0.0,1.0,Check-Out,2015.0,,equal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182872,,,224.337762,,,,,19.005545,19.819823,,,,,,,,,,,,,,,,,,,,,,,,change
182873,,,390.141963,,,,,14.751794,19.989726,,,,,,,,,,,,,,,,,,,,,,,,change
182874,,,230.689826,,,,,11.409496,20.461372,,,,,,,,,,,,,,,,,,,,,,,,change
182875,,,304.888534,,,,,16.744472,15.400773,,,,,,,,,,,,,,,,,,,,,,,,change


In [19]:
df['change_room'].value_counts()/df.shape[0]*100

equal     56.867719
change    43.132281
Name: change_room, dtype: float64

In [None]:
df['days_in_waiting_list'].value_counts()/df.shape[0]*100

In [20]:
df['adr'][df['adr']<0] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['adr'][df['adr']<0] = np.nan


In [21]:
df['adr'][df['adr']<0]

Series([], Name: adr, dtype: float64)

In [None]:
df['adr'].value_counts()/df.shape[0]*100

In [22]:
df_canceled = df[['is_canceled', 'reservation_status']][df['is_canceled'].notna()&df['reservation_status'].notna()]
df_canceled[(df['reservation_status'] == 'Canceled') & (df['is_canceled'] != 'Cancelado')]

  df_canceled[(df['reservation_status'] == 'Canceled') & (df['is_canceled'] != 'Cancelado')]


Unnamed: 0,is_canceled,reservation_status


In [None]:
df['booking_changes'].value_counts()/df.shape[0]*100

In [None]:
df['adults'].value_counts()/df.shape[0]*100

In [None]:
df['babies'].value_counts()/df.shape[0]*100

In [None]:
df['lead_time'].value_counts()/df.shape[0]*100

In [None]:
df['stays_in_weekend_nights'].value_counts()/df.shape[0]*100

In [None]:
df['stays_in_week_nights'].value_counts()/df.shape[0]*100

In [None]:
df_nights= df[['stays_in_week_nights', 'stays_in_weekend_nights']][df['stays_in_week_nights'].notna()&df['stays_in_weekend_nights'].notna()]
df_nights

In [23]:
col_desconocido = ['company', 'market_segment', 'country', 'reserved_room_type', 'customer_type', 'agent', 'distribution_channel', 'hotel', 'is_canceled', 'arrival_date_month', 'meal', 'assigned_room_type', 'reservation_status']

for col in col_desconocido:
    df[col]=df[col].fillna('Unknown')

df[col_desconocido].isna().sum()

company                 0
market_segment          0
country                 0
reserved_room_type      0
customer_type           0
agent                   0
distribution_channel    0
hotel                   0
is_canceled             0
arrival_date_month      0
meal                    0
assigned_room_type      0
reservation_status      0
dtype: int64

In [34]:
col_cero= ['children', 'previous_cancellations', 'previous_bookings_not_canceled', 'days_in_waiting_list', 'required_car_parking_spaces', 'total_of_special_requests', 'booking_changes', 'babies']

for columna in col_cero:
    moda = df[columna].mode()
    print(columna, moda)
    df[columna]=df[columna].fillna(0)

df[col_cero].isna().sum()

children 0    0.0
dtype: float64
previous_cancellations 0    0.0
dtype: float64
previous_bookings_not_canceled 0    0.0
dtype: float64
days_in_waiting_list 0    0.0
dtype: float64
required_car_parking_spaces 0    0.0
dtype: float64
total_of_special_requests 0    0.0
dtype: float64
booking_changes 0    0.0
dtype: float64
babies 0    0.0
dtype: float64


In [37]:
col_moda = ['is_repeated_guest', 'adults']

for col_m in col_moda:
    moda = df[col_m].mode()[0]
    print(col_m, moda)
    df[col_m]=df[col_m].fillna(moda)

df[col_moda].isna().sum()

is_repeated_guest First time
adults 2.0


is_repeated_guest    0
adults               0
dtype: int64

In [42]:
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,change_room
0,Resort Hotel,No Cancelado,342.000000,2015.0,July,27.0,1.0,0.000000,0.000000,2.0,0.0,0.0,BB,Portugal,Unknown,Direct,First time,0.0,0.0,C,C,3.0,Unknown,Unknown,0.0,Transient,0.0,0.0,0.0,Check-Out,equal
1,Resort Hotel,No Cancelado,737.000000,2015.0,July,27.0,1.0,0.000000,0.000000,2.0,0.0,0.0,BB,Unknown,Unknown,Direct,First time,0.0,0.0,Unknown,C,4.0,Unknown,Unknown,0.0,Transient,0.0,0.0,0.0,Check-Out,change
2,Resort Hotel,No Cancelado,7.000000,2015.0,July,27.0,1.0,0.000000,1.000000,1.0,0.0,0.0,BB,United Kingdom,Unknown,Direct,First time,0.0,0.0,A,C,0.0,Unknown,Unknown,0.0,Transient,75.0,0.0,0.0,Check-Out,change
3,Resort Hotel,No Cancelado,13.000000,2015.0,July,27.0,1.0,0.000000,1.000000,1.0,0.0,0.0,BB,United Kingdom,Corporate,Corporate,First time,0.0,0.0,A,A,0.0,304.0,Unknown,0.0,Transient,75.0,0.0,0.0,Check-Out,equal
4,Resort Hotel,No Cancelado,14.000000,2015.0,July,,1.0,0.000000,2.000000,2.0,0.0,0.0,BB,Unknown,Online TA,TA/TO,First time,0.0,0.0,A,A,0.0,240.0,Unknown,0.0,Transient,98.0,0.0,1.0,Check-Out,equal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182872,Unknown,Unknown,224.337762,,Unknown,,,19.005545,19.819823,2.0,0.0,0.0,Unknown,Unknown,Unknown,Unknown,First time,0.0,0.0,Unknown,Unknown,0.0,Unknown,Unknown,0.0,Unknown,,0.0,0.0,Unknown,change
182873,Unknown,Unknown,390.141963,,Unknown,,,14.751794,19.989726,2.0,0.0,0.0,Unknown,Unknown,Unknown,Unknown,First time,0.0,0.0,Unknown,Unknown,0.0,Unknown,Unknown,0.0,Unknown,,0.0,0.0,Unknown,change
182874,Unknown,Unknown,230.689826,,Unknown,,,11.409496,20.461372,2.0,0.0,0.0,Unknown,Unknown,Unknown,Unknown,First time,0.0,0.0,Unknown,Unknown,0.0,Unknown,Unknown,0.0,Unknown,,0.0,0.0,Unknown,change
182875,Unknown,Unknown,304.888534,,Unknown,,,16.744472,15.400773,2.0,0.0,0.0,Unknown,Unknown,Unknown,Unknown,First time,0.0,0.0,Unknown,Unknown,0.0,Unknown,Unknown,0.0,Unknown,,0.0,0.0,Unknown,change


In [41]:
col_eliminar= ['reservation_status_date','0']
for col_el in col_eliminar:
    df.drop(columns = col_eliminar, axis=1, inplace=True)

KeyError: "['reservation_status_date' '0'] not found in axis"

In [43]:
col_impute = ['adr', 'lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights']

In [44]:
df_copia = df.copy()

In [45]:
## IterativeImputer
# instanciamos las clases
imputer_iterative = IterativeImputer(max_iter = 20, random_state = 42)

# ajustamos y tranformamos los datos
imputer_iterative_imputado = imputer_iterative.fit_transform(df_copia[col_impute])

# asignamos los datos a columnas nuevas
df_copia[['adr_i', 'lead_time_i', 'stays_in_weekend_nights_i', 'stays_in_week_nights_i']] = imputer_iterative_imputado

In [46]:
## KNNImputer
# instanciamos la clase del KNNImputer
imputer_knn = KNNImputer(n_neighbors = 5)

# ajustamos y transformamos los datos
imputer_knn_imputado = imputer_knn.fit_transform(df_copia[col_impute])

# asignamos los datos a columnas nuevas
df_copia[['adr_k', 'lead_time_k', 'stays_in_weekend_nights_k', 'stays_in_week_nights_k']] = imputer_knn_imputado


In [47]:
df_copia.describe()[['adr','adr_i', 'adr_k', 'lead_time', 'lead_time_i','lead_time_k', 'stays_in_weekend_nights','stays_in_weekend_nights_i','stays_in_weekend_nights_k',  'stays_in_week_nights', 'stays_in_week_nights_i', 'stays_in_week_nights_k']]

Unnamed: 0,adr,adr_i,adr_k,lead_time,lead_time_i,lead_time_k,stays_in_weekend_nights,stays_in_weekend_nights_i,stays_in_weekend_nights_k,stays_in_week_nights,stays_in_week_nights_i,stays_in_week_nights_k
count,118004.0,119836.0,119836.0,118105.0,119836.0,119836.0,118105.0,119836.0,119836.0,118105.0,119836.0,119836.0
mean,101.917853,101.951758,101.898759,103.126238,103.126238,103.126238,0.944238,0.944238,0.944238,2.520092,2.520092,2.520092
std,50.722452,50.346828,50.342569,106.344216,105.573359,105.573359,1.084869,1.077005,1.077005,1.984821,1.970434,1.970434
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,69.12,70.0,70.0,18.0,18.0,18.0,0.0,0.0,0.0,1.0,1.0,1.0
50%,94.5,95.0,95.0,68.0,70.0,70.0,1.0,1.0,1.0,2.0,2.0,2.0
75%,126.0,125.9,125.5025,159.0,158.0,158.0,2.0,2.0,2.0,3.0,3.0,3.0
max,5400.0,5400.0,5400.0,737.0,737.0,737.0,19.914715,19.914715,19.914715,50.0,50.0,50.0


In [49]:
df_copia.drop(['adr','adr_i', 'lead_time', 'lead_time_i', 'stays_in_weekend_nights','stays_in_weekend_nights_i',  'stays_in_week_nights', 'stays_in_week_nights_i'], axis = 1, inplace = True)


In [50]:
new_column_name = {'adr_k' :'adr', 'lead_time_k':'lead_time','stays_in_weekend_nights_k':'stays_in_weekend_nights','stays_in_week_nights_k':'stays_in_week_nights'}

df_copia.rename(columns = new_column_name, inplace = True)

In [51]:
df_copia.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119836 entries, 0 to 182876
Data columns (total 31 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119836 non-null  object 
 1   is_canceled                     119836 non-null  object 
 2   arrival_date_year               112076 non-null  float64
 3   arrival_date_month              119836 non-null  object 
 4   arrival_date_week_number        99667 non-null   float64
 5   arrival_date_day_of_month       117886 non-null  float64
 6   adults                          119836 non-null  float64
 7   children                        119836 non-null  float64
 8   babies                          119836 non-null  float64
 9   meal                            119836 non-null  object 
 10  country                         119836 non-null  object 
 11  market_segment                  119836 non-null  object 
 12  distribution_cha

In [68]:
# cambiar tipos de datos

float_to_object = ['company', 'agent']

float_to_int = ['children', 'previous_cancellations', 'previous_bookings_not_canceled', 'days_in_waiting_list', 'required_car_parking_spaces', 'total_of_special_requests', 'booking_changes', 'adults', 'babies', 'lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights']
# tienen nulos, no se pueden pasar a int: 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month',

In [67]:
df_copia[float_to_int].isna().sum()

children                             0
previous_cancellations               0
arrival_date_day_of_month         1950
previous_bookings_not_canceled       0
days_in_waiting_list                 0
required_car_parking_spaces          0
total_of_special_requests            0
booking_changes                      0
adults                               0
babies                               0
lead_time                            0
stays_in_weekend_nights              0
stays_in_week_nights                 0
dtype: int64

In [59]:
# columna a integer:
def to_int(num):
    """
    Convierte una cadena que representa un número a tipo de dato integer

    Args:
        num (str): Una cadena que representa un número entero.

    Returns:
        int: El número equivalente en formato de tipo integer.
    """
    try:
        return int(num)
    except:
        return np.nan

In [71]:
for col_float in float_to_int:
    df_copia[col_float] = df_copia[col_float].apply(to_int).astype(int)

In [72]:
df_copia.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119836 entries, 0 to 182876
Data columns (total 31 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119836 non-null  object 
 1   is_canceled                     119836 non-null  object 
 2   arrival_date_year               112076 non-null  float64
 3   arrival_date_month              119836 non-null  object 
 4   arrival_date_week_number        99667 non-null   float64
 5   arrival_date_day_of_month       117886 non-null  float64
 6   adults                          119836 non-null  int64  
 7   children                        119836 non-null  int64  
 8   babies                          119836 non-null  int64  
 9   meal                            119836 non-null  object 
 10  country                         119836 non-null  object 
 11  market_segment                  119836 non-null  object 
 12  distribution_cha

In [74]:
df_copia['total_nights'] = df_copia['stays_in_weekend_nights'] + df_copia['stays_in_week_nights']
df_copia

Unnamed: 0,hotel,is_canceled,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,reservation_status,change_room,adr,lead_time,stays_in_weekend_nights,stays_in_week_nights,total_nights
0,Resort Hotel,No Cancelado,2015.0,July,27.0,1.0,2,0,0,BB,Portugal,Unknown,Direct,First time,0,0,C,C,3,Unknown,Unknown,0,Transient,0,0,Check-Out,equal,0.000,342,0,0,0
1,Resort Hotel,No Cancelado,2015.0,July,27.0,1.0,2,0,0,BB,Unknown,Unknown,Direct,First time,0,0,Unknown,C,4,Unknown,Unknown,0,Transient,0,0,Check-Out,change,0.000,737,0,0,0
2,Resort Hotel,No Cancelado,2015.0,July,27.0,1.0,1,0,0,BB,United Kingdom,Unknown,Direct,First time,0,0,A,C,0,Unknown,Unknown,0,Transient,0,0,Check-Out,change,75.000,7,0,1,1
3,Resort Hotel,No Cancelado,2015.0,July,27.0,1.0,1,0,0,BB,United Kingdom,Corporate,Corporate,First time,0,0,A,A,0,304.0,Unknown,0,Transient,0,0,Check-Out,equal,75.000,13,0,1,1
4,Resort Hotel,No Cancelado,2015.0,July,,1.0,2,0,0,BB,Unknown,Online TA,TA/TO,First time,0,0,A,A,0,240.0,Unknown,0,Transient,0,1,Check-Out,equal,98.000,14,0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182872,Unknown,Unknown,,Unknown,,,2,0,0,Unknown,Unknown,Unknown,Unknown,First time,0,0,Unknown,Unknown,0,Unknown,Unknown,0,Unknown,0,0,Unknown,change,93.054,224,19,19,38
182873,Unknown,Unknown,,Unknown,,,2,0,0,Unknown,Unknown,Unknown,Unknown,First time,0,0,Unknown,Unknown,0,Unknown,Unknown,0,Unknown,0,0,Unknown,change,62.622,390,14,19,33
182874,Unknown,Unknown,,Unknown,,,2,0,0,Unknown,Unknown,Unknown,Unknown,First time,0,0,Unknown,Unknown,0,Unknown,Unknown,0,Unknown,0,0,Unknown,change,91.604,230,11,20,31
182875,Unknown,Unknown,,Unknown,,,2,0,0,Unknown,Unknown,Unknown,Unknown,First time,0,0,Unknown,Unknown,0,Unknown,Unknown,0,Unknown,0,0,Unknown,change,135.770,304,16,15,31


In [75]:
df_copia.to_csv('../data/bookings_clean.csv')