In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df_raw = pd.read_parquet ("../data/reservas_hoteles.parquet")

In [3]:
df = df_raw.copy()

In [4]:
df[["fecha_reserva","inicio_estancia", "final_estancia"]] = df[["fecha_reserva","inicio_estancia", "final_estancia"]].apply(pd.to_datetime)
df["ciudad"].replace("",np.nan, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["ciudad"].replace("",np.nan, inplace=True)


In [5]:
df_competencia = df[df["competencia"] == True]
df_competencia = df_competencia[["id_hotel"]].drop_duplicates()
df_competencia = df_competencia.reset_index(drop=True)

In [6]:
df_scrapeo_competencia = pd.read_parquet ("../data/scrapeo_hoteles.parquet")
df_scrapeo_competencia ["precio_noche"] = df_scrapeo_competencia ["precio_noche"].astype(float)
df_scrapeo_competencia ["estrellas"] = df_scrapeo_competencia ["estrellas"].astype(float)


In [7]:
df_competencia_unido = pd.concat([df_competencia, df_scrapeo_competencia], axis=1)


In [8]:
# rellenar df con los datos de df_competencia_unido
df_final = df.merge(df_competencia_unido, on= "id_hotel", how="left", suffixes=("", "_nuevo"))
columnas_actualizar = ["fecha_reserva", "nombre_hotel", "precio_noche", "estrellas"]
for col in columnas_actualizar:
    df_final[col] = df_final[col].mask(
        df_final[col].isna() | (df_final[col] == ""),
        df_final[f"{col}_nuevo"]
    )
df_final = df_final[df.columns]


In [9]:
df_final.loc[df_final["competencia"] == True, "inicio_estancia"] = df_final["inicio_estancia"].fillna("2025-03-01")
df_final.loc[df_final["competencia"] == True, "final_estancia"] = df_final["final_estancia"].fillna("2025-03-02")
df_final.loc[df_final["competencia"] == True, "ciudad"] = df_final["ciudad"].fillna("Madrid")


In [10]:
df_propios = df[(df["competencia"] == False)]


In [11]:
df_propios_unicos = df_propios [["nombre_hotel"]].drop_duplicates().reset_index(drop=True)
df_propios_unicos ["id_hotel"] = range(1,len(df_propios_unicos)+1)
df_propios_unicos = df_propios_unicos.set_index("nombre_hotel")["id_hotel"]


In [12]:
precio_medio = df_propios.groupby ("nombre_hotel")["precio_noche"].mean().round(2)


In [13]:
valoracion_media = df_propios.groupby ("nombre_hotel")["estrellas"].mean().round(2)


In [14]:
df_final.loc[(df["competencia"] == False), "precio_noche"] = df_final["nombre_hotel"].map(precio_medio)
df_final.loc[(df["competencia"] == False), "estrellas"] = df_final["nombre_hotel"].map(valoracion_media)
df_final.loc[(df["competencia"] == False), "id_hotel"] = df_final["nombre_hotel"].map(df_propios_unicos)


In [15]:
df_clientes_unicos = df [["mail"]].drop_duplicates().reset_index(drop=True)
df_clientes_unicos ["id_cliente"] = range(1,len(df_clientes_unicos)+1)
df_clientes_unicos = df_clientes_unicos.set_index("mail")["id_cliente"]



In [16]:
df_final["id_cliente"] = df_final ["mail"].map(df_clientes_unicos)


In [17]:
df_final["id_ciudad"] = 1


In [18]:
ciudad_conteo = df_final ["ciudad"].value_counts()


In [19]:
df_ciudad = df_final [["ciudad", "id_ciudad"]].drop_duplicates()


In [20]:
df_ciudad.to_parquet("../data/ciudad.parquet", index=False)

In [21]:
df_final.to_parquet("../data/dataset_hoteles_limpio.parquet", index=False)

In [22]:
df_hotel = df_final[["id_hotel", "nombre_hotel", "competencia", "estrellas", "id_ciudad"]].drop_duplicates()


In [23]:
df_hotel.to_parquet("../data/hoteles.parquet", index=False)

In [24]:
df_cliente = df_final [["id_cliente", "nombre", "apellido", "mail"]].drop_duplicates()


In [25]:
df_cliente.to_parquet("../data/clientes.parquet", index=False)