### Librerias necesarias

In [63]:
import mysql.connector
import pandas as pd
import numpy as np
from datetime import datetime
from mysql.connector import errorcode


### Conexion a MySQL para extraer tabla

In [None]:
try:
    # Intentar conectar a la base de datos
    cnx = mysql.connector.connect(
        user='INSERTAR',
        password='INSERTAR',
        database='INSERTAR',
        host='INSERTAR',
        charset='utf8mb4'
    )
    mycursor = cnx.cursor()  # Crear cursor

    # Nombre de la tabla a extraer
    tabla = "Tourist_Accommodation"

    # Ejecutar consulta para obtener los datos
    mycursor.execute(f"SELECT * FROM {tabla}")
    columnas = [desc[0] for desc in mycursor.description]  # Obtener nombres de columnas
    datos = mycursor.fetchall()  # Obtener datos

    # Crear DataFrame
    df = pd.DataFrame(datos, columns=columnas)

    # Imprimir los primeros registros
    print(df.head())

except mysql.connector.Error as err:
    # Manejo de errores específicos
    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("Usuario o contraseña incorrectos")
    elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("La base de datos especificada no existe")
    else:
        print(f"Error inesperado: {err}")

finally:
    # Cerrar conexión
    try:
        if 'mycursor' in locals() and mycursor:
            mycursor.close()
        if 'cnx' in locals() and cnx.is_connected():
            cnx.close()
    except Exception as e:
        print(f"Error al cerrar la conexión: {e}")

## Trabajo en Pandas

### Comprobar dataset

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7001 entries, 0 to 7000
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   apartment_id                 7001 non-null   int64  
 1   name                         6998 non-null   object 
 2   description                  6972 non-null   object 
 3   host_id                      7001 non-null   int64  
 4   neighbourhood_name           7001 non-null   object 
 5   neighbourhood_district       4241 non-null   object 
 6   room_type                    7001 non-null   object 
 7   accommodates                 7001 non-null   int64  
 8   bathrooms                    6969 non-null   object 
 9   bedrooms                     6972 non-null   object 
 10  beds                         6998 non-null   float64
 11  amenities_list               6984 non-null   object 
 12  price                        6870 non-null   float64
 13  minimum_nights    

### Cambio de tipo de datos de fechas

In [66]:
df['last_review_date'] = pd.to_datetime(df['last_review_date'], format='%d/%m/%Y')
df['insert_date'] = pd.to_datetime(df['insert_date'], format='%d/%m/%Y')

### Funcion para borrar duplicados basados en la fecha mas reciente

In [67]:
def mantener_fecha_mas_reciente(df, columna_id, columna_fecha):
    """
    Elimina duplicados basados en un ID, manteniendo la fila con la fecha más reciente.
    
    Parámetros:
    df (DataFrame): El DataFrame que contiene los datos.
    columna_id (str): El nombre de la columna que identifica los duplicados.
    columna_fecha (str): El nombre de la columna que contiene las fechas.

    Retorna:
    DataFrame: Un nuevo DataFrame con los IDs únicos y sus fechas más recientes.
    """

    # Ordenar por fecha descendente
    df = df.sort_values(by=columna_fecha, ascending=False)

    # Eliminar duplicados manteniendo el más reciente
    df = df.drop_duplicates(subset=columna_id, keep='first')

    return df



### Ejecutar funcion al dataset con el ID del apartamento y la fecha de insersion 

In [68]:
df_limpio = mantener_fecha_mas_reciente(df, columna_id='apartment_id', columna_fecha='insert_date')
df_limpio

Unnamed: 0,apartment_id,name,description,host_id,neighbourhood_name,neighbourhood_district,room_type,accommodates,bathrooms,bedrooms,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,is_instant_bookable,reviews_per_month,country,city,insert_date
4067,18263766,Apartamento 1a linea mar con parquing,"Apartamento de 90 m2, primera l�nea de mar, en...",126077779,Castell-Platja d'Aro,,Entire home/apt,6,2,3,...,90.0,100.0,100.0,100.0,90.0,FALSO,101.0,spain,girona,2021-02-27
5294,21577762,Rustic Chic Studio Downtown (C80),Welcome to our brand new apartment in the cent...,5890675,Centro,,Entire home/apt,2,1,1,...,100.0,100.0,100.0,100.0,100.0,VERDADERO,245.0,spain,malaga,2021-02-27
461,1584877,Sea view penthouse in Calan Porter,Lovely and cozy sea view penthouse in the resi...,8443770,Alaior,,Entire home/apt,4,1,2,...,90.0,90.0,100.0,90.0,90.0,VERDADERO,55.0,spain,menorca,2021-02-27
5637,22626728,Atic Colera,"Atico con mucha mucha luz, con 2 terrazas exte...",166569559,Colera,,Entire home/apt,6,2,3,...,90.0,100.0,100.0,100.0,100.0,FALSO,31.0,spain,girona,2021-02-27
5078,20854917,Estudio para 2 personas zona Arenal,Este estudio de 25 m2 presenta una decoraci�n ...,137959507,Arenal,Casco Antiguo,Hotel room,2,1,1,...,100.0,100.0,100.0,100.0,100.0,VERDADERO,4.0,spain,sevilla,2021-02-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,12265320,"Vila Olimp,Strandblick,Privatzimmer",Grosse&helle Wohnung. 2 private Zimmer (4 Pers...,13561936,la Vila Ol�mpica del Poblenou,Sant Mart�,Private room,2,2,1,...,100.0,100.0,100.0,100.0,100.0,FALSO,272.0,spain,barcelona,2017-01-04
880,3771149,"Central, double/twin, private bath","Classical-Style apartment, in the centre of Ba...",19327835,l'Antiga Esquerra de l'Eixample,Eixample,Private room,2,3,1,...,100.0,90.0,90.0,90.0,90.0,FALSO,228.0,spain,barcelona,2017-01-04
895,3810717,Amazing flat in the very center,"Modern, beautiful and renewed flat at Born. Ve...",19587234,"Sant Pere, Santa Caterina i la Ribera",Ciutat Vella,Entire home/apt,2,1,1,...,100.0,100.0,100.0,100.0,100.0,FALSO,33.0,spain,barcelona,2017-01-04
902,3847290,Banyoles,Amplia habitaci�n exterior muy luminosa co...,19681580,el Barri G�tic,Ciutat Vella,Private room,2,1,1,...,100.0,100.0,100.0,100.0,100.0,FALSO,2.0,spain,barcelona,2017-01-04


#### Numero de registros duplicados

In [69]:
total_registros = len(df)
valores_unicos = df['apartment_id'].nunique()
duplicados = total_registros - valores_unicos
print(f"Registros duplicados: {duplicados}")
print(f"Registros que deberían quedar: {valores_unicos}")

Registros duplicados: 268
Registros que deberían quedar: 6733


### Normalizacion de "Amenities"

#### Diccionario de equivalencias 

In [70]:
equivalencias = {
    # Categoría: Conectividad
    "wifi": ["wifi", "wi-fi", "wireless connection", "wireless internet", "wifi – 100 mbps", "pocket wifi"],
    
    # Categoría: Entretenimiento
    "tv": ["tv", "television", "cable tv", "smart tv", "netflix", "43 hdtv with netflix", "dvd player"],
    
    # Categoría: Climatización
    "air conditioning": ["air conditioning", "ac", "central air conditioning", "standing valet", "electric profiling bed", "ceiling fan"],
    "heating": ["heating", "central heating"],

    # Categoría: Cocina
    "nespresso machine": ["nespresso machine", "espresso machine", "coffee maker", "pour-over coffee", "pour over coffee"],
    "oven": ["oven", "convection oven", "stainless steel oven", "double oven"],
    "stove": ["stove", "electric stove", "gas oven", "stainless steel stove"],
    "toaster": ["toaster", "bread maker"],
    "baking sheet": ["baking sheet"],
    "dishwasher": ["dishwasher"],
    "kitchen": ["kitchen", "full kitchen", "chef's kitchen", "kitchenette", "cooking basics"],
    "microwave": ["microwave", "microwave oven"],

    # Categoría: Habitaciones y camas
    "bed linens": ["bed linens", "extra pillows and blankets", "memory foam mattress", "pillow-top mattress", "firm mattress"],
    "crib": ["crib", "pack u2019n play/travel crib", "pack 'n play/travel crib"],

    # Categoría: Baños
    "bathtub": ["bathtub", "soaking tub", "hot tub"],
    "shower": ["roll-in shower", "rain shower", "shower chair", "step-free shower", "handheld shower head"],
    "bathroom essentials": ["toilet paper", "conditioner", "shampoo", "body soap", "bathroom essentials"],
    "hot water": ["hot water", "hot water kettle"],

    # Categoría: Accesibilidad
    "wheelchair accessible": ["wheelchair accessible", "flat path to guest entrance", "no stairs or steps to enter", "step-free access"],
    "wide clearance": ["wide clearance to shower & toilet", "wide clearance to bed", "wide doorway", "wide hallways"],
    "accessible-height bed": ["accessible-height bed", "accessible-height toilet"],

    # Categoría: Exterior
    "balcony": ["balcony", "patio or balcony"],
    "garden or backyard": ["garden or backyard", "shared garden or backyard"],
    "outdoor seating": ["outdoor seating", "terrace", "bbq grill"],
    "beachfront": ["beachfront", "beach essentials", "waterfront"],

    # Categoría: Niños y familia
    "children's amenities": ["children s books and toys", "children s dinnerware", "baby bath", "changing table", "stair gates", "table corner guards"],
    "high chair": ["high chair"],

    # Categoría: Seguridad
    "fire safety": ["fire extinguisher", "smoke alarm", "carbon monoxide alarm", "carbon monoxide detector", "fireplace guards"],
    "security system": ["security system"],

    # Categoría: Gimnasio y lujo
    "gym": ["gym", "exercise equipment"],
    "bluetooth sound system": ["bluetooth sound system", "sound system"],
    "heated towel rack": ["heated towel rack"],
    "wine glasses": ["wine glasses"],

    # Categoría: Estacionamiento
    "free parking": ["free parking on premises", "free parking on street", "free street parking"],
    "paid parking": ["paid parking on premises", "paid parking off premises", "paid parking garage"],

    # Categoría: Otros
    "iron": ["iron", "ironing board"],
    "workspace": ["dedicated workspace", "laptop-friendly workspace", "office", "printer"],
    "lockbox": ["lockbox"],
    "hangers": ["hangers"]
}
# Se puede actualizar el diccionario, segun se ingresen mas caracteristicas 



#### Funcion que normaliza las amenities


In [71]:
def normalizar_amenities(amenities, equivalencias):
    if not amenities:  # Si 'amenities' es None o vacío
        return ''  # Retornamos un string vacío
    normalized = set()  # Usamos un conjunto para evitar duplicados
    for key, synonyms in equivalencias.items():
        for synonym in synonyms:
            if synonym.lower() in amenities.lower():
                normalized.add(key)
    return ', '.join(normalized)


#### Normalizamos las amenities y creamos una nueva columna con las amenities normalizadas 

In [76]:
# Aplicamos la función a la columna 'amenities'
df_limpio['normalized_amenities'] = df_limpio['amenities_list'].apply(lambda x: normalizar_amenities(x, equivalencias))

# Mostramos el resultado
df_limpio.head()

Unnamed: 0,apartment_id,name,description,host_id,neighbourhood_name,neighbourhood_district,room_type,accommodates,bathrooms,bedrooms,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,is_instant_bookable,reviews_per_month,country,city,insert_date,normalized_amenities
4067,18263766,Apartamento 1a linea mar con parquing,"Apartamento de 90 m2, primera l�nea de mar, en...",126077779,Castell-Platja d'Aro,,Entire home/apt,6,2,3,...,100.0,100.0,100.0,90.0,FALSO,101.0,spain,girona,2021-02-27,"air conditioning, wifi, dishwasher, bed linens..."
5294,21577762,Rustic Chic Studio Downtown (C80),Welcome to our brand new apartment in the cent...,5890675,Centro,,Entire home/apt,2,1,1,...,100.0,100.0,100.0,100.0,VERDADERO,245.0,spain,malaga,2021-02-27,"air conditioning, wifi, bed linens, heating, m..."
461,1584877,Sea view penthouse in Calan Porter,Lovely and cozy sea view penthouse in the resi...,8443770,Alaior,,Entire home/apt,4,1,2,...,90.0,100.0,90.0,90.0,VERDADERO,55.0,spain,menorca,2021-02-27,"microwave, iron, balcony, bathtub, lockbox, ha..."
5637,22626728,Atic Colera,"Atico con mucha mucha luz, con 2 terrazas exte...",166569559,Colera,,Entire home/apt,6,2,3,...,100.0,100.0,100.0,100.0,FALSO,31.0,spain,girona,2021-02-27,"heating, microwave, iron, fire safety, balcony..."
5078,20854917,Estudio para 2 personas zona Arenal,Este estudio de 25 m2 presenta una decoraci�n ...,137959507,Arenal,Casco Antiguo,Hotel room,2,1,1,...,100.0,100.0,100.0,100.0,VERDADERO,4.0,spain,sevilla,2021-02-27,"air conditioning, wifi, heating, high chair, l..."


#### Mostrar la diferencia entre amenities unicas con y sin normalizar

In [77]:
# Extraer las amenidades originales únicas
amenidades_originales = set()
df_limpio['amenities_list'].dropna().apply(lambda x: [amenidades_originales.add(i.strip().lower()) for i in x.split(',')])

# Extraer las amenidades normalizadas únicas
amenidades_normalizadas = set()
df_limpio['normalized_amenities'].dropna().apply(lambda x: [amenidades_normalizadas.add(i.strip().lower()) for i in x.split(',')])

# Mostrar las amenidades únicas
print("Amenidades originales únicas:", len(amenidades_originales))
print("Amenidades normalizadas únicas:", len(amenidades_normalizadas))

Amenidades originales únicas: 308
Amenidades normalizadas únicas: 40
