### Librerias necesarias

In [1]:
import mysql.connector
import pandas as pd
import numpy as np
import os
import re
from mysql.connector import errorcode
from dotenv import load_dotenv
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

In [2]:
load_dotenv()

DB_HOST = os.getenv('host')
DB_USER = os.getenv('usuario')
DB_PASSWORD = os.getenv('clave')
DB_DATABASE = os.getenv('db')

### Conexion a MySQL para extraer tabla

In [3]:
try:
    # Intentar conectar a la base de datos
    cnx = mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASSWORD,
        database=DB_DATABASE,
        charset='utf8mb4')

    tabla = "Tourist_Accommodation22042025"

    # Ejecutar consulta para obtener los datos
    mycursor = cnx.cursor(dictionary=True)  # Crear cursor habilitando diccionario
    mycursor.execute(f"SELECT * FROM {tabla}")
    df = pd.DataFrame(mycursor.fetchall())

    # Imprimir los primeros registros
    print(df.head(5))

except mysql.connector.Error as err:
    # Manejo de errores específicos
    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("Usuario o contraseña incorrectos")
    elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("La base de datos especificada no existe")
    else:
        print(f"Error inesperado: {err}")

finally:
    # Cerrar conexión
    try:
        if 'mycursor' in locals() and mycursor:
            mycursor.close()
        if 'cnx' in locals() and cnx.is_connected():
            cnx.close()
    except Exception as e:
        print(f"Error al cerrar la conexión: {e}")

   apartment_id                                               name  \
0         11964                                 A ROOM WITH A VIEW   
1         21853                               Bright and airy room   
2         32347  Explore Cultural Sights from a Family-Friendly...   
3         35379                  Double 02 CasanovaRooms Barcelona   
4         35801                  Can Torras Farmhouse Studio Suite   

                                         description  host_id  \
0  Private bedroom in our attic apartment. Right ...    45553   
1  We have a quiet and sunny room with a good vie...    83531   
2  Open French doors and step onto a plant-filled...   139939   
3  Room at a my apartment. Kitchen and 2 bathroom...   152232   
4  Lay in bed & watch sunlight change the mood of...   153805   

                neighbourhood_name neighbourhood_district        room_type  \
0                           Centro                   None     Private room   
1                         C�rmen

## Trabajo en Pandas

### Comprobar dataset

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   apartment_id                 8000 non-null   int64  
 1   name                         7997 non-null   object 
 2   description                  7946 non-null   object 
 3   host_id                      8000 non-null   int64  
 4   neighbourhood_name           8000 non-null   object 
 5   neighbourhood_district       4861 non-null   object 
 6   room_type                    8000 non-null   object 
 7   accommodates                 8000 non-null   int64  
 8   bathrooms                    7957 non-null   object 
 9   bedrooms                     7961 non-null   object 
 10  beds                         7992 non-null   float64
 11  amenities_list               7983 non-null   object 
 12  price                        7829 non-null   float64
 13  minimum_nights    

### Cambio de tipo de datos de fechas

In [5]:
print(df[['first_review_date','last_review_date','insert_date']])

     first_review_date last_review_date insert_date
0           02/01/2010       05/09/2017  31/07/2018
1           10/10/2014       15/07/2018  10/01/2020
2           05/01/2011       22/07/2019  29/07/2019
3           13/03/2012       04/01/2020  10/01/2020
4           08/07/2011       08/08/2018  19/02/2019
...                ...              ...         ...
7995              None             None  16/10/2019
7996        17/03/2019       09/09/2020  31/01/2021
7997              None             None  24/04/2019
7998        04/03/2019       28/07/2019  12/08/2019
7999        21/04/2019       29/06/2019  31/08/2019

[8000 rows x 3 columns]


In [6]:
df['first_review_date'] = pd.to_datetime(df['first_review_date'], format='%d/%m/%Y')
df['last_review_date'] = pd.to_datetime(df['last_review_date'], format='%d/%m/%Y')
df['insert_date'] = pd.to_datetime(df['insert_date'], format='%d/%m/%Y')

df[['first_review_date','last_review_date','insert_date']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   first_review_date  6386 non-null   datetime64[ns]
 1   last_review_date   6385 non-null   datetime64[ns]
 2   insert_date        8000 non-null   datetime64[ns]
dtypes: datetime64[ns](3)
memory usage: 187.6 KB


### Funcion para borrar duplicados basados en la fecha mas reciente

In [7]:
def mantener_fecha_mas_reciente(df, columna_id, columna_fecha):
    """
    Elimina duplicados basados en un ID, manteniendo la fila con la fecha más reciente.
    
    Parámetros:
    df (DataFrame): El DataFrame que contiene los datos.
    columna_id (str): El nombre de la columna que identifica los duplicados.
    columna_fecha (str): El nombre de la columna que contiene las fechas.

    Retorna:
    DataFrame: Un nuevo DataFrame con los IDs únicos y sus fechas más recientes.
    """

    # Ordenar por fecha descendente
    df = df.sort_values(by=columna_fecha, ascending=False)

    # Eliminar duplicados manteniendo el más reciente
    df = df.drop_duplicates(subset=columna_id, keep='last')

    return df

#### Numero de registros duplicados

In [8]:
total_registros = len(df)
valores_unicos = df['apartment_id'].nunique()
duplicados = total_registros - valores_unicos
print(f"Registros duplicados: {duplicados}")
print(f"Registros que deberían quedar: {valores_unicos}")

Registros duplicados: 307
Registros que deberían quedar: 7693


### Ejecutar funcion al dataset con el ID del apartamento y la fecha de insersion 

In [9]:
df = mantener_fecha_mas_reciente(df, columna_id='apartment_id', columna_fecha='insert_date')
df.head()

Unnamed: 0,apartment_id,name,description,host_id,neighbourhood_name,neighbourhood_district,room_type,accommodates,bathrooms,bedrooms,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,is_instant_bookable,reviews_per_month,country,city,insert_date
3656,17249166,Arquitecte gaudi,Nuevo apartamento c�ntrico con buenos acabados...,109461701,Roses,,Entire home/apt,5,1,2,...,100.0,100.0,100.0,100.0,100.0,FALSO,9.0,spain,girona,2021-02-27
5770,23040910,Apartamento luminoso cerca del mar,Apartamento muy bonito a 200m de la playa y en...,17816909,Ciutadella de Menorca,,Entire home/apt,4,1,1,...,90.0,100.0,100.0,100.0,90.0,VERDADERO,3.0,spain,menorca,2021-02-27
7894,31968299,Villenpark Playa Arena House Flamenco,House with incredible garden and beautiful vie...,173153149,Lloret de Mar,,Entire home/apt,5,1,2,...,,,,,,VERDADERO,,spain,girona,2021-02-27
6749,26207763,Casa Flor.....somewhere different,"Built in the late 1950's, Casa Flor is a detac...",196976392,L'Escala,,Entire home/apt,2,1,1,...,100.0,100.0,100.0,100.0,100.0,FALSO,12.0,spain,girona,2021-02-27
4507,19236052,Apartment with garden & pool - 800 m Pals beac...,Description<br /><br /><br /><br /><br /><br /...,133933726,Pals,,Entire home/apt,7,2,3,...,100.0,80.0,80.0,80.0,80.0,VERDADERO,3.0,spain,girona,2021-02-27


### Normalizacion de "Amenities"

In [None]:
# Paso 1: Limpieza
df_clean = df.dropna(subset=['amenities_list'])
df_clean['amenities_list'] = df_clean['amenities_list'].apply(
    lambda x: [a.strip() for a in x.split(',') if a.strip()])

def clean_amenity_list(lst):
    return [re.sub(r'[\[\]]', '', a.strip().lower()) for a in lst if a.strip()]

df_clean['amenities_list'] = df_clean['amenities_list'].apply(clean_amenity_list)

# Paso 2: Extraer amenities únicas
all_amenities = [item for sublist in df_clean['amenities_list'] for item in sublist]
unique_amenities = sorted(set(all_amenities))

# Paso 3: Vectorizar
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))
X = vectorizer.fit_transform(unique_amenities)

# Paso 4: Calcular similitud y distancia
similarity = cosine_similarity(X)
distance_matrix = np.clip(1 - similarity, 0, 1)

# Paso 5: Clustering con DBSCAN
clustering = DBSCAN(eps=0.3, min_samples=1, metric='precomputed')
labels = clustering.fit_predict(distance_matrix)

# Paso 6: Crear DataFrame y exportar
cluster_df = pd.DataFrame({'cluster_id': labels, 'amenity': unique_amenities
                           }).sort_values(by='cluster_id')

# Mostrar los grupos
cluster_view = defaultdict(list)
for label, amenity in zip(labels, unique_amenities):
    cluster_view[label].append(amenity)

# Contar cuántos grupos únicos hay
num_clusters = len(set(labels))
print(f"\nDe momento hay {len(unique_amenities)} comodidades únicas, dentro de {num_clusters} grupos provicionales.")

view_text = "\n".join(
    f"\nGrupo {cluster_id + 1}:\n" + ", ".join(sorted(group))
    for cluster_id, group in sorted(cluster_view.items()))

print(view_text)

#### Diccionario de equivalencias 

In [None]:
{(-1): [''],
('24-hour check-in'): ['24-hour check-in', 'building staff', 'doorman', 'doorman entry', 'front desk/doorperson'],
('air conditioning'): ['air conditioning', 'central air conditioning', 'central heating', 'heated floors', 'heating'],
('balcony'): ['balcony', 'patio or balcony'],
('barbecue utensils'): ['barbecue utensils', 'bbq grill'],
('bathroom essentials'): ['bath towel', 'bathroom essentials', 'beach essentials', 'essentials', 'body soap', 'conditioner', 'rituals body soap'],
('bathroom extras'): ['bathtub', 'bathtub with bath chair', 'bidet', 'dryer', 'hair dryer', 'en suite bathroom', 'drying rack for clothing'],
('beach view'): ['beach view', 'beachfront'],
('bedroom comforts'): ['bedroom comforts', 'bed linens', 'day bed', 'extra pillows and blankets'],
('sound system'): ['amazon echo', 'bluetooth sound system', 'ikea nearby bluetooth sound system', 'sound system'],
(27): ['buzzer/wireless intercom'],
(30): ['cat(s)', 'dog(s)'],
(31): ['ceiling fan'],
('kitchen essentials'): ['baking sheet', 'breakfast', 'breakfast table', 'cooking basics', 'kitchen', 'dishes and silverware'],
('kitchen extras'): ['bread maker', "chef's kitchen", 'full kitchen', 'dishwasher', 'dining area', 'dining table', 'formal dining area', 'outdoor dining area', 'freezer'],
(37): ['cleaning before checkout'],
(38): ['cleaning products'],
('clothing storage'): ['clothing storage', 'clothing storage: closet', 'hangers'],
('coffee maker'): ['coffee maker', 'espresso machine', 'nespresso machine'],
('workspace'): ['dedicated workspace', 'laptop friendly workspace', 'laptop-friendly workspace'],
('oven'): ['convection oven', 'double oven', 'electric stove', 'stainless steel electric stove', 'stainless steel oven', 'stainless steel stove', 'gas oven'],
('elevator'): ['elevator', 'elevator in building'],
('internet'): ['ethernet connection', 'internet', 'wireless internet', 'wifi', 'wifi u2013 100 mbps', 'pocket wifi'],
(65): ['ev charger'],
('exercise equipment'): ['exercise equipment', 'gym'],
('family friendly'): ['baby bath', 'baby essentials', 'baby monitor', 'baby safety gates', 'babysitter recommendations', 'changing table', 'crib', 'childrenu2019s books and toys', 'children�s books and toys', 'childrenu2019s dinnerware', 'children�s dinnerware', 'family/kid friendly'],
('fire safety'): ['carbon monoxide alarm', 'carbon monoxide detector', 'fire extinguisher', 'fireplace guards'],
(72): ['firm mattress'],
(73): ['first aid kit'],
(83): ['game console'],
('garden'): ['garden or backyard', 'shared garden or backyard'],
(88): ['hammock'],
(89): ['handheld shower head'],
(93): ['heated towel rack'],
(95): ['high chair'],
(96): ['host greets you'],
(97): ['hot tub'],
(98): ['hot water'],
(99): ['hot water kettle'],
(100): ['indoor fireplace'],
(102): ['iron'],
(103): ['ironing board'],
(104): ['keypad'],
(105): ['kitchenette'],
(106): ['lake access'],
(108): ['laundromat nearby'],
(109): ['lock on bedroom door'],
(110): ['lockbox'],
(111): ['long term stays allowed'],
(112): ['luggage dropoff allowed'],
(113): ['memory foam mattress'],
(114): ['microwave'],
(115): ['mini fridge'],
(116): ['mosquito net'],
(117): ['mountain view'],
(118): ['mudroom'],
(119): ['murphy bed'],
(121): ['no stairs or steps to enter'],
(122): ['office'],
(123): ['other'],
(124): ['other pet(s)'],
(125): ['outdoor furniture'],
(126): ['outdoor seating'],
(127): ['outdoor shower'],
(128): ['outlet covers'],
(129): ['oven'],
(130): ['pack u2019n play/travel crib',
 'pack �n play/travel crib'],
('parking'): ['free parking on premises', 'free driveway parking on premises u2013 1 space', 'free parking on street', 'free street parking', 'paid parking garage off premises', 'paid parking garage on premises', 'paid parking off premises', 'paid parking on premises', 'parking'],
(132): ['pets allowed'],
(133): ['pets live on this property'],
(134): ['pillow-top mattress'],
(136): ['pool', 'pool with pool hoist'],
(137): ['pool toys'],
(138): ['portable fans'],
(139): ['portable heater'],
(140): ['pour over coffee', 'pour-over coffee'],
(141): ['printer'],
(142): ['private entrance'],
(143): ['private living room'],
(144): ['rain shower'],
(145): ['refrigerator'],
(146): ['rice maker'],
(147): ['rituals shampoo'],
(148): ['roll-in shower'],
(149): ['room-darkening shades'],
(150): ['safety card'],
(151): ['security system'],
(152): ['self check-in'],
(153): ['shampoo'],
(154): ['shared outdoor pool', 'shared pool'],
(155): ['shower chair'],
(156): ['shower gel'],
(157): ['single level home'],
(158): ['ski in/ski out'],
(159): ['ski-in/ski-out'],
(160): ['smart lock'],
(162): ['smoke alarm'],
(163): ['smoke detector'],
(164): ['smoking allowed'],
(165): ['smooth pathway to front door'],
(166): ['soaking tub'],
(167): ['stair gates'],
(168): ['standing valet'],
(169): ['steam oven'],
(172): ['stove'],
(173): ['suitable for events'],
(174): ['sun loungers'],
(175): ['table corner guards'],
(176): ['terrace'],
(177): ['toaster'],
(178): ['toilet'],
(179): ['toilet paper'],
(180): ['touchless faucets'],
(181): ['translation missing: en.hosting_amenity_49',
 'translation missing: en.hosting_amenity_50'],
(182): ['trash can'],
('tv'): ['40 hdtv', '43 hdtv with netflix', 'cable tv', 'dvd player', 'hbo go', 'netflix', 'smart tv', 'tv'],
(184): ['walk-in shower'],
(185): ['warming drawer'],
(186): ['washer'],
(187): ['washer u2013u00a0in unit'],
(188): ['waterfront'],
(189): ['well-lit path to entrance'],
('wheelchair accessible'): ['accessible-height bed', 'accessible-height toilet', 'fixed grab bars for shower', 'fixed grab bars for shower & toilet', 'fixed grab bars for toilet', 'extra space around bed', 'extra space around shower and toilet', 'electric profiling bed', 'flat', 'flat path to front door', 'flat path to guest entrance', 'disabled parking spot', 'ground floor access', 'step-free access', 'step-free shower', 'wheelchair accessible', 'wide clearance to bed', 'wide clearance to shower', 'wide clearance to shower & toilet', 'wide doorway', 'wide doorway to guest bathroom', 'wide entrance', 'wide entrance for guests', 'wide entryway', 'wide hallway clearance', 'wide hallways'],
(200): ['window guards'],
(201): ['wine glasses']}

In [11]:
equivalencias = {
    # Categoría: Conectividad
    "wifi": ["wifi", "wi-fi", "wireless connection", "wireless internet", "wifi – 100 mbps", "pocket wifi"],
    
    # Categoría: Entretenimiento
    "tv": ["tv", "television", "cable tv", "smart tv", "netflix", "43 hdtv with netflix", "dvd player"],
    
    # Categoría: Climatización
    "air conditioning": ["air conditioning", "ac", "central air conditioning", "ceiling fan"],
    "heating": ["heating", "central heating"],

    # Categoría: Cocina
    "nespresso machine": ["nespresso machine", "espresso machine", "coffee maker", "pour-over coffee", "pour over coffee"],
    "oven": ["oven", "convection oven", "stainless steel oven", "double oven"],
    "stove": ["stove", "electric stove", "gas oven", "stainless steel stove"],
    "toaster": ["toaster", "bread maker"],
    "baking sheet": ["baking sheet"],
    "dishwasher": ["dishwasher"],
    "kitchen": ["kitchen", "full kitchen", "chef's kitchen", "kitchenette", "cooking basics"],
    "microwave": ["microwave", "microwave oven"],

    # Categoría: Habitaciones y camas
    "bed linens": ["bed linens", "extra pillows and blankets", "memory foam mattress", "pillow-top mattress", "firm mattress"],
    "crib": ["crib", "pack u2019n play/travel crib", "pack 'n play/travel crib"],

    # Categoría: Baños
    "bathtub": ["bathtub", "soaking tub", "hot tub"],
    "shower": ["roll-in shower", "rain shower", "shower chair", "step-free shower", "handheld shower head"],
    "bathroom essentials": ["toilet paper", "conditioner", "shampoo", "body soap", "bathroom essentials"],
    "hot water": ["hot water", "hot water kettle"],

    # Categoría: Accesibilidad
    "wheelchair accessible": ["wheelchair accessible", "flat path to guest entrance", "no stairs or steps to enter", "step-free access"],
    "wide clearance": ["wide clearance to shower & toilet", "wide clearance to bed", "wide doorway", "wide hallways"],
    "accessible-height bed": ["accessible-height bed", "accessible-height toilet"],

    # Categoría: Exterior
    "balcony": ["balcony", "patio or balcony"],
    "garden or backyard": ["garden or backyard", "shared garden or backyard"],
    "outdoor seating": ["outdoor seating", "terrace", "bbq grill"],
    "beachfront": ["beachfront", "beach essentials", "waterfront"],

    # Categoría: Niños y familia
    "children's amenities": ["children s books and toys", "children s dinnerware", "baby bath", "changing table", "stair gates", "table corner guards"],
    "high chair": ["high chair"],

    # Categoría: Seguridad
    "fire safety": ["fire extinguisher", "smoke alarm", "carbon monoxide alarm", "carbon monoxide detector", "fireplace guards"],
    "security system": ["security system"],

    # Categoría: Gimnasio y lujo
    "gym": ["gym", "exercise equipment"],
    "bluetooth sound system": ["bluetooth sound system", "sound system"],
    "heated towel rack": ["heated towel rack"],
    "wine glasses": ["wine glasses"],

    # Categoría: Estacionamiento
    "free parking": ["free parking on premises", "free parking on street", "free street parking"],
    "paid parking": ["paid parking on premises", "paid parking off premises", "paid parking garage"],

    # Categoría: Otros
    "iron": ["iron", "ironing board"],
    "workspace": ["dedicated workspace", "laptop-friendly workspace", "office", "printer"],
    "lockbox": ["lockbox"],
    "hangers": ["hangers"]
}
# Se puede actualizar el diccionario, segun se ingresen mas caracteristicas 



#### Funcion que normaliza las amenities


In [18]:
def normalizar_amenities(amenities, equivalencias):
    if not amenities:  # Si 'amenities' es None o está vacío
        return ''  # Retornamos un string vacío
    normalized = set()  # Usamos un conjunto para evitar duplicados
    for amenity in amenities.split(','):  # Dividimos por comas las amenidades
        matched = False
        for key, synonyms in equivalencias.items():
            for synonym in synonyms:
                if synonym.lower() in amenity.lower().strip():
                    normalized.add(key)
                    matched = True
                    break
        if not matched:  # Si no coincide con ninguna clave del diccionario
            normalized.add(amenity.lower().strip())  # Agregamos la amenidad como está
    return ', '.join(normalized)  # Retornamos las amenidades normalizadas

#### Normalizamos las amenities y creamos una nueva columna con las amenities normalizadas 

In [19]:
# Aplicamos la función a la columna 'amenities'
df_limpio['normalized_amenities'] = df_limpio['amenities_list'].apply(lambda x: normalizar_amenities(x, equivalencias))

# Mostramos el resultado
df_limpio[['normalized_amenities', 'amenities_list']].head()

Unnamed: 0,normalized_amenities,amenities_list
4067,"heating, private entrance, free parking, kitch...","Cooking basics, Hair dryer, Heating, TV, Bed l..."
5294,"heating, childrenu2019s books and toys], hair ...","Essentials, Kitchen, Hair dryer, Bed linens, T..."
461,"free parking, wifi, oven, kitchen, microwave, ...","Waterfront, Pack u2019n Play/travel crib, Pati..."
5637,"host greets you, heating, private entrance, fr...","Bathtub, Free parking on premises, Free street..."
5078,"heating, kitchen, wifi, essentials, workspace,...","Lockbox, Air conditioning, Carbon monoxide ala..."


#### Mostrar la diferencia entre amenities unicas con y sin normalizar

In [20]:
# Extraer las amenidades originales únicas
amenidades_originales = set()
df_limpio['amenities_list'].dropna().apply(lambda x: [amenidades_originales.add(i.strip().lower()) for i in x.split(',')])

# Extraer las amenidades normalizadas únicas
amenidades_normalizadas = set()
df_limpio['normalized_amenities'].dropna().apply(lambda x: [amenidades_normalizadas.add(i.strip().lower()) for i in x.split(',')])

# Mostrar las amenidades únicas
print("Amenidades originales únicas:", len(amenidades_originales))
print("Amenidades normalizadas únicas:", len(amenidades_normalizadas))

Amenidades originales únicas: 308
Amenidades normalizadas únicas: 180
