### Librerias necesarias

In [21]:
import pandas as pd
import numpy as np
import os
import re
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

### Conectamos con la tabla en CSV descargada.

In [22]:
path = r"..//Data//2025_04_28_Sprint03_bruto.csv"

df = pd.read_csv(path)

print(df.head(5))

   apartment_id                                               name  \
0         11964                                 A ROOM WITH A VIEW   
1         21853                               Bright and airy room   
2         32347  Explore Cultural Sights from a Family-Friendly...   
3         35379                  Double 02 CasanovaRooms Barcelona   
4         35801                  Can Torras Farmhouse Studio Suite   

                                         description  host_id  \
0  Private bedroom in our attic apartment. Right ...    45553   
1  We have a quiet and sunny room with a good vie...    83531   
2  Open French doors and step onto a plant-filled...   139939   
3  Room at a my apartment. Kitchen and 2 bathroom...   152232   
4  Lay in bed & watch sunlight change the mood of...   153805   

                neighbourhood_name neighbourhood_district        room_type  \
0                           Centro                    NaN     Private room   
1                         C�rmen

## Trabajo en Pandas

### Comprobar dataset

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   apartment_id                 10000 non-null  int64  
 1   name                         9997 non-null   object 
 2   description                  9862 non-null   object 
 3   host_id                      10000 non-null  int64  
 4   neighbourhood_name           10000 non-null  object 
 5   neighbourhood_district       6079 non-null   object 
 6   room_type                    10000 non-null  object 
 7   accommodates                 10000 non-null  int64  
 8   bathrooms                    9926 non-null   float64
 9   bedrooms                     9930 non-null   float64
 10  beds                         9955 non-null   float64
 11  amenities_list               9983 non-null   object 
 12  price                        9746 non-null   float64
 13  minimum_nights   

### Cambio de tipo de datos de fechas

In [24]:
print(df[['first_review_date','last_review_date','insert_date']])

     first_review_date last_review_date insert_date
0           02/01/2010       05/09/2017  31/07/2018
1           10/10/2014       15/07/2018  10/01/2020
2           05/01/2011       22/07/2019  29/07/2019
3           13/03/2012       04/01/2020  10/01/2020
4           08/07/2011       08/08/2018  19/02/2019
...                ...              ...         ...
9995               NaN              NaN  31/01/2021
9996               NaN              NaN  27/02/2021
9997               NaN              NaN  31/01/2021
9998               NaN              NaN  09/02/2021
9999               NaN              NaN  09/02/2021

[10000 rows x 3 columns]


In [25]:
date_cols = ['first_review_date', 'last_review_date', 'insert_date']
df[date_cols] = df[date_cols].apply(lambda col: pd.to_datetime(col, format='%d/%m/%Y', errors='coerce'))
df = df.astype({'bathrooms': 'Int64', 'bedrooms': 'Int64', 'beds': 'Int64'})

df[['first_review_date','last_review_date','insert_date','bathrooms','bedrooms','beds']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   first_review_date  7396 non-null   datetime64[ns]
 1   last_review_date   7395 non-null   datetime64[ns]
 2   insert_date        10000 non-null  datetime64[ns]
 3   bathrooms          9926 non-null   Int64         
 4   bedrooms           9930 non-null   Int64         
 5   beds               9955 non-null   Int64         
dtypes: Int64(3), datetime64[ns](3)
memory usage: 498.2 KB


### Funcion para borrar duplicados basados en la fecha mas reciente

In [26]:
def mantener_fecha_mas_reciente(df, columna_id, columna_fecha):
    """
    Elimina duplicados basados en un ID, manteniendo la fila con la fecha más reciente.
    
    Parámetros:
    df (DataFrame): El DataFrame que contiene los datos.
    columna_id (str): El nombre de la columna que identifica los duplicados.
    columna_fecha (str): El nombre de la columna que contiene las fechas.

    Retorna:
    DataFrame: Un nuevo DataFrame con los IDs únicos y sus fechas más recientes.
    """

    # Ordenar por fecha descendente
    df = df.sort_values(by=columna_fecha, ascending=False)

    # Eliminar duplicados manteniendo el más reciente
    df = df.drop_duplicates(subset=columna_id, keep='last')

    return df

#### Numero de registros duplicados

In [27]:
total_registros = len(df)
valores_unicos = df['apartment_id'].nunique()
duplicados = total_registros - valores_unicos
print(f"Registros duplicados: {duplicados}")
print(f"Registros que deberían quedar: {valores_unicos}")

Registros duplicados: 350
Registros que deberían quedar: 9650


### Ejecutar funcion al dataset con el ID del apartamento y la fecha de insersion 

In [28]:
df = mantener_fecha_mas_reciente(df, columna_id='apartment_id', columna_fecha='insert_date')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9650 entries, 3092 to 2456
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   apartment_id                 9650 non-null   int64         
 1   name                         9647 non-null   object        
 2   description                  9514 non-null   object        
 3   host_id                      9650 non-null   int64         
 4   neighbourhood_name           9650 non-null   object        
 5   neighbourhood_district       5860 non-null   object        
 6   room_type                    9650 non-null   object        
 7   accommodates                 9650 non-null   int64         
 8   bathrooms                    9584 non-null   Int64         
 9   bedrooms                     9583 non-null   Int64         
 10  beds                         9605 non-null   Int64         
 11  amenities_list               9633 non-null   

In [29]:
df[df['amenities_list'].isna()]

Unnamed: 0,apartment_id,name,description,host_id,neighbourhood_name,neighbourhood_district,room_type,accommodates,bathrooms,bedrooms,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,is_instant_bookable,reviews_per_month,country,city,insert_date
1115,5210987,Single room in Castellana/Chamartin,Very lightfull & modern single room in a luxur...,2563986,Hispanoam�rica,Chamart�n,Private room,1,1,1,...,,,,,,FALSO,,spain,madrid,2020-03-17
1096,5121791,Genial habitaci�n en Gracia,Habitaci�n grande y balc�n propio. Muy luminos...,26471057,la Vila de Gr�cia,Gr�cia,Private room,1,1,1,...,,,,,,FALSO,,spain,barcelona,2020-02-16
5188,21271313,Pol62-Rustica,The villa is in a rural setting surrounded by ...,98002653,Pollen�a,,Entire home/apt,6,2,3,...,60.0,70.0,70.0,80.0,60.0,VERDADERO,7.0,spain,mallorca,2019-12-31
3017,15031239,TIPI on the land of Finca s'Almudaina Montuiri,"Sacred place with beautiful surroundings, be d...",5275866,Montu�ri,,Entire home/apt,2,1,1,...,90.0,100.0,100.0,100.0,100.0,FALSO,7.0,spain,mallorca,2019-10-28
4970,20562314,Art Villa S'Agar�,El Art Villa S�Agar� es una encantadora villa ...,143723965,Castell-Platja d'Aro,,Private room,4,1,1,...,100.0,90.0,100.0,90.0,90.0,FALSO,17.0,spain,girona,2019-06-30
1650,7948633,Mallorca desconocida,"La casa est� muy cerca del pueblo de Lloseta, ...",41916589,Lloseta,,Entire home/apt,16,8,1,...,,,,,,FALSO,,spain,mallorca,2019-06-27
909,3888649,Apartamento muy centrico y acogedor,"Piso a dos minutos del casco antiguo,y diez mi...",20116128,Ciutadella de Menorca,,Private room,2,1,1,...,,,,,,VERDADERO,,spain,menorca,2019-05-30
1497,7127848,Nice appt in Madrid city center,"Wonderful flat, 3 rooms in the heart of Madr...",37334057,Embajadores,Centro,Entire home/apt,3,1,3,...,,,,,,FALSO,,spain,madrid,2018-09-11
5232,21418557,Roomspace Sandoval- Glorieta de Bilbao 2,Moderno apartamento ubicado en un edificio con...,31772865,Trafalgar,Chamber�,Entire home/apt,2,1,1,...,20.0,20.0,20.0,20.0,20.0,FALSO,22.0,spain,madrid,2018-05-14
736,3080652,HABITACION GRANDE CENTRO DE MALAGA,Habitacion grande con un balc�n a la calle en ...,977031,Centro,,Private room,2,1,1,...,,,,,,FALSO,,spain,malaga,2018-04-22


### Normalizacion de "Amenities"

In [30]:
# Paso 1: Limpieza
df['amenities_list'] = df['amenities_list'].fillna('not available')
df['amenities_list'] = df['amenities_list'].apply(
    lambda x: [a.strip() for a in x.split(',')] if isinstance(x, str) else ['not available'])

def clean_amenity_list(lst):
    return [re.sub(r'[\[\]]', '', a.strip().lower()) for a in lst if a.strip()]

df['amenities_list'] = df['amenities_list'].apply(clean_amenity_list)

# Paso 2: Extraer amenities únicas
all_amenities = [item for sublist in df['amenities_list'] for item in sublist]
unique_amenities = sorted(set(all_amenities))

# Paso 3: Vectorizar
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))
X = vectorizer.fit_transform(unique_amenities)

# Paso 4: Calcular similitud y distancia
similarity = cosine_similarity(X)
distance_matrix = np.clip(1 - similarity, 0, 1)

# Paso 5: Clustering con DBSCAN
clustering = DBSCAN(eps=0.3, min_samples=1, metric='precomputed')
labels = clustering.fit_predict(distance_matrix)

# Paso 6: Crear DataFrame y exportar
cluster_df = pd.DataFrame({'cluster_id': labels, 'amenity': unique_amenities
                           }).sort_values(by='cluster_id')

# Mostrar los grupos
cluster_view = defaultdict(list)
for label, amenity in zip(labels, unique_amenities):
    cluster_view[label].append(amenity)

# Contar cuántos grupos únicos hay
num_clusters = len(set(labels))
print(f"\nDe momento hay {len(unique_amenities)} comodidades únicas, dentro de {num_clusters} grupos provicionales.")

view_text = "\n".join(
    f"Grupo {cluster_id + 1}:\n" + ", ".join(sorted(group))
    for cluster_id, group in sorted(cluster_view.items()))

print(view_text)


De momento hay 276 comodidades únicas, dentro de 220 grupos provicionales.
Grupo 0:

Grupo 1:
24-hour check-in
Grupo 2:
40 hdtv, hdtv
Grupo 3:
43 hdtv with netflix
Grupo 4:
accessible-height bed, accessible-height toilet
Grupo 5:
air conditioning, central air conditioning, portable air conditioning
Grupo 6:
amazon echo
Grupo 7:
baby bath
Grupo 8:
baby monitor
Grupo 9:
baby safety gates
Grupo 10:
babysitter recommendations
Grupo 11:
baking sheet
Grupo 12:
balcony, patio or balcony
Grupo 13:
barbecue utensils
Grupo 14:
bath towel
Grupo 15:
bathroom essentials, beach essentials, essentials
Grupo 16:
bathtub, bathtub with bath chair
Grupo 17:
bbq grill
Grupo 18:
beach view
Grupo 19:
beachfront
Grupo 20:
bed linens
Grupo 21:
bedroom comforts
Grupo 22:
bidet
Grupo 23:
bluetooth sound system, ikea nearby bluetooth sound system, sound system
Grupo 24:
body soap, rituals body soap
Grupo 25:
bread maker
Grupo 26:
breakfast, breakfast bar, breakfast table
Grupo 27:
building staff
Grupo 28:
buzze

#### Diccionario de equivalencias 

In [31]:
equivalencias = {
('24-hour check-in'): ['24-hour check-in', 'building staff', 'doorman', 'doorman entry', 'front desk/doorperson', 'host greets you'],
('air conditioning'): ['air conditioning', 'central air conditioning', 'central heating', 'heated floors', 'heating', 'ceiling fan', 'portable fans', 'portable heater'],
('baby essentials'): ['baby bath', 'baby essentials', 'baby monitor', 'baby safety gates', 'changing table', 'high chair', 'crib', 'pack u2019n play/travel crib', 'pack �n play/travel crib'],
('balcony'): ['balcony', 'patio or balcony', 'terrace'],
('barbecue utensils'): ['barbecue utensils', 'bbq grill'],
('bathroom essentials'): ['bath towel', 'bathroom essentials', 'beach essentials', 'essentials', 'body soap', 'conditioner', 'rituals body soap', 'rituals shampoo', 'shampoo', 'shower gel', 'toilet', 'toilet paper'],
('bathroom extras'): ['bidet', 'dryer', 'hair dryer', 'en suite bathroom', 'drying rack for clothing', 'heated towel rack', 'touchless faucets'],
('bathtub'): ['bathtub', 'hot tub', 'soaking tub'],
('beach view'): ['beach view', 'beachfront', 'waterfront'],
('bedroom estras'): ['bedroom comforts', 'bed linens', 'day bed', 'extra pillows and blankets'],
('sound system'): ['amazon echo', 'bluetooth sound system', 'ikea nearby bluetooth sound system', 'sound system'],
('buzzer/wireless intercom'): ['buzzer/wireless intercom'],
('cleaning before checkout'): ['cleaning before checkout', 'cleaning products'],
('storage'): ['clothing storage', 'clothing storage: closet', 'hangers', 'luggage dropoff allowed', 'standing valet'],
('coffee maker'): ['coffee maker', 'espresso machine', 'nespresso machine', 'pour over coffee', 'pour-over coffee'],
('elevator'): ['elevator', 'elevator in building'],
('internet'): ['ethernet connection', 'internet', 'wireless internet', 'wifi', 'wifi u2013 100 mbps', 'pocket wifi'],
('ev charger'): ['ev charger'],
('exercise equipment'): ['exercise equipment', 'gym'],
('family friendly'): ['babysitter recommendations', 'childrenu2019s books and toys', 'children�s books and toys', 'childrenu2019s dinnerware', 'children�s dinnerware', 'family/kid friendly', 'lock on bedroom door', 'table corner guards', 'window guards'],
('fire extinguisher'): ['fire extinguisher'],
('smoke detector'): ['carbon monoxide alarm', 'carbon monoxide detector', 'smoke alarm', 'smoke detector'],
('fireplace guards'): ['fireplace guards'],
('custom bed'): ['firm mattress', 'memory foam mattress', 'pillow-top mattress', 'murphy bed'],
('first aid kit'): ['first aid kit'],
('game console'): ['game console'],
('garden'): ['garden or backyard', 'shared garden or backyard'],
('indoor fireplace'): ['indoor fireplace'],
('ironing board'): ['iron', 'ironing board'],
('kitchen'): ["chef's kitchen", 'full kitchen', 'kitchen', 'kitchenette'],
('kitchen essentials'): ['baking sheet', 'cooking basics', 'dishes and silverware', 'wine glasses', 'bread maker', 'dishwasher', 'freezer', 'microwave', 'mini fridge', 'refrigerator', 'rice maker', 'toaster', 'hot water', 'hot water kettle'],
('dining area'): ['breakfast', 'breakfast table', 'dining area', 'dining table', 'formal dining area', 'outdoor dining area'],
('lake access'): ['lake access'],
('long term stays allowed'): ['long term stays allowed'],
('mosquito net'): ['mosquito net'],
('mountain view'): ['mountain view'],
('mudroom'): ['mudroom'],
('other'): ['other'],
('outdoor furniture'): ['hammock', 'outdoor furniture', 'outdoor seating', 'outlet covers', 'sun loungers'],
('oven'): ['convection oven', 'double oven', 'gas oven', 'oven', 'stainless steel oven', 'steam oven', 'warming drawer'],
('free parking'): ['free parking on premises', 'free driveway parking on premises u2013 1 space', 'free parking on street', 'free street parking', 'parking'],
('paid parking'): ['paid parking garage off premises', 'paid parking garage on premises', 'paid parking off premises', 'paid parking on premises'],
('pets allowed'): ['cat(s)', 'dog(s)', 'other pet(s)', 'pets allowed', 'pets live on this property'],
('pool'): ['pool', 'pool with pool hoist', 'pool toys', 'shared outdoor pool', 'shared pool'],
('private entrance'): ['private entrance'],
('private living room'): ['private living room'],
('room-darkening shades'): ['room-darkening shades'],
('security system'): ['keypad', 'lockbox', 'safety card', 'security system', 'smart lock'],
('self check-in'): ['self check-in'],
('shower'): ['outdoor shower', 'handheld shower head', 'rain shower', 'walk-in shower'],
('ski-in/ski-out'): ['ski in/ski out', 'ski-in/ski-out'],
('smoking allowed'): ['smoking allowed'],
('stair gates'): ['stair gates'],
('stove'): ['electric stove', 'stainless steel electric stove', 'stainless steel stove', 'stove'],
('suitable for events'): ['suitable for events'],
('translation missing:'): ['translation missing: en.hosting_amenity_49', 'translation missing: en.hosting_amenity_50'],
('trash can'): ['trash can'],
('tv'): ['40 hdtv', '43 hdtv with netflix', 'cable tv', 'dvd player', 'hbo go', 'netflix', 'smart tv', 'tv'],
('washer'): ['washer', 'washer u2013u00a0in unit', 'laundromat nearby'],
('well-lit path to entrance'): ['well-lit path to entrance'],
('wheelchair accessible'): ['disabled parking spot', 'electric profiling bed', 'flat', 'flat path to front door', 'flat path to guest entrance', 'ground floor access', 'step-free access', 'no stairs or steps to enter', 
                            'single level home', 'smooth pathway to front door', 'wheelchair accessible', 'wide doorway', 'wide entrance', 'wide entrance for guests', 'wide entryway', 'wide hallway clearance', 'wide hallways'],
('wheelchair accessible bathroom'): ['accessible-height toilet', 'bathtub with bath chair', 'extra space around shower and toilet', 'fixed grab bars for shower', 'fixed grab bars for shower & toilet', 'fixed grab bars for toilet', 
                                     'roll-in shower', 'shower chair', 'step-free shower', 'wide clearance to shower', 'wide clearance to shower & toilet', 'wide doorway to guest bathroom'],
('wheelchair accessible room'): ['accessible-height bed', 'extra space around bed', 'wide clearance to bed'],
('workspace'): ['dedicated workspace', 'laptop friendly workspace', 'laptop-friendly workspace', 'office', 'printer'],
}

#### Funcion que normaliza las amenities


In [32]:
def normalizar_amenities(amenities, equivalencias):
    if not amenities or not isinstance(amenities, str):
        return ''
    
    normalized = set()
    amenity_list = [a.strip().lower() for a in amenities.split(',') if a.strip()]

    for amenity in amenity_list:
        found = False
        for key, synonyms in equivalencias.items():
            if amenity in [s.lower() for s in synonyms]:
                normalized.add(key)
                found = True
                break
        if not found:
            normalized.add(amenity)  # Se conserva tal cual si no está en equivalencias
    
    return ', '.join(sorted(normalized))

#### Normalizamos las amenities y creamos una nueva columna con las amenities normalizadas 

In [33]:
# Aplicamos la función a la columna 'amenities'
df['amenities_normalized'] = df['amenities_list'].apply(lambda x: normalizar_amenities(', '.join(x), equivalencias))

# Mostramos el resultado
df[['amenities_normalized', 'amenities_list']].head()

Unnamed: 0,amenities_normalized,amenities_list
3092,"air conditioning, balcony, bathroom essentials...","[wifi, free parking on premises, free street p..."
9896,"air conditioning, bathroom essentials, bathroo...","[wifi, essentials, shampoo, hair dryer, heatin..."
7927,"24-hour check-in, balcony, bathroom essentials...","[host greets you, cooking basics, hair dryer, ..."
9966,"air conditioning, bathroom essentials, bathroo...","[dryer, oven, essentials, coffee maker, iron, ..."
9857,"air conditioning, bathroom essentials, bathroo...","[cooking basics, hair dryer, smoke alarm, tv, ..."


#### Mostrar la diferencia entre amenities unicas con y sin normalizar

In [34]:
# Extraer las amenidades originales únicas
amenidades_originales = set()
df['amenities_list'].dropna().apply(lambda lst: [amenidades_originales.add(i.strip().lower()) for i in lst])

# Extraer las amenidades normalizadas únicas
amenidades_normalizadas = set()
df['amenities_normalized'].dropna().apply(lambda lst: [amenidades_normalizadas.add(i.strip().lower()) for i in lst])

# Mostrar las amenidades únicas
print("Amenidades originales únicas:", len(amenidades_originales))
print("Amenidades normalizadas únicas:", len(amenidades_normalizadas))

Amenidades originales únicas: 276
Amenidades normalizadas únicas: 36


In [35]:
print(df.isnull().sum().sort_values(ascending=False).head(10))


neighbourhood_district         3790
review_scores_value            2664
review_scores_location         2664
review_scores_checkin          2663
review_scores_accuracy         2658
review_scores_communication    2654
review_scores_cleanliness      2652
review_scores_rating           2649
last_review_date               2548
reviews_per_month              2547
dtype: int64
