In [18]:
import pandas as pd
pd.options.plotting.backend = "plotly"
from sqlalchemy import create_engine
import plotly.express as px
import ast

In [19]:
connection_string = "postgresql+psycopg2://postgres:postgres@localhost:5433/dwh_airbnb"
engine = create_engine(connection_string)

In [20]:
schema_name = "stg"
table_name = "listing"

query = f"SELECT amenities FROM {schema_name}.{table_name};"

df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,amenities
0,"[""Cooking basics"", ""BBQ grill"", ""Single level ..."
1,"[""Ocean view"", ""Cooking basics"", ""42 inch HDTV..."
2,"[""Hangers"", ""Hair dryer"", ""Bed linens"", ""Heati..."
3,"[""Baking sheet"", ""Cooking basics"", ""BBQ grill""..."
4,"[""Elevator"", ""TV"", ""Wifi"", ""Dryer"", ""Kitchen"",..."


In [21]:
amenities_set = set()
for amenities_list in df["amenities"]:
    for amenitie in ast.literal_eval(amenities_list):
        amenities_set.add(amenitie.lower())


In [22]:
len(amenities_set)

107716

In [23]:
import re

amenities_regex = {
    "wifi": 
        r"\b(wi[-\s]?fi|internet|conexión\s+inalámbrica|red\s+inalámbrica|wifi)\b",
    "kitchen": 
        r"\b(kitchen|cocina|cocina\s+completa|cocina\s+equipada)\b",
    "free_parking": 
        r"\b(free\s+parking|estacionamiento\s+gratuito|aparcamiento\s+gratis)\b",
    "air_conditioning":
        r"\b(air\s*conditioning|aire\s+acondicionado|climatizador|ac\s+unit)\b",
    "heating": 
        r"\b(heating|calefacci[oó]n|radiadores|portable\s+heater|calefactor\s+port[aá]til|estufa\s+port[aá]til)\b",
    "washer": 
        r"\b(washer|lavadora|máquina\s+de\s+lavar)\b",
    "dryer": 
        r"\b(dryer|secadora|máquina\s+de\s+secar)\b",
    "tv": 
        r"\b(tv|televisión|televisor|pantalla\s+plana|smart\s*tv|\d{2,3}\s*inch\s*hdtv|\d{2,3}\s*\"\s*hdtv|hdtv\s+con\s+\w+|hdtv)\b",
    "pool": 
        r"\b(pool|piscina|alberca)\b",
    "hot_tub": 
        r"\b(hot\s*tub|jacuzzi|bañera\s+de\s+hidromasaje)\b",
    "self_check_in": 
        r"\b(self[-\s]?check[-\s]?in|llegada\s+autónoma|entrada\s+automática)\b",
    "pets_allowed":
        r"\b(pets\s+allowed|se\s+aceptan\s+mascotas|mascotas\s+permitidas)\b",
    "workspace": 
        r"\b(dedicated\s+workspace|espacio\s+de\s+trabajo|escritorio|zona\s+de\s+trabajo)\b",
    "vegan_shampoo": 
        r"\b(vegan\s+shampoo|champú\s+vegano|champu\s+vegano|champú\s+org[aá]nico|champu\s+org[aá]nico|champú\s+natural|champu\s+natural)\b",
    "vegan_conditioner": 
        r"\b(vegan\s+conditioner|acondicionador\s+vegano|acondicionador\s+orgánico|acondicionador\s+natural)\b",
    "vegan_soap": 
        r"\b(vegan\s+soap|jabón\s+vegano|jabon\s+vegano|jabón\s+orgánico|jabon\s+orgánico|jabón\s+natural|jabon\s+natural)\b",
    "shower_gel": 
        r"\b(shower\s+gel|gel\s+de\s+baño|gel\s+de\s+ducha)\b",
    "essentials": 
        r"\b(essentials|artículos\s+esenciales|s[aá]banas|toallas|jab[óo]n|papel\s+higiénico|dishes|platos|pre-stocking)\b",
    "hair_dryer":
        r"\b(hair\s*dryer|secador\s+de\s+cabello|secador\s+de\s+pelo)\b",
    "iron": 
        r"\b(iron|plancha)\b",
    "shampoo": 
        r"\b(shampoo|champú|champu)\b",
    "conditioner": 
        r"\b(conditioner|acondicionador)\b",
    "smoke_alarm": 
        r"\b(smoke\s+alarm|detector\s+de\s+humo|alarma\s+de\s+humo)\b",
    "carbon_monoxide_alarm": 
        r"\b(carbon\s+monoxide\s+alarm|detector\s+de\s+mon[oó]xido\s+de\s+carbono)\b",
    "first_aid_kit": 
        r"\b(first\s+aid\s+kit|botiqu[ií]n\s+de\s+primeros\s+auxilios)\b",
    "fire_extinguisher": 
        r"\b(fire\s+extinguisher|extintor\s+de\s+incendios|extintor)\b",
    "balcony": 
        r"\b(balcony|balc[óo]n|terraza)\b",
    "crib": 
        r"\b(crib|cuna|cuna\s+para\s+beb[ée])\b",
    "gym": 
        r"\b(gym|gimnasio|sala\s+de\s+ejercicio)\b",
    "bbq_grill": 
        r"\b(bbq\s+grill|parrilla|barbacoa|asador)\b",
    "fireplace": 
        r"\b(fireplace|chimenea)\b",
    "elevator": 
        r"\b(elevator|ascensor)\b",
    "parking": 
        r"\b(parking|estacionamiento|aparcamiento|garage|garaje|estacionamiento\s+cerrado|parking\s+cerrado|carport|cochera|aparcamiento\s+cubierto|parking\s+cubierto)\b",
    "sauna": 
        r"\b(sauna)\b",
    "bathtub": 
        r"\b(bathtub|bañera|tina\s+de\s+baño)\b",
    "coffee_maker": 
        r"\b(coffee\s+maker|cafetera|m[áa]quina\s+de\s+caf[ée])\b",
    "microwave": 
        r"\b(microwave|microondas)\b",
    "refrigerator": 
        r"\b(refrigerator|fridge|nevera|refrigerador)\b",
    "dishwasher":
        r"\b(dishwasher|lavavajillas|lavaplatos)\b",
    "oven": 
        r"\b(oven|horno)\b",
    "toaster": 
        r"\b(toaster|tostadora)\b",
    "blender": 
        r"\b(blender|licuadora)\b",
    "workspace": 
        r"\b(workspace|espacio\s+de\s+trabajo|escritorio)\b",
    "lockbox": 
        r"\b(lockbox|caja\s+de\s+seguridad|caja\s+fuerte)\b",
    "security_cameras": 
        r"\b(security\s+cameras|c[áa]maras\s+de\s+seguridad)\b",
    "garden": 
        r"\b(garden|jard[íi]n|patio|terraza|backyard)\b",
    "ski_in_out":
        r"\b(ski[-\s]?in[-\s]?out|acceso\s+directo\s+a\s+las\s+pistas\s+de\s+esqu[íi])\b",
    "bicycle": 
        r"\b(bicycle|bike|bicicleta)\b",
    "luggage_dropoff": 
        r"\b(luggage\s+drop[-\s]?off|consigna\s+de\s+equipaje|guardar\s+equipaje)\b",
    "long_term_stays": 
        r"\b(long[-\s]?term\s+stays|estancias\s+largas|alquiler\s+mensual)\b",
    "body_soap": 
        r"\b(body\s+soap|jab[oó]n\s+corporal|jab[oó]n\s+de\s+cuerpo|jab[oó]n\s+para\s+el\s+cuerpo)\b",
    "sound_system": 
        r"\b(sound\s+system|sistema\s+de\s+sonido|equipo\s+de\s+m[uú]sica|altavoz\s+bluetooth|bluetooth\s+speaker|enceinte\s+bluetooth)\b",
    "streaming_services": 
        r"\b(netflix|amazon\s+prime\s+video|disney\+|hulu|hbo\s+max|roku|chromecast|dvd\s+player|cable\s+est[aá]ndar|cable\s+premium)\b",
    "window_guards": 
        r"\b(window\s+guards|rejas\s+de\s+ventana|protecciones\s+de\s+ventana)\b",
    "housekeeping": 
        r"\b(housekeeping\s+available|servicio\s+de\s+limpieza\s+disponible|limpieza\s+incluida|limpieza\s+con\s+costo\s+adicional|housekeeping|limpieza|cleaning\s+available\s+during\s+stay|limpieza\s+disponible\s+durante\s+la\s+estancia)\b",
    "gas_stove": 
        r"\b(gas\s+stove|estufa\s+de\s+gas|cocina\s+de\s+gas|fog[oó]n\s+de\s+gas)\b",
    "induction_stove": 
        r"\b(induction\s+stove|estufa\s+de\s+inducci[oó]n|cocina\s+de\s+inducci[oó]n)\b",
    "electric_stove": 
        r"\b(electric\s+stove|estufa\s+el[eé]ctrica|cocina\s+el[eé]ctrica)\b",
    "stove": 
        r"\b(stove|estufa|cocina|cooktop|fogão|hornilla)\b",
    "lock_on_bedroom_door": 
        r"\b(lock\s+on\s+bedroom\s+door|cerradura\s+en\s+la\s+puerta\s+del\s+dormitorio|puerta\s+del\s+dormitorio\s+con\s+cerradura)\b",
    "golf_course_view":
        r"\b(golf\s+course\s+view|vista\s+al\s+campo\s+de\s+golf|vistas\s+al\s+campo\s+de\s+golf)\b",
    "baby_monitor": 
        r"\b(baby\s+monitor|monitor\s+de\s+beb[ée]|vigila\s+beb[ée]s)\b",
    "mountain_view": 
        r"\b(mountain\s+view|vista\s+a\s+la\s+montaña|vistas\s+a\s+la\s+montaña)\b",
    "game_console": 
        r"\b(game\s+console|consola\s+de\s+juegos|videojuegos)\b",
    "chef_service": 
        r"\b(chef\s+service|servicio\s+de\s+chef|chef\s+disponible|cook\s+service|servicio\s+de\s+cocina|cocinero\s+disponible)\b",
    "spa_access": 
        r"\b(spa\s+access|acceso\s+al\s+spa)\b",
    "window_guards": 
        r"\b(window\s+guards|protecciones\s+de\s+ventana|rejas\s+en\s+ventanas|barandillas\s+de\s+ventana)\b",
    "children_books_toys": 
        r"\b(children’s\s+books\s+and\s+toys|libros\s+y\s+juguetes\s+para\s+niños|juguetes\s+infantiles|libros\s+infantiles)\b",
    "exercise_equipment": 
        r"\b(exercise\s+equipment|equipo\s+de\s+ejercicio|equipamiento\s+de\s+ejercicio|gimnasio|gym)\b",
    "waitstaff": 
        r"\b(waitstaff|personal\s+de\s+servicio|camareros|servicio\s+de\s+camareros|butler\s+service|servicio\s+de\s+mayordomo|mayordomo\s+disponible)\b",
    "ski_in_ski_out": 
        r"\b(ski[-\s]?in/ski[-\s]?out|acceso\s+directo\s+a\s+pistas\s+de\s+esqu[íi]|acceso\s+a\s+pistas\s+de\s+esqu[íi])\b",
    "sports_court": 
        r"\b(squash\s+court|cancha\s+de\s+squash|pista\s+de\s+squash|volleyball\s+court|cancha\s+de\s+voleibol|pista\s+de\s+voleibol|bocce\s+ball\s+court|cancha\s+de\s+bochas|pista\s+de\s+petanca|foosball\s+table|batting\s+cage|caja\s+de\s+bateo|racquetball\s+court)\b",
    "table_sports": 
        r"\b(ping\s+pong\s+table|mesa\s+de\s+ping\s+pong|mesa\s+de\s+tenis\s+de\s+mesa|air\s+hockey\s+table|mesa\s+de\s+air\s+hockey|mesa\s+de\s+hockey\s+de\s+aire)\b",
    "breakfast_bar": 
        r"\b(breakfast\s+bar|barra\s+de\s+desayuno|barra\s+para\s+desayunar)\b",
    "clothing_storage": 
        r"\b(clothing\s+storage|walk[-\s]?in\s+closet|closet|wardrobe|dresser|almacenamiento\s+de\s+ropa|vestidor|armario|ropero|c[óo]moda)\b",
    "security_guard": 
        r"\b(security\s+guard|guardia\s+de\s+seguridad|vigilante\s+de\s+seguridad|property\s+manager|)\b",
    "security_system": 
        r"\b(security\s+system|sistema\s+de\s+seguridad|alarma\s+de\s+seguridad)\b",
    "board_games": 
        r"\b(board\s+games|juegos\s+de\s+mesa)\b",
    "baby_bath": 
        r"\b(baby\s+bath|bañera\s+para\s+beb[ée]|bañera\s+de\s+beb[ée])\b",
    "waterfront": 
        r"\b(waterfront|frente\s+al\s+agua|a\s+orillas\s+del\s+agua|ocean\s+view|vista\s+al\s+oc[ée]ano|vistas\s+al\s+mar|beach\s+access|acceso\s+a\s+la\s+playa|lake\s+access|acceso\s+al\s+lag[o|una]|sea\s+view)\b",
    "private_entrance": 
        r"\b(private\s+entrance|entrada\s+privada|acceso\s+privado)\b",
    "bar": 
        r"\b(bar|barra\s+de\s+bar|barra|mini\s+bar|minibar|outdoor\s+bar|bar\s+al\s+aire\s+libre|barra\s+exterior)\b",
    "resort_access": 
        r"\b(resort\s+access|acceso\s+al\s+resort|acceso\s+al\s+complejo\s+tur[íi]stico)\b",
    "bread_maker": 
        r"\b(bread\s+maker|panificadora|m[áa]quina\s+de\s+hacer\s+pan)\b",
    "hot_water_kettle": 
        r"\b(hot\s+water\s+kettle|hervidor\s+de\s+agua\s+caliente|tetera\s+el[ée]ctrica)\b",
    "shared_backyard": 
        r"\b(shared\s+backyard|patio\s+trasero\s+compartido|jard[íi]n\s+compartido)\b",
    "hammock": 
        r"\b(hammock|hamaca)\b",
    "baby_care": 
        r"\b(high\s+chair|children’s\s+dinnerware|)\b",
}


In [24]:
matched_amenities = set()
classified_dict = {}

for classification, _ in amenities_regex.items():
    classified_dict[classification] = []

for amen in amenities_set:
    for classification, regex_exp in amenities_regex.items():
        if re.search(regex_exp, amen):
            matched_amenities.add(amen)
            classified_dict[classification].append(amen)
            

In [25]:
len(matched_amenities)

107716

In [26]:
amenities_set = amenities_set.difference(matched_amenities)

In [27]:
len(amenities_set)

0

In [28]:
print(amenities_set)

set()


In [29]:
print(len(amenities_regex))

86


In [30]:
import os
import json

json_path = os.path.join(os.getcwd(), "amenities.json")

with open(json_path, "w", encoding="utf-8") as f:
    json.dump(amenities_regex, f, ensure_ascii=False, indent=2)
