#### Importación de bibliotecas

In [1]:
import mysql.connector
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

#### Conexión a MySQL

In [None]:
def get_database(host, user, password, database):
    try:
        cnx = mysql.connector.connect(
            host = host,
            user = user,
            password = password,
            database = database
        )

        if cnx.is_connected():
            print(f'Se conectó a la base de datos "{database}" en MySQL')

        cursor = cnx.cursor()
        cursor.execute('SHOW TABLES;')
        
        tables = cursor.fetchall()
        table_names = [table[0] for table in tables]

        print(f'Tablas encontradas: {table_names}.')

        db = {}

        for table in table_names:
            query = f'SELECT * FROM {table};'
            db[table] = pd.read_sql(query, con = cnx) 
        return db

    except mysql.connector.Error as error:
        print(f'Error: {error}')
        return None

    finally:
        if cnx.is_connected():
            cnx.close()
            print('Conexión a MySQL cerrada.')


def rename_dfs():
    for table, df in db.items():
        globals()[table] = df
        print(f'Se creó el dataframe "{table}"')

if __name__ == '__main__':
    host = '212.227.90.6'
    user = 'EquipoE'
    password = 'E1q2u3i4p5oE'
    database = 'Equip_E'

    db = get_database(host, user, password, database)
    rename_dfs()


Se conectó a la base de datos "Equip_E" en MySQL
Tablas encontradas: ['Tourist_Accommodation'].


  db[table] = pd.read_sql(query, con = cnx)


Conexión a MySQL cerrada.
Se creó el dataframe "Tourist_Accommodation"


#### Exploración inicial

In [3]:
df = Tourist_Accommodation
pd.set_option('display.max_columns', None) # ver todas las columnas
df.head()

Unnamed: 0,apartment_id,name,description,host_id,neighbourhood_name,neighbourhood_district,room_type,accommodates,bathrooms,bedrooms,beds,amenities_list,price,minimum_nights,maximum_nights,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,first_review_date,last_review_date,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,is_instant_bookable,reviews_per_month,country,city,insert_date
0,11964,A ROOM WITH A VIEW,Private bedroom in our attic apartment. Right ...,45553,Centro,,Private room,2,2,1,1.0,"TV,Internet,Wifi,Air conditioning,Elevator,Buz...",400.0,3,365,VERDADERO,7,20,40,130,78,02/01/2010,05/09/2017,970.0,100.0,100.0,100.0,100.0,100.0,100.0,FALSO,75.0,spain,malaga,31/07/2018
1,21853,Bright and airy room,We have a quiet and sunny room with a good vie...,83531,C�rmenes,Latina,Private room,1,1,1,1.0,"TV,Internet,Wifi,Air conditioning,Kitchen,Free...",170.0,4,40,VERDADERO,0,0,0,162,33,10/10/2014,15/07/2018,920.0,90.0,90.0,100.0,100.0,80.0,90.0,FALSO,52.0,spain,madrid,10/01/2020
2,32347,Explore Cultural Sights from a Family-Friendly...,Open French doors and step onto a plant-filled...,139939,San Vicente,Casco Antiguo,Entire home/apt,4,1,2,2.0,"TV,Internet,Wifi,Air conditioning,Wheelchair a...",990.0,2,120,VERDADERO,26,31,31,270,148,05/01/2011,22/07/2019,980.0,100.0,100.0,100.0,100.0,100.0,100.0,VERDADERO,142.0,spain,sevilla,29/07/2019
3,35379,Double 02 CasanovaRooms Barcelona,Room at a my apartment. Kitchen and 2 bathroom...,152232,l'Antiga Esquerra de l'Eixample,Eixample,Private room,2,2,1,1.0,"TV,Internet,Wifi,Kitchen,Breakfast,Elevator,Bu...",400.0,2,730,VERDADERO,9,23,49,300,292,13/03/2012,04/01/2020,940.0,100.0,90.0,100.0,100.0,100.0,90.0,VERDADERO,306.0,spain,barcelona,10/01/2020
4,35801,Can Torras Farmhouse Studio Suite,Lay in bed & watch sunlight change the mood of...,153805,Quart,,Private room,5,1,2,5.0,"Wifi,Pool,Free parking on premises,Breakfast,P...",900.0,1,180,VERDADERO,0,19,49,312,36,08/07/2011,08/08/2018,970.0,100.0,100.0,100.0,100.0,100.0,100.0,FALSO,39.0,spain,girona,19/02/2019


##### Valores nulos

In [6]:
df.isnull().sum()

apartment_id                      0
name                              3
description                     138
host_id                           0
neighbourhood_name                0
neighbourhood_district         3921
room_type                         0
accommodates                      0
bathrooms                        74
bedrooms                         70
beds                             45
amenities_list                   17
price                           254
minimum_nights                    0
maximum_nights                    0
has_availability                550
availability_30                   0
availability_60                   0
availability_90                   0
availability_365                  0
number_of_reviews                 0
first_review_date              2604
last_review_date               2605
review_scores_rating           2709
review_scores_accuracy         2718
review_scores_cleanliness      2712
review_scores_checkin          2723
review_scores_communication 

##### Duplicados

In [7]:
df.duplicated().sum()

np.int64(0)

#### 3. Limpieza de datos (Data Cleaning)

##### Tratamiento de duplicados

In [10]:
# Verificar los apartment_id duplicados
duplicados = df['apartment_id'].duplicated().sum()

print(f"El número de apartment_id duplicados es: {duplicados}")

El número de apartment_id duplicados es: 350


In [11]:
# Filtrr las filas con apartment_id duplicados
duplicados_filas = df[df['apartment_id'].duplicated(keep=False)]

# Mostrar las filas duplicadas
duplicados_filas

Unnamed: 0,apartment_id,name,description,host_id,neighbourhood_name,neighbourhood_district,room_type,accommodates,bathrooms,bedrooms,beds,amenities_list,price,minimum_nights,maximum_nights,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,first_review_date,last_review_date,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,is_instant_bookable,reviews_per_month,country,city,insert_date
22,144471,BEST ZONEPL.CATALU�A ROOM X 2BATHROOM +WIFI,HELLO PEOPLE :=) are you on vacation? or busi...,700165,Sant Antoni,Eixample,Private room,2,1,1,1.0,"Wireless Internet,Wheelchair accessible,Elevat...",470.0,2,365,VERDADERO,22,52,82,357,32,04/07/2011,04/08/2016,870.0,80.0,90.0,90.0,90.0,90.0,80.0,FALSO,42.0,spain,barcelona,12/09/2017
23,144471,BEST ZONEPL.CATALU�A ROOM X 2BATHROOM +WIFI,HELLO PEOPLE :=) are you on vacation? or busi...,700165,Sant Antoni,Eixample,Private room,2,1,1,1.0,"Wifi,Wheelchair accessible,Elevator,Heating,Fa...",490.0,2,365,VERDADERO,19,49,79,354,34,04/07/2011,15/08/2018,880.0,80.0,90.0,90.0,90.0,90.0,80.0,FALSO,38.0,spain,barcelona,10/10/2018
24,157327,House in Llofriu (Costa Brava),New rebuilt and furnished house pool bbq If yo...,755634,Forallac,,Entire home/apt,8,5,4,8.0,"TV,Air conditioning,Pool,Kitchen,Free parking ...",,7,90,VERDADERO,27,57,87,362,1,18/08/2015,18/08/2015,800.0,80.0,100.0,100.0,100.0,80.0,80.0,FALSO,2.0,spain,girona,30/04/2020
25,157327,House in Llofriu (Costa Brava),New rebuilt and furnished house pool bbq If yo...,755634,Forallac,,Entire home/apt,8,5,4,8.0,"TV,Air conditioning,Pool,Kitchen,Free parking ...",5000.0,7,60,VERDADERO,26,56,86,361,1,18/08/2015,18/08/2015,800.0,80.0,100.0,100.0,100.0,80.0,80.0,FALSO,3.0,spain,girona,30/08/2018
50,343864,3 BD APT IN THE HEART OF GRACIA,A spacious and comfortable 3 bedrooms apartmen...,1744516,la Vila de Gr�cia,Gr�cia,Entire home/apt,6,2,3,5.0,"TV,Wireless Internet,Kitchen,Buzzer/wireless i...",1390.0,2,1125,,9,28,58,333,4,17/06/2014,07/03/2017,950.0,100.0,100.0,100.0,90.0,100.0,90.0,FALSO,11.0,spain,barcelona,05/06/2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9759,42922000,Hostal Bruselas Hab. #7 con ba�o privado y balc�n,Hostal Bruselas se encuentra en el Centro de M...,335917627,Centro,,Private room,2,1,1,1.0,"Breakfast, Essentials, Hangers, Shampoo, Wifi,...",360.0,2,1125,VERDADERO,30,60,90,90,0,,,,,,,,,,VERDADERO,,spain,malaga,30/11/2020
9788,43193627,Recoletos IX,"Consta de dos dormitorios, dos ba�os, un aseo ...",247570318,Recoletos,Salamanca,Entire home/apt,4,3,2,3.0,"TV,Wifi,Air conditioning,Kitchen,Heating,Washe...",1070.0,30,1125,VERDADERO,30,60,90,365,0,,,,,,,,,,FALSO,,spain,madrid,17/04/2020
9789,43193627,Recoletos IX,"Consta de dos dormitorios, dos ba�os, un aseo ...",247570318,Recoletos,Salamanca,Entire home/apt,4,3,2,3.0,"Iron, TV, Hangers, Smoke alarm, Heating, Air c...",1080.0,30,1125,VERDADERO,30,60,90,365,0,,,,,,,,,,FALSO,,spain,madrid,06/11/2020
9852,43897607,Finca rural Els Ametllers,Hermosa Villa rural en Finca de almendros y ol...,351241882,Inca,,Entire home/apt,8,2,3,5.0,"Smoke alarm, Oven, Extra pillows and blankets,...",1820.0,5,1125,VERDADERO,23,53,83,323,0,,,,,,,,,,VERDADERO,,spain,mallorca,19/09/2020


In [12]:
def resolver_duplicados(df, id_col, date_col):
    """
    Conserva el duplicado más antiguo, elimina el más reciente,
    pero antes completa los valores faltantes del antiguo con los valores del más reciente.
    Devuelve el df sin duplicados y un resumen de los cambios
    """

    df = df.copy()  # para no modificar el df original
    resumen = []

    grupos_duplicados = df[df.duplicated(subset=[id_col], keep=False)]

    for value, group in grupos_duplicados.groupby(id_col):
        if len(group) > 1:
            # ordena por fecha
            group = group.sort_values(by=date_col, ascending=True)
            earliest_index = group.index[0]
            latest_index = group.index[1]

            # completa valores vacíos
            actualizacion = {}
            for col in df.columns:
                if pd.isna(df.at[earliest_index, col]) and not pd.isna(df.at[latest_index, col]):
                    df.at[earliest_index, col] = df.at[latest_index, col]
                    actualizacion[col] = df.at[latest_index, col]

            # si se actualizó, se guarda
            if actualizacion:
                resumen.append({
                    "ID": value,
                    "índice del registro antiguo": earliest_index,
                    "índice del registro eliminado": latest_index,
                    "columnas actualizadas": actualizacion
                })

            # eliminar el registro reciente
            df.drop(index=latest_index, inplace=True)

    resumen_df = pd.DataFrame(resumen)

    return df, resumen_df

df_sin_duplicados, duplicados_borrados = resolver_duplicados(df, id_col='apartment_id', date_col='insert_date')

In [13]:
df = df_sin_duplicados

# pd.set_option('display.max_colwidth', None)
duplicados_borrados

Unnamed: 0,ID,índice del registro antiguo,índice del registro eliminado,columnas actualizadas
0,157327,24,25,{'price': 5000.0}
1,343864,50,51,{'has_availability': 'VERDADERO'}
2,1624014,471,470,{'has_availability': 'VERDADERO'}
3,1895368,515,516,{'has_availability': 'VERDADERO'}
4,2450287,614,615,{'bedrooms': '0'}
5,3163230,759,758,{'has_availability': 'VERDADERO'}
6,3559666,839,838,{'has_availability': 'VERDADERO'}
7,3770072,879,878,{'has_availability': 'VERDADERO'}
8,5080749,1087,1088,"{'first_review_date': '27/10/2018', 'last_revi..."
9,5121791,1096,1095,{'amenities_list': ']'}


In [14]:
conteo_duplicados = df["apartment_id"].value_counts()
duplicados = conteo_duplicados[conteo_duplicados > 1]
duplicados

apartment_id
13966456    2
10005342    2
14582385    2
10713417    2
14326808    2
15402794    2
24038577    2
32161182    2
Name: count, dtype: int64

##### Tratamiento de nulos

In [15]:
# valores nulos
df.isnull().sum()

apartment_id                      0
name                              3
description                     134
host_id                           0
neighbourhood_name                0
neighbourhood_district         3791
room_type                         0
accommodates                      0
bathrooms                        64
bedrooms                         68
beds                             45
amenities_list                   16
price                           238
minimum_nights                    0
maximum_nights                    0
has_availability                534
availability_30                   0
availability_60                   0
availability_90                   0
availability_365                  0
number_of_reviews                 0
first_review_date              2522
last_review_date               2523
review_scores_rating           2625
review_scores_accuracy         2634
review_scores_cleanliness      2628
review_scores_checkin          2639
review_scores_communication 

In [16]:
# Eliminar columnas 'description'
df = df.drop(columns=['description'])

In [24]:
# Imputación de valores nulos en 'neighbourhood_district' utilizando la moda de 'neighbourhood_name' filtrando por 'city'.

df['neighbourhood_district'] = df.groupby(['city', 'neighbourhood_name'])['neighbourhood_district'].transform(
    lambda x: x.fillna(x.mode()[0] if not x.mode().empty else "Desconocido")
)

In [25]:
# Reemplazar nulos por 1 en las columnas específicas
df[['beds', 'bathrooms', 'bedrooms']] = df[['accommodates', 'bathrooms', 'bedrooms']].fillna(1)

In [26]:
# Reemplazar los nulos en 'price' con la mediana del 'price' según 'room_type'
df['price'] = df.groupby('room_type')['price'].transform(lambda x: x.fillna(x.median()))

In [27]:
# df['amenities_list'] = df['amenities_list'].fillna()

In [28]:
df.isnull().sum()

apartment_id                      0
name                              3
host_id                           0
neighbourhood_name                0
neighbourhood_district            0
room_type                         0
accommodates                      0
bathrooms                         0
bedrooms                          0
beds                              0
amenities_list                   16
price                             0
minimum_nights                    0
maximum_nights                    0
has_availability                  0
availability_30                   0
availability_60                   0
availability_90                   0
availability_365                  0
number_of_reviews                 0
first_review_date              2522
last_review_date               2523
review_scores_rating           2625
review_scores_accuracy         2634
review_scores_cleanliness      2628
review_scores_checkin          2639
review_scores_communication    2630
review_scores_location      

#### Corrección de tipos de datos

### 4. Transformación de Datos (Data Transformation)

### 5. Reducción de Datos (Data Reduction)

### Exportación del dataset limpio

In [29]:
registros_totales = len(df)
print(f'Total de registros de la DB: {registros_totales}')

Total de registros de la DB: 9658


In [30]:
df.to_csv("../Data/tourist_accommodation_clean.csv", index=False)