Primero cargo la función para la descarga de imágenes

In [2]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm
from skimage import io
from typing import Optional, Union
import cv2
import urllib.request
from sklearn.model_selection import train_test_split

def download_images(paths: list,
                    canvas: tuple = (128, 128),
                    nb_channels: int = 3,
                    max_imgs: Optional[int] = None
                    ) -> tuple:
    n_images = len(paths) if not max_imgs else max_imgs
    images = np.zeros((n_images, canvas[0], canvas[1], nb_channels), dtype=np.uint8)
    downloaded_idxs = []

    for i_img, url in enumerate(tqdm(paths, total=n_images)):
        if i_img >= n_images:
            break
        try:
            img = io.imread(url)
            img = cv2.resize(img, (canvas[0], canvas[1]))
            downloaded_idxs.append(i_img)
            images[i_img] = img
        except (IOError, ValueError) as e:  # Unavailable url / conversion error
            pass
    return images[downloaded_idxs], downloaded_idxs



In [3]:
#descargo el dataset
!wget -O "airbnb-listings.csv" "https://public.opendatasoft.com/explore/dataset/airbnb-listings/download/?format=csv&disjunctive.host_verifications=true&disjunctive.amenities=true&disjunctive.features=true&refine.country=Spain&q=Madrid&timezone=Europe/London&use_labels_for_header=true&csv_separator=%3B"

# Cargo el dataset de airbnb.CSV
df = pd.read_csv("airbnb-listings.csv", sep = ';')

# Filtro filas sin Thumbnail Url o Precio
df = df.dropna(subset=['Thumbnail Url', 'Price'])



--2024-03-03 22:40:01--  https://public.opendatasoft.com/explore/dataset/airbnb-listings/download/?format=csv&disjunctive.host_verifications=true&disjunctive.amenities=true&disjunctive.features=true&refine.country=Spain&q=Madrid&timezone=Europe/London&use_labels_for_header=true&csv_separator=%3B
Resolving public.opendatasoft.com (public.opendatasoft.com)... 34.248.20.69, 34.249.199.226
Connecting to public.opendatasoft.com (public.opendatasoft.com)|34.248.20.69|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/csv]
Saving to: ‘airbnb-listings.csv’

airbnb-listings.csv     [             <=>    ]  52.85M  15.3MB/s    in 3.5s    

2024-03-03 22:40:05 (15.3 MB/s) - ‘airbnb-listings.csv’ saved [55414009]



In [4]:
# Descargo imágenes

downloaded_images, _ = download_images(df['Thumbnail Url'], max_imgs=200)

# Elimino columnas innecesarias después de descargar imágenes
df = df[['Price', 'Property Type', 'Room Type', 'Cancellation Policy', 'Accommodates',
         'Bathrooms', 'Bedrooms', 'Beds', 'Guests Included', 'Extra People',
         'Minimum Nights', 'Maximum Nights', 'Number of Reviews', 'Host Total Listings Count']]



100%|██████████| 200/200 [01:08<00:00,  2.93it/s]


In [5]:
# Divido en train/val/test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)  # 60-70% train, resto validation

#print("El conjunto de entrenamiento tiene dimensiones: ", x_train.shape)
#print("El conjunto de validación tiene dimensiones: ",x_val.shape)
#print("El conjunto de test tiene dimensiones: ",x_test.shape)

In [6]:
# Guardo
train_df.to_csv("train_data.csv", index=False)
val_df.to_csv("val_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)

# Guardo las imágenes descargadas
np.save("downloaded_images.npy", downloaded_images)

Normalizo

In [7]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Selecciono las variables de interés
numeric_features = ['Accommodates',
         'Bathrooms', 'Bedrooms', 'Beds', 'Guests Included', 'Extra People',
         'Minimum Nights', 'Maximum Nights', 'Number of Reviews', 'Host Total Listings Count']

categorical_features = ['Property Type', 'Room Type', 'Cancellation Policy']

# Elijio el transformador de cada columna
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Añado el preprocesamiento a los conjuntos de datos
train_df_numeric_scaled = preprocessor.fit_transform(train_df[numeric_features])
val_df_numeric_scaled = preprocessor.transform(val_df[numeric_features])
test_df_numeric_scaled = preprocessor.transform(test_df[numeric_features])

# Voy a concatenar las características transformadas con el conjunto de datos original
train_df = pd.concat([train_df, pd.DataFrame(train_df_numeric_scaled)], axis=1)
val_df = pd.concat([val_df, pd.DataFrame(val_df_numeric_scaled)], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(test_df_numeric_scaled)], axis=1)

# Elimino las columnas originales con variables numéricas que ya no me sirven
train_df = train_df.drop(columns=numeric_features)
val_df = val_df.drop(columns=numeric_features)
test_df = test_df.drop(columns=numeric_features)

# Aplico one-hot encoding a las variables categóricas
encoder = OneHotEncoder()
train_df_cat_encoded = encoder.fit_transform(train_df[categorical_features]).toarray()
val_df_cat_encoded = encoder.transform(val_df[categorical_features]).toarray()
test_df_cat_encoded = encoder.transform(test_df[categorical_features]).toarray()

# las añado transformadas al conjunto de datos original
train_df = pd.concat([train_df, pd.DataFrame(train_df_cat_encoded)], axis=1)
val_df = pd.concat([val_df, pd.DataFrame(val_df_cat_encoded)], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(test_df_cat_encoded)], axis=1)

# Elimino las columnas categoricas que ya no me sirven
train_df = train_df.drop(columns=categorical_features)
val_df = val_df.drop(columns=categorical_features)
test_df = test_df.drop(columns=categorical_features)

ValueError: A given column is not a column of the dataframe

In [None]:
# Cargo las imágenes descargadas
downloaded_images = np.load("downloaded_images.npy")

# Aquí hago la normalización y redimensionado
images = downloaded_images.astype("float32") / 255.
image_size = (128, 128)

# Guardo imágenes normalizadas
np.save("normalized_images.npy", images)


# Dividir en train/val/test
train_images, test_images = train_test_split(images, test_size=0.2, random_state=42)
train_images, val_images = train_test_split(train_images, test_size=0.25, random_state=42)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Guardar los archivos en Google Drive
!cp train_data.csv "/content/drive/My Drive/"
!cp val_data.csv "/content/drive/My Drive/"
!cp test_data.csv "/content/drive/My Drive/"
!cp normalized_images.npy "/content/drive/My Drive/"
!cp thumbnails/train/train_images.npy "/content/drive/My Drive/"
!cp thumbnails/validation/val_images.npy "/content/drive/My Drive/"
!cp thumbnails/test/test_images.npy "/content/drive/My Drive/"


In [None]:
!ls -lah images* filtered* #compruebo que se haya hecho bien

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Creo modelo paso a paso
tabular_model = Sequential([
    Dense(128, activation='relu', input_shape=(len(features),)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1)  # Una neurona para la regresión
])

tabular_model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')

# Entrenamiento del modelo
tabular_model.fit(train_df[features], train_df['Price'], epochs=10, batch_size=32, validation_data=(val_df[features], val_df['Price']))

In [None]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import GlobalAveragePooling2D

# Cargo un modelo base preentrenado
base_model = MobileNetV2(input_shape=(image_size[0], image_size[1], 3), include_top=False, weights='imagenet')

for layer in base_model.layers:
    layer.trainable = False

# Crear modelo
image_model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1)  # Una neurona para la regresión
])

# Compilar modelo
image_model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')

# Entrenamiento del modelo de imágenes
image_model.fit(train_images, epochs=10, validation_data=val_images)

#Grafo de la  pérdida
plt.plot(np.arange(0, n_epochs), loss_epoch_tr)
plt.plot(np.arange(0, n_epochs), loss_epoch_val)
plt.legend(['train', 'val'], loc='upper left')
plt.title('Training Loss')
plt.xlabel('Epoch #')
plt.ylabel('Loss')