Ejecutado desde VERTEX AI Notebooks Colab Enterprise

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import tempfile
from google.cloud import storage

### Google Metadata

Paths de entrada y salida

In [2]:
bucket_name = "json-bucket-datos"
input_folder_name = "Google/metadata-sitios-parquet"
output_folder_name = "Google/ETL"
output_file_name = "metadata.parquet"

# Crear cliente de Google Cloud Storage
storage_client = storage.Client()

bucket = storage_client.get_bucket(bucket_name)

# Crear directorio temporal para trabajar con los archivos
temp_dir = tempfile.mkdtemp()

# Crear referencia al archivo de salida de Parquet
output_blob_name = os.path.join(output_folder_name, output_file_name)
output_blob = bucket.blob(output_blob_name)

# Vamos a guardar los datos transformados en un nuevo archivo de Parquet
output_file_path = os.path.join(temp_dir, output_file_name)

Leer los parquets a un dataframe

In [3]:
# Listar archivos en carpeta origen
blobs = bucket.list_blobs(prefix=input_folder_name)

# Lista para data
parquet_data = []

for blob in blobs:
    if blob.name.endswith(".parquet"):
        # Bajar archivo parquet
        temp_dir = tempfile.mkdtemp()
        local_file_path = os.path.join(temp_dir, os.path.basename(blob.name))
        blob.download_to_filename(local_file_path)

        # Leer la tabla y agregarla a la lista
        parquet_table = pq.read_table(local_file_path)
        parquet_data.append(parquet_table.to_pandas())

        # Limpiar directorio temporal
        os.remove(local_file_path)
        os.rmdir(temp_dir)

# Armar dataframe
df_metadata = pd.concat(parquet_data)

In [4]:
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3025011 entries, 0 to 275000
Data columns (total 15 columns):
 #   Column            Dtype  
---  ------            -----  
 0   name              object 
 1   address           object 
 2   gmap_id           object 
 3   description       object 
 4   latitude          float64
 5   longitude         float64
 6   category          object 
 7   avg_rating        float64
 8   num_of_reviews    int64  
 9   price             object 
 10  hours             object 
 11  MISC              object 
 12  state             object 
 13  relative_results  object 
 14  url               object 
dtypes: float64(3), int64(1), object(11)
memory usage: 369.3+ MB


Liberar referencia

In [5]:
del parquet_data

Eliminar columnas

In [6]:
eliminar_metadata = ['MISC','price', 'state', 'hours', 'relative_results', 'url', 'description']

df_metadata.drop(columns=eliminar_metadata, inplace=True)


Eliminar duplicados

In [9]:
df_metadata = df_metadata[~df_metadata.duplicated(subset='gmap_id', keep=False)]

In [10]:
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2971855 entries, 53146 to 275000
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   name            object 
 1   address         object 
 2   gmap_id         object 
 3   latitude        float64
 4   longitude       float64
 5   category        object 
 6   avg_rating      float64
 7   num_of_reviews  int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 204.1+ MB


Aislar Holiday Inn

In [11]:
hotel = ['Holiday Inn','holiday inn','Holiday inn', 'HolidayInn']

df_holiday = df_metadata[df_metadata['name'].str.contains('|'.join(hotel)) | df_metadata['address'].str.contains('|'.join(hotel))]

In [12]:
df_holiday.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 130 entries, 71118 to 208314
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            130 non-null    object 
 1   address         130 non-null    object 
 2   gmap_id         130 non-null    object 
 3   latitude        130 non-null    float64
 4   longitude       130 non-null    float64
 5   category        129 non-null    object 
 6   avg_rating      130 non-null    float64
 7   num_of_reviews  130 non-null    int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 9.1+ KB


Aislar Competencia

In [13]:
hotels_words = ['best western', 'Hyatt', 'Marriott', 'hilton', 'Wyndham', 'Belmondo', 'accor', 'Radisson', 'Meliá']
check_category = ['lodging', 'hotel', 'motel', 'travel agency', 'hostels', 'resort']

regex_pattern = '|'.join(hotels_words)
regex_category = '|'.join(check_category)

def contains_category(categories, check_categories):
    if categories is None:
        return False
    for category in categories:
        if isinstance(category, str) and any(category.lower() in check.lower() for check in check_categories):
            return True
    return False

all_hotels_words = df_metadata[df_metadata['name'].str.contains(regex_pattern, case=False, na=False)]
df_competidores = all_hotels_words[all_hotels_words['category'].apply(contains_category, check_categories=check_category)]

In [14]:
df_competidores.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 330 entries, 63928 to 272746
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            330 non-null    object 
 1   address         330 non-null    object 
 2   gmap_id         330 non-null    object 
 3   latitude        330 non-null    float64
 4   longitude       330 non-null    float64
 5   category        330 non-null    object 
 6   avg_rating      330 non-null    float64
 7   num_of_reviews  330 non-null    int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 23.2+ KB


Unir

In [21]:
df_metadata = pd.concat([df_holiday, df_competidores], axis=0)

Obtener Gmap Ids para pasos posteriores

In [22]:
gmap_ids = df_metadata['gmap_id'].tolist()

Listar ids Holiday Inn

In [23]:
holiday_business_ids = df_holiday['gmap_id'].tolist()

In [24]:
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 460 entries, 71118 to 272746
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            460 non-null    object 
 1   address         460 non-null    object 
 2   gmap_id         460 non-null    object 
 3   latitude        460 non-null    float64
 4   longitude       460 non-null    float64
 5   category        459 non-null    object 
 6   avg_rating      460 non-null    float64
 7   num_of_reviews  460 non-null    int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 32.3+ KB


Agregar columnas en base a address

In [25]:
def extract_city_state_zip(dataframe, address_column_name):
    # Hacer una copia del DataFrame original para conservar todas las columnas
    result = dataframe.copy()

    # Dividir la columna de direcciones en varias columnas usando la coma como separador
    result[['location', 'state_zip']] = result[address_column_name].str.split(', ', 1, expand=True)

    # Dividir la columna "state_zip" en las columnas "city" y "state_zip" usando la coma como separador
    result[['city', 'state_zip']] = result['state_zip'].str.rsplit(', ', 1, expand=True)

    # Separar la columna "state_zip" en las columnas "state" y "postal_code" usando el espacio como separador
    result[['state', 'postal_code']] = result['state_zip'].str.split(' ', 1, expand=True)

    # Eliminar las columnas intermedias si es necesario
    result.drop(['location', 'state_zip'], axis=1, inplace=True)

    # Conservar solo el nombre de la ciudad después de la última coma
    result['city'] = result['city'].str.split(',').str[-1].str.strip()

    return result

# Llamar a la función para extraer la información
df_metadata = extract_city_state_zip(df_metadata, 'address')



  result[['location', 'state_zip']] = result[address_column_name].str.split(', ', 1, expand=True)
  result[['city', 'state_zip']] = result['state_zip'].str.rsplit(', ', 1, expand=True)
  result[['state', 'postal_code']] = result['state_zip'].str.split(' ', 1, expand=True)


In [26]:
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 460 entries, 71118 to 272746
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            460 non-null    object 
 1   address         460 non-null    object 
 2   gmap_id         460 non-null    object 
 3   latitude        460 non-null    float64
 4   longitude       460 non-null    float64
 5   category        459 non-null    object 
 6   avg_rating      460 non-null    float64
 7   num_of_reviews  460 non-null    int64  
 8   city            460 non-null    object 
 9   state           460 non-null    object 
 10  postal_code     460 non-null    object 
dtypes: float64(3), int64(1), object(7)
memory usage: 43.1+ KB


In [27]:
df_metadata.reset_index(inplace=True)

In [28]:
# Modificar las columnas para los índices especificados
indices_modificar = {
    97: {'city': 'Medford', 'state': 'OR', 'postal_code': '97501'},
    105: {'city': 'Homewood', 'state': 'AL', 'postal_code': '35209'},
    117: {'city': 'Denver', 'state': 'CO', 'postal_code': '80249'},
    156: {'city': 'Waimea', 'state': 'Hawaii', 'postal_code': '96743'},
    367: {'city': 'Mansfield Center', 'state': 'CT', 'postal_code': '06250'},
    374: {'city': 'Bloomington', 'state': 'MN', 'postal_code': '55425'},
    377: {'city': 'Terrell', 'state': 'TX', 'postal_code': '75160'},
    386: {'city': 'San Antonio', 'state': 'TX', 'postal_code': '78232'},
    387: {'city': 'Arlington', 'state': 'VA', 'postal_code': '22202'},
    388: {'city': 'Los Angeles', 'state': 'CA', 'postal_code': '90045'},
    403: {'city': 'Claremore', 'state': 'OK', 'postal_code': '74017'},
    405: {'city': 'West Sacramento', 'state': 'CA', 'postal_code': '95605'},
    411: {'city': 'Colorado Springs', 'state': 'CO', 'postal_code': '80904'},
    412: {'city': 'San Diego', 'state': 'CA', 'postal_code': '92129'},
    423: {'city': 'Jacksonville', 'state': 'FL', 'postal_code': '32256'},
    425: {'city': 'Tulsa', 'state': 'OK', 'postal_code': '74145'},
    426: {'city': 'East Lansing', 'state': 'MI', 'postal_code': '48823'}
}

for indice, modificaciones in indices_modificar.items():
    df_metadata.at[indice, 'city'] = modificaciones['city']
    df_metadata.at[indice, 'state'] = modificaciones['state']
    df_metadata.at[indice, 'postal_code'] = modificaciones['postal_code']

In [30]:
df_metadata.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           460 non-null    int64  
 1   name            460 non-null    object 
 2   address         460 non-null    object 
 3   gmap_id         460 non-null    object 
 4   latitude        460 non-null    float64
 5   longitude       460 non-null    float64
 6   category        459 non-null    object 
 7   avg_rating      460 non-null    float64
 8   num_of_reviews  460 non-null    int64  
 9   city            460 non-null    object 
 10  state           460 non-null    object 
 11  postal_code     460 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 43.2+ KB


Creamos campo is_holiday_inn para identificar los hoteles Holiday Inn

In [31]:
df_metadata['is_holiday_inn'] = df_metadata['gmap_id'].isin(holiday_business_ids).astype(int)

In [37]:
df_metadata.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           460 non-null    int64  
 1   name            460 non-null    object 
 2   address         460 non-null    object 
 3   gmap_id         460 non-null    object 
 4   latitude        460 non-null    float64
 5   longitude       460 non-null    float64
 6   category        459 non-null    object 
 7   avg_rating      460 non-null    float64
 8   num_of_reviews  460 non-null    int64  
 9   city            460 non-null    object 
 10  state           460 non-null    object 
 11  postal_code     460 non-null    object 
 12  is_holiday_inn  460 non-null    int64  
dtypes: float64(3), int64(3), object(7)
memory usage: 46.8+ KB


In [40]:
new_column_order = [
    'gmap_id',
    'name',
    'address',
    'city',
    'state',
    'postal_code',
    'latitude',
    'longitude',
    'category',
    'avg_rating',
    'is_holiday_inn'
    ]

df_metadata = df_metadata[new_column_order]


In [42]:
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gmap_id         460 non-null    object 
 1   name            460 non-null    object 
 2   address         460 non-null    object 
 3   city            460 non-null    object 
 4   state           460 non-null    object 
 5   postal_code     460 non-null    object 
 6   latitude        460 non-null    float64
 7   longitude       460 non-null    float64
 8   category        459 non-null    object 
 9   avg_rating      460 non-null    float64
 10  is_holiday_inn  460 non-null    int64  
dtypes: float64(3), int64(1), object(7)
memory usage: 39.7+ KB


In [43]:
df_metadata.reset_index(drop=True, inplace=True)


In [44]:
# Guardar el DataFrame como archivo Parquet
table = pa.Table.from_pandas(df_metadata)
pq.write_table(table, output_file_path)

# Subir el archivo resultante Parquet a la carpeta
output_blob.upload_from_filename(output_file_path)

# Limpieza del directorio temporal
os.remove(output_file_path)
os.rmdir(temp_dir)

FileNotFoundError: ignored

In [45]:
del df_metadata