# relation_df Dataset

Creación de dataset relation_df que buscar relacionar los dataset de google maps y yelp mediante las columnas 'latitude' y 'longitude'

In [214]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [215]:
from pathlib import Path

# Definir la ruta relativa desde el directorio del script
relative_path_business = Path("datasets") / "business.parquet"
relative_path_pizza = Path("datasets") / "pizza_metadata.parquet"

# Cargar el archivo
business = pd.read_parquet(relative_path_business)
pizza_metadata = pd.read_parquet(relative_path_pizza)


In [216]:
business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13


In [217]:
pizza_metadata.head()

Unnamed: 0,name,address,gmap_id,latitude,longitude,avg_rating,num_of_reviews
0,Sir Pizza Xpress,"Sir Pizza Xpress, 201 E Broad St, Smithville, ...",0x8866cef69c5737a1:0xbdf838584dca0b9c,35.956944,-85.811111,4.1,18
1,Bucci's Greek & Italian Specialties,"Bucci's Greek & Italian Specialties, 8030 S Ho...",0x876c8410e0d7ec6d:0xb06492857fa9ec9d,39.572697,-104.922399,4.7,13
2,Dynamite Pizza & BBQ,"Dynamite Pizza & BBQ, 59069 Gratiot Ave, New H...",0x882518a534093375:0x4333d5f58f7af4c,42.735038,-82.78534,4.1,8
3,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",0x89e3169821e62d4d:0x14ff0683c1ebca0e,42.559072,-70.881542,3.9,48
4,Hot Stuff Pizza,"Hot Stuff Pizza, 1104 1st St E, Park Rapids, M...",0x52b7b8e4459a8697:0x81e50b7c06d172df,46.921303,-95.037448,3.3,7


Join mediante 'longitude' y 'latitude'

In [218]:
epsilon = 0.0001  # Margen de tolerancia en coordenadas

# Renombrar columnas para evitar conflictos en el merge
business = business.rename(columns={"latitude": "latitude_business", "longitude": "longitude_business"})
pizza_metadata = pizza_metadata.rename(columns={"latitude": "latitude_metadata", "longitude": "longitude_metadata"})

# Merge manteniendo solo los negocios en el dataset business
relation_df = business.merge(
    pizza_metadata,
    how="left"  # Asegura que solo se mantengan los datos de business
)

# Filtrar solo las coincidencias dentro del margen epsilon
relation_df = relation_df.query(
    "abs(latitude_business - latitude_metadata) <= @epsilon and abs(longitude_business - longitude_metadata) <= @epsilon"
)[["name", "business_id", "gmap_id", "latitude_business", "longitude_business"]]

# Renombrar las columnas finales
relation_df = relation_df.rename(columns={"latitude_business": "latitude", "longitude_business": "longitude"})

# Mostrar las primeras filas
relation_df.head()

Unnamed: 0,name,business_id,gmap_id,latitude,longitude


No hay coincidencias usando 'latitude' y 'longitude'

In [219]:
print(business[["latitude_business", "longitude_business"]].head(5))
print(pizza_metadata[["latitude_metadata", "longitude_metadata"]].head(5))

   latitude_business  longitude_business
0          34.426679         -119.711197
1          38.551126          -90.335695
2          32.223236         -110.880452
3          39.955505          -75.155564
4          40.338183          -75.471659
   latitude_metadata  longitude_metadata
0          35.956944          -85.811111
1          39.572697         -104.922399
2          42.735038          -82.785340
3          42.559072          -70.881542
4          46.921303          -95.037448


'address' de pizza_metadata necesita normalización

In [220]:
print(business['address'].head(5))
print(pizza_metadata['address'].head(5))

0             1616 Chapala St, Ste 2
1    87 Grasso Plaza Shopping Center
2               5255 E Broadway Blvd
3                        935 Race St
4                      101 Walnut St
Name: address, dtype: object
0    Sir Pizza Xpress, 201 E Broad St, Smithville, ...
1    Bucci's Greek & Italian Specialties, 8030 S Ho...
2    Dynamite Pizza & BBQ, 59069 Gratiot Ave, New H...
3    Three Star Pizza, 409 Cabot St #1, Beverly, MA...
4    Hot Stuff Pizza, 1104 1st St E, Park Rapids, M...
Name: address, dtype: object


In [221]:
# Buscar valores NaN en 'address'
pizza_metadata['address'].isna().sum()

27

In [222]:
print(pizza_metadata[pizza_metadata['address'].isna() == True]['name'].value_counts())

name
Galaxy Zone                                             1
Che Buono Restaurant and Caterer                        1
Archepoint™                                             1
U.B.'s Pizza and Mediterranean Pies                     1
Brothers Pizza                                          1
The WoodShed Mobile Wood-Fired Pizza                    1
Pizza sandrino                                          1
Chrislees Submasters bar and grill                      1
Slice of Heaven                                         1
Old Town Pizza                                          1
The Country Vineyard Italian Restaurant and Pizzeria    1
Giovanni's Pizza Parma                                  1
American Pizza                                          1
Foothill Pizza                                          1
New York Style Pizza & Gyros                            1
Best Pizza Delivery Near Arlington, VA                  1
Simple Simon's Pizza - Harrison, AR                     1
Supremo's

In [223]:
pizza_metadata = pizza_metadata.dropna(subset=['address'])
pizza_metadata['address'].isna().sum()

0

In [224]:
# Normalizar columna 'address' y se cambia nombre a 'old_address'
# Se extraer solo la parte de la dirección en pizza_metadata, eliminando el nombre del negocio
pizza_metadata["old_address"] = pizza_metadata["address"].str.split(",", n=1).str[1].str.strip()

print(pizza_metadata['old_address'].head(5))

0      201 E Broad St, Smithville, TN 37166
1     8030 S Holly St, Centennial, CO 80122
2    59069 Gratiot Ave, New Haven, MI 48048
3        409 Cabot St #1, Beverly, MA 01915
4      1104 1st St E, Park Rapids, MN 56470
Name: old_address, dtype: object


Nota: 'address' de pizza_metadata contienen otra información, tal como la ciudad y el estado

El siguiente código separa los componentes de pizza_metadata['address'] en distintas columnas según corresponda

In [225]:
# Dividir la dirección en partes
split_address = pizza_metadata["old_address"].str.rsplit(",", n=2, expand=True)  # Divide desde el final

# Asignar cada parte a nuevas columnas
pizza_metadata["address"] = split_address[0].str.strip()
pizza_metadata["city"] = split_address[1].str.strip()

# Separar state y postal_code correctamente
state_zip_split = split_address[2].str.strip().str.split(" ", n=1, expand=True)
pizza_metadata["state"] = state_zip_split[0].str.strip()  # Estado (Ej: "GA")
pizza_metadata["postal_code"] = state_zip_split[1].str.strip()  # Código Postal (Ej: "31522")

# Mostrar los primeros registros para verificar
print(pizza_metadata[["old_address", "address", "city", "state", "postal_code"]].head(10))


                                  old_address             address  \
0        201 E Broad St, Smithville, TN 37166      201 E Broad St   
1       8030 S Holly St, Centennial, CO 80122     8030 S Holly St   
2      59069 Gratiot Ave, New Haven, MI 48048   59069 Gratiot Ave   
3          409 Cabot St #1, Beverly, MA 01915     409 Cabot St #1   
4        1104 1st St E, Park Rapids, MN 56470       1104 1st St E   
5      2725 Manitowoc Rd, Green Bay, WI 54311   2725 Manitowoc Rd   
6       10637 N Tatum Blvd, Phoenix, AZ 85028  10637 N Tatum Blvd   
7      790 Washington Ave, Brooklyn, NY 11238  790 Washington Ave   
8         3909 Creek Rd, Youngstown, NY 14174       3909 Creek Rd   
9  511 Ocean Blvd, St Simons Island, GA 31522      511 Ocean Blvd   

               city state postal_code  
0        Smithville    TN       37166  
1        Centennial    CO       80122  
2         New Haven    MI       48048  
3           Beverly    MA       01915  
4       Park Rapids    MN       56470  


In [226]:
# Revisar cuantos valores NaN hay en las columnas anteriores
for column in ["old_address", "address", "city", "state", "postal_code"]:
    
    print(f"{column}:", pizza_metadata[column].isna().sum())

old_address: 0
address: 0
city: 1
state: 102
postal_code: 102


In [227]:
pizza_metadata[pizza_metadata['city'].isna() == True]

Unnamed: 0,name,address,gmap_id,latitude_metadata,longitude_metadata,avg_rating,num_of_reviews,old_address,city,state,postal_code
16593,Crossroads Pizza,North Carolina 28516,0x89a89095fc78b2b1:0x8c7433d35e612e7c,34.769659,-76.551205,4.4,108,North Carolina 28516,,,


In [228]:
pizza_metadata = pizza_metadata.dropna(subset=['city'])
pizza_metadata = pizza_metadata.dropna(subset=['state'])

In [229]:
pizza_metadata.shape

(23592, 11)

In [230]:
# # Dividir la dirección en partes
# split_address = pizza_metadata["address"].str.split(",", expand=True)

# # Asignar cada parte a nuevas columnas
# pizza_metadata["address"] = split_address[0].str.strip()
# pizza_metadata["city"] = split_address[1].str.strip()
# pizza_metadata["state"] = split_address[2].str.split(" ", expand=True)[0].str.strip()
# pizza_metadata["postal_code"] = split_address[2].str.split(" ", expand=True)[1].str.strip()

# # Mostrar los primeros registros para verificar
# print(pizza_metadata[["address", "city", "state", "postal_code"]].head(10))


El siguiente código trata de unir ambos datasets mediante la columna 'address'

In [231]:
# # Unir los datasets por dirección
# relation_df = business.merge(
#     pizza_metadata,
#     left_on="address",
#     right_on="clean_address",
#     how="inner"
# )

# # Seleccionar columnas relevantes
# relation_df = relation_df[["name", "business_id", "gmap_id", "latitude", "longitude"]]

# # Mostrar las primeras filas
# relation_df.head()