In [2]:
import pandas as pd
import os

# Google

## Extracción de locales

In [20]:
# Load 'G_ulta_beauty.csv' into the `reviews` DataFrame.
# NOTE(review): `reviews` is not referenced again in the visible cells — confirm it is still needed.
reviews=pd.read_csv('../datasets/csv/G_ulta_beauty.csv')

### Lectura de datos

In [21]:
# Folder that contains the partitioned .parquet files
ruta_carpeta_parquet = '../datasets/parquets/metadatos Google'

# Walk the folder recursively and collect the full path of every .parquet file.
# The paths are sorted because os.walk lists directory entries in arbitrary
# order, which would make the row order of the combined frame vary between runs.
rutas_parquet = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(ruta_carpeta_parquet)
    for file in files
    if file.endswith('.parquet')
)

# Fail with an explicit message; with an empty list pd.concat would raise the
# much less helpful "No objects to concatenate".
if not rutas_parquet:
    raise FileNotFoundError(
        f"No .parquet files found under '{ruta_carpeta_parquet}'"
    )

# One DataFrame per partition file
dataframes = [pd.read_parquet(ruta) for ruta in rutas_parquet]

# Combine all partitions into a single DataFrame with a fresh 0..n-1 index
df_google = pd.concat(dataframes, ignore_index=True)

### Extracción

In [22]:
# Columns kept for the establishments table
columnas_locales = ["gmap_id", "url", "latitude", "longitude", "avg_rating", "State", "state", "address"]

# Keep only the rows whose 'name' is exactly 'Ulta Beauty', restrict them to
# the columns above and remove fully duplicated rows.
df_google = (
    df_google
    .loc[df_google['name'] == 'Ulta Beauty', columnas_locales]
    .drop_duplicates()
)

# Export the result (without the index column)
df_google.to_csv('../datasets/csv/G_ulta_establishments.csv', index=False)

# Scraping

In [None]:
#Instalamos extension para coneccion
pip install apify-client

### Importar API y datos

In [14]:
from apify_client import ApifyClient

# Read the Apify API token from the environment so the secret is never stored
# in the notebook.
# NOTE(review): the token that used to be hardcoded in this cell has been
# exposed in the file history and should be revoked and regenerated.
apify_token = os.environ.get('APIFY_TOKEN')
if not apify_token:
    raise RuntimeError(
        "Environment variable APIFY_TOKEN is not set; "
        "export your Apify API token before running this cell."
    )

apify_client = ApifyClient(apify_token)

# Handle to the dataset (referenced by id) that holds the actor's results
dataset = apify_client.dataset('IwxKoLSghgggI6jm1')

# Download the dataset items
dataset_items = dataset.list_items()

# Build a DataFrame from the downloaded items
data_API = pd.DataFrame(data=dataset_items.items)

In [7]:
# List the column labels of the DataFrame built from the API items
data_API.columns

Index(['url', 'searchString', 'rank', 'searchPageUrl', 'searchPageLoadedUrl',
       'isAdvertisement', 'title', 'subTitle', 'description', 'price',
       'categoryName', 'address', 'neighborhood', 'street', 'city',
       'postalCode', 'state', 'countryCode', 'website', 'phone',
       'phoneUnformatted', 'claimThisBusiness', 'location', 'locatedIn',
       'plusCode', 'menu', 'totalScore', 'permanentlyClosed',
       'temporarilyClosed', 'placeId', 'categories', 'cid', 'reviewsCount',
       'reviewsDistribution', 'imagesCount', 'imageCategories', 'scrapedAt',
       'reserveTableUrl', 'googleFoodUrl', 'hotelStars', 'hotelDescription',
       'checkInDate', 'checkOutDate', 'similarHotelsNearby',
       'hotelReviewSummary', 'hotelAds', 'popularTimesLiveText',
       'popularTimesLivePercent', 'popularTimesHistogram', 'openingHours',
       'peopleAlsoSearch', 'placesTags', 'reviewsTags', 'additionalInfo',
       'gasPrices', 'questionsAndAnswers', 'updatesFromCustomers',
       'web

### Tratar columnas

In [15]:
# Columns of interest once 'location' has been flattened
columnas_interes = ['url', 'searchString', 'state', 'lat', 'lng', 'reviews']

# Flatten the 'location' column into its own columns
ubicacion = pd.json_normalize(data_API['location'])

# Append the flattened columns, discard the original 'location' column, keep
# the columns of interest, turn every element of the 'reviews' lists into its
# own row and rebuild a clean 0..n-1 index.
data_API = (
    pd.concat([data_API, ubicacion], axis=1)
    .drop(columns='location')
    .loc[:, columnas_interes]
    .explode('reviews')
    .reset_index(drop=True)
)

# Expand each review (one per row after the explode) into separate columns
df_expanded = pd.json_normalize(data_API['reviews'])

# Put the place columns and the expanded review columns side by side, then
# remove the now redundant 'reviews' column
lado_a_lado = pd.concat([data_API, df_expanded], axis=1)
df_result = lado_a_lado.drop(columns=['reviews'])

### Renombrar columnas

In [16]:
# Columns that are kept in the final reviews table
columnas_finales = [
    'url', 'reviewerId', 'stars', 'text', 'publishedAtDate',
    'name', 'lat', 'lng', 'state',
]

# Old name -> new name for the columns that get renamed
nombres_nuevos = {
    'url': 'business_url',
    'reviewerId': 'user_id',
    'lat': 'latitude',
    'lng': 'longitude',
    'state': 'state_name',
}

# Select the columns and rename them in a single expression
df_result = df_result[columnas_finales].rename(columns=nombres_nuevos)

In [17]:
# Display the flattened reviews table.
# NOTE(review): the saved output below includes reviewer names; consider clearing or redacting it before sharing.
df_result

Unnamed: 0,business_url,user_id,stars,text,publishedAtDate,name,latitude,longitude,state_name
0,https://www.google.com/maps/place/Ulta+Beauty/...,,5.0,"Omar was amazing, he was very helpful today du...",2023-12-09T21:53:09.672Z,Lupita Valadez,34.215101,-119.068813,California
1,https://www.google.com/maps/place/Ulta+Beauty/...,,3.0,,2023-12-06T02:21:57.666Z,Michelle “Belle”,34.215101,-119.068813,California
2,https://www.google.com/maps/place/Ulta+Beauty/...,,5.0,,2023-12-04T07:07:10.151Z,lilian flores,34.215101,-119.068813,California
3,https://www.google.com/maps/place/Ulta+Beauty/...,,1.0,"I had intended to write a review, but during m...",2023-10-21T06:50:59.970Z,Karla Murillo,34.215101,-119.068813,California
4,https://www.google.com/maps/place/Ulta+Beauty/...,,5.0,,2023-10-04T02:37:56.115Z,Jacx Bernardo,34.215101,-119.068813,California
...,...,...,...,...,...,...,...,...,...
12371,https://www.google.com/maps/place/Ulta+Beauty/...,,5.0,I did not go in but I am sure it is just fine.,2022-02-14T15:28:45.716Z,Gwayne Stern,45.384248,-122.750672,Oregon
12372,https://www.google.com/maps/place/Ulta+Beauty/...,,5.0,"Ok, i literally never get my hair done but now...",2022-02-04T02:32:28.661Z,Allison Lloyd,45.384248,-122.750672,Oregon
12373,https://www.google.com/maps/place/Ulta+Beauty/...,,4.0,,2022-02-01T01:47:51.040Z,Denise Robb,45.384248,-122.750672,Oregon
12374,https://www.google.com/maps/place/Ulta+Beauty/...,,3.0,They could have definitely used more cashiers....,2022-01-05T03:14:44.894Z,Kristin Pdx,45.384248,-122.750672,Oregon


In [18]:
# Count the missing values in each column.
# NOTE(review): in the saved output `user_id` is missing in 12376 rows — verify whether that is every row, in which case the column carries no information.
df_result.isnull().sum()

business_url           0
user_id            12376
stars                 14
text                4239
publishedAtDate       14
name                  14
latitude               3
longitude              3
state_name             3
dtype: int64

# YELP

In [19]:
# Load 'G_ulta_establishments.csv' (the same path the Google extraction cell exports to)
establecimientos=pd.read_csv('../datasets/csv/G_ulta_establishments.csv')

In [22]:
# Show the `url` value of the row labelled 0
establecimientos.url[0]

'https://www.google.com/maps/place//data=!4m2!3m1!1s0x80e849691015d7b7:0x314b8627656bc6d5?authuser=-1&hl=en&gl=us'

In [23]:
# Load 'Y_ulta_beauty.csv' into `yelp_reviews` (presumably the Yelp reviews — confirm)
yelp_reviews=pd.read_csv('../datasets/csv/Y_ulta_beauty.csv')

In [26]:
# Show the second entry among the distinct `business_id` values
yelp_reviews.business_id.unique()[1]

'Vxqa8u_5RD5e7oBqdaU0yQ'