In [1]:
import os
import json
from google.colab import files # solo para google collab
from google.cloud import storage
import pandas as pd
print(storage.__version__)

# NLP
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

2.8.0


In [2]:
# Google Colab only, first run: upload the service-account JSON key file.
#object_uploaded = files.upload()

# Google Colab: location of the uploaded key file.
path_api_key = '/content/extreme-unison-399121-cadd77c555ca.json'

# Local environment: use this path instead.
#path_api_key = 'gcloud_api_key/extreme-unison-399121-cadd77c555ca.json'

# Read the service-account credentials. The context manager closes the file
# handle as soon as the JSON is parsed instead of leaving it open.
with open(path_api_key, encoding='utf-8') as key_file:
    service_account_info = json.load(key_file)

# Authorise the Cloud Storage client with those credentials.
client_storage = storage.Client.from_service_account_info(service_account_info)

In [3]:
# Works both in Google Colab and on a local machine.

# Source location in Cloud Storage: change per bucket / per dataframe.
bucket_name = '1_transform'
blob_name = 'business_reviews_norm.parquet'

bucket = client_storage.get_bucket(bucket_name)
data_blob = bucket.blob(blob_name)

# Read the parquet blob through a file-like reader into a DataFrame.
blob_reader = storage.fileio.BlobReader(data_blob)
data_business_reviews = pd.read_parquet(blob_reader)

# Print the schema summary, then display the first two rows.
data_business_reviews.info()
data_business_reviews.head(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1531095 entries, 0 to 415753
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   business_id         1531095 non-null  object 
 1   user_id             1531095 non-null  object 
 2   rating              1531095 non-null  float64
 3   user_text           1101628 non-null  object 
 4   user_time_year      1531095 non-null  int32  
 5   user_time_month     1531095 non-null  int32  
 6   user_time_day       1531095 non-null  int32  
 7   user_time_hms       1531095 non-null  object 
 8   state_name          1531095 non-null  object 
 9   state_code          1531095 non-null  object 
 10  codigo_postal_zcta  1531095 non-null  int64  
dtypes: float64(1), int32(3), int64(1), object(6)
memory usage: 122.7+ MB


Unnamed: 0,business_id,user_id,rating,user_text,user_time_year,user_time_month,user_time_day,user_time_hms,state_name,state_code,codigo_postal_zcta
0,0x88892bb449d44cb3:0x7beaf0b2e3171a49,110771718750038252970,2.0,"No masks, requested curbside and waited 10 min...",2021,8,27,02:29:34.725000,alabama,AL,35115
1,0x88892bb449d44cb3:0x7beaf0b2e3171a49,104877410053821600291,5.0,This location does an awesome job! The delive...,2020,10,10,02:42:56.261000,alabama,AL,35115


1. columna rating (1 a 5) >> promedio año-mes de rating por restaurante >> reescalar de 0 a 1 >> multiplicar por 0.4 = kpi_satisfaccion_rating por restaurante y año-mes (nueva columna).

In [16]:
# 1. Mean rating per business and year-month.
rating_group_keys = ["business_id", "user_time_year", "user_time_month"]
monthly_mean_rating = (
    data_business_reviews
    .groupby(rating_group_keys)['rating']
    .mean()
    .rename('average_rating')
    .reset_index()
)

# 2. Rescale the mean rating to [0, 1] using the theoretical bounds.
# 3. Weight the rescaled value by 0.4 to obtain the rating KPI.
min_rating = 1  # absolute/theoretical minimum
max_rating = 5  # absolute/theoretical maximum
data_business_reviews_average_rating = monthly_mean_rating.assign(
    rescaled_rating=lambda frame: (
        frame['average_rating'] - min_rating
    ) / (max_rating - min_rating),
    kpi_satisfaccion_rating=lambda frame: frame['rescaled_rating'] * 0.4,
)

# The resulting frame holds one row per business and year-month, including
# 'kpi_satisfaccion_rating'.
data_business_reviews_average_rating.info()
data_business_reviews_average_rating.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526578 entries, 0 to 526577
Data columns (total 6 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   business_id              526578 non-null  object 
 1   user_time_year           526578 non-null  int64  
 2   user_time_month          526578 non-null  int64  
 3   average_rating           526578 non-null  float64
 4   rescaled_rating          526578 non-null  float64
 5   kpi_satisfaccion_rating  526578 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 24.1+ MB


Unnamed: 0,business_id,user_time_year,user_time_month,average_rating,rescaled_rating,kpi_satisfaccion_rating
0,---kPU91CF4Lq2-WlRu9Lw,2020,8,4.0,0.75,0.3
1,---kPU91CF4Lq2-WlRu9Lw,2020,9,5.0,1.0,0.4
2,---kPU91CF4Lq2-WlRu9Lw,2020,10,5.0,1.0,0.4
3,---kPU91CF4Lq2-WlRu9Lw,2020,12,4.5,0.875,0.35
4,---kPU91CF4Lq2-WlRu9Lw,2021,10,5.0,1.0,0.4


2. columna user_text (review) >> compound_score de VADER (de -1 a 1) >> promedio año-mes del compound_score por restaurante >> reescalar de 0 a 1 >> multiplicar por 0.6 = kpi_satisfaccion_sentimiento por restaurante y año-mes (nueva columna).

In [25]:
# Fetch the VADER lexicon required by NLTK's sentiment analyzer.
vader_resource = 'vader_lexicon'
nltk.download(vader_resource)

# Single VADER analyzer instance used by the scoring function.
sia = SentimentIntensityAnalyzer()

# Score a single review text with VADER.
def classify_sentiment_vader(text):
    """Return the VADER compound sentiment score for ``text``.

    Parameters
    ----------
    text : object
        A review text. Anything that is not a ``str`` (None, NaN, numbers,
        list-like values, ...) is treated as "no text".

    Returns
    -------
    float or None
        The ``'compound'`` entry of ``sia.polarity_scores(text)`` when
        ``text`` is a string, otherwise ``None``.
    """
    # The type check alone is sufficient: a str is never NA. Testing the type
    # first also avoids calling pd.notna on list-like input, which returns an
    # array whose truth value is ambiguous and raises ValueError.
    if not isinstance(text, str):
        return None
    return sia.polarity_scores(text)['compound']

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
# Score every review text and store the result as a new 'compound_score' column.
review_texts = data_business_reviews['user_text']
data_business_reviews['compound_score'] = review_texts.apply(classify_sentiment_vader)

In [21]:
# Mean compound score per business and year-month.
compound_group_keys = ["business_id", "user_time_year", "user_time_month"]
monthly_mean_compound = (
    data_business_reviews
    .groupby(compound_group_keys)['compound_score']
    .mean()
    .rename('average_compound')
    .reset_index()
)

# Rescale to [0, 1] using the theoretical bounds, then weight by 0.6 to
# obtain the sentiment KPI.
min_compound = -1  # absolute/theoretical minimum
max_compound = 1  # absolute/theoretical maximum
data_business_reviews_avg_compound = monthly_mean_compound.assign(
    rescaled_average_compound=lambda frame: (
        frame['average_compound'] - min_compound
    ) / (max_compound - min_compound),
    kpi_satisfaccion_sentimiento=lambda frame: frame['rescaled_average_compound'] * 0.6,
)

# One row per business and year-month with its sentiment KPI.
data_business_reviews_avg_compound.info()
data_business_reviews_avg_compound.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526578 entries, 0 to 526577
Data columns (total 6 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   business_id                   526578 non-null  object 
 1   user_time_year                526578 non-null  int64  
 2   user_time_month               526578 non-null  int64  
 3   average_compound              457123 non-null  float64
 4   rescaled_average_compound     457123 non-null  float64
 5   kpi_satisfaccion_sentimiento  457123 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 24.1+ MB


Unnamed: 0,business_id,user_time_year,user_time_month,average_compound,rescaled_average_compound,kpi_satisfaccion_sentimiento
0,---kPU91CF4Lq2-WlRu9Lw,2020,8,0.7351,0.86755,0.52053
1,---kPU91CF4Lq2-WlRu9Lw,2020,9,0.86795,0.933975,0.560385


Endpoint: kpi_satisfaccion_suma = kpi_satisfaccion_rating + kpi_satisfaccion_sentimiento por restaurante y año-mes

DF final: business_kpi_satisfaccion.parquet
columnas: business_id, year, month, kpi_satisfaccion_suma, kpi_satisfaccion_rating, kpi_satisfaccion_sentimiento

In [27]:
# Join the rating KPI with the sentiment KPI on the full
# (business, year, month) key. The keys are spelled out so the join cannot
# silently change if either frame gains another shared column, and
# validate='one_to_one' makes pandas raise if a key is duplicated on either side.
kpi_merge_keys = ["business_id", "user_time_year", "user_time_month"]
data_business_reviews_kpi_satisfaccion = pd.merge(
    data_business_reviews_average_rating,
    data_business_reviews_avg_compound,
    on=kpi_merge_keys,
    how='left',
    validate='one_to_one').drop(columns=[
        "average_compound","rescaled_average_compound",
        "average_rating","rescaled_rating"])

# Combined satisfaction KPI per business and year-month: rating KPI plus
# sentiment KPI. Rows whose sentiment KPI is missing get a missing (NaN) sum,
# because NaN propagates through the addition.
data_business_reviews_kpi_satisfaccion['kpi_satisfaccion_suma'] = data_business_reviews_kpi_satisfaccion[
    'kpi_satisfaccion_rating'] + data_business_reviews_kpi_satisfaccion['kpi_satisfaccion_sentimiento']
data_business_reviews_kpi_satisfaccion.info()
data_business_reviews_kpi_satisfaccion.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 526578 entries, 0 to 526577
Data columns (total 6 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   business_id                   526578 non-null  object 
 1   user_time_year                526578 non-null  int64  
 2   user_time_month               526578 non-null  int64  
 3   kpi_satisfaccion_rating       526578 non-null  float64
 4   kpi_satisfaccion_sentimiento  457123 non-null  float64
 5   kpi_satisfaccion_suma         457123 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 28.1+ MB


Unnamed: 0,business_id,user_time_year,user_time_month,kpi_satisfaccion_rating,kpi_satisfaccion_sentimiento,kpi_satisfaccion_suma
0,---kPU91CF4Lq2-WlRu9Lw,2020,8,0.3,0.52053,0.82053
1,---kPU91CF4Lq2-WlRu9Lw,2020,9,0.4,0.560385,0.960385
2,---kPU91CF4Lq2-WlRu9Lw,2020,10,0.4,0.56838,0.96838
3,---kPU91CF4Lq2-WlRu9Lw,2020,12,0.35,0.583965,0.933965
4,---kPU91CF4Lq2-WlRu9Lw,2021,10,0.4,0.58356,0.98356


In [28]:
# Save the final DataFrame as a parquet file (currently disabled).
#data_business_reviews_kpi_satisfaccion.to_parquet('business_kpi_satisfaccion.parquet',index=False)