### Librerias

In [2]:
import string
import pandas as pd
import numpy as np
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib as mtp
import nltk
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [5]:
# Lee el conjunto de datos
df_review = pd.read_parquet('../Facundo/reviews_unified.parquet.gz')
df_geo = pd.read_csv('../Facundo/geo.csv')

# Se copia la info para evitar errores
df_uno = df_review.copy()
df_dos = df_geo.copy()


In [100]:

# Fusiona los conjuntos de datos basado en gmap_id
merged_df = df_uno.merge(df_dos[['gmap_id','business_name','latitude','longitude']], on='gmap_id', how='outer')
merged_df.columns

Index(['gmap_id', 'user_id', 'time', 'rating', 'text', 'business_reply',
       'source', 'business_name', 'latitude', 'longitude'],
      dtype='object')

### Funciones

<h4>obtener data<h4>

In [8]:
def obtener_data():
    """
    Combina dos conjuntos de datos en función de una columna común y selecciona columnas específicas.

    Args:
        df_review_file (str): Ruta al archivo del conjunto de datos de revisión.
        df_business_file (str): Ruta al archivo del conjunto de datos comerciales.

    Returns:
        pd.DataFrame: El conjunto de datos fusionado con las columnas seleccionadas.
    """

    # Lee el conjunto de datos
    df_review = pd.read_parquet('../Facundo/reviews_unified.parquet.gz')
    df_business = pd.read_csv('../Facundo/geo.csv')

    # Se copia la info para evitar errores
    df_uno = df_review.copy()
    df_dos = df_business.copy()

    # Fusiona los conjuntos de datos basado en gmap_id
    merged_df = df_uno.merge(df_dos[['gmap_id','business_name']], on='gmap_id', how='outer')

    # Filtrado de columnas
    columnas_seleccionadas = ['gmap_id', 'rating', 'text', 'business_name']
    merged_df = merged_df[columnas_seleccionadas]  
    
    # Reemplazar los valores nulos por "SD" en el DataFrame df_review
    merged_df['text'] = merged_df['text'].replace('None', 'SD')
    return merged_df

prueba_1 = obtener_data()
prueba_1

Unnamed: 0,gmap_id,rating,text,business_name
0,0x887d579ef372c2f7:0x1f347e0e964cd5a4,5.0,My sweet lady went in to get us a six pack of ...,Dollar General
1,0x887d579ef372c2f7:0x1f347e0e964cd5a4,5.0,Very friendly and organized. Most organized I ...,Dollar General
2,0x887d579ef372c2f7:0x1f347e0e964cd5a4,3.0,They are out of milk just about every time I'm...,Dollar General
3,0x887d579ef372c2f7:0x1f347e0e964cd5a4,4.0,"If you don't have a mask, they have them at th...",Dollar General
4,0x887d579ef372c2f7:0x1f347e0e964cd5a4,5.0,Brand new. Very spacious. Very well designed b...,Dollar General
...,...,...,...,...
695174,0x88557d23b4194627:0x486caa7e23bbf4e2,,,HOT SPOT
695175,0x8644d007039a2617:0x4c81868f19f9023,,,H-E-B Convenience Store
695176,0x89002c8ef436b093:0xa4dec6b69c91d61d,,,Speedway
695177,0x87f6268d6493baa5:0xc5110d4f9e9dc644,,,Penn Super USA


<h4>asignar columna "sentiment_analysis"<h4>

In [10]:
def asignar_sentiment_analysis():
    
    # Obtiene la data
    df_review = obtener_data()
    
    # Asigna los valores a la nueva columna sentiment_analysis
    def asignar_valor(stars):
        if stars >= 4:
            return 1
        elif stars == 3:
            return 0
        else:
            return -1

    # Aplicar la función a la columna 'stars' para crear la nueva columna 'sentiment_analysis'
    df_review['sentiment_analysis'] = df_review['rating'].apply(lambda x: asignar_valor(x))
    
    return df_review

prueba_2 = asignar_sentiment_analysis()
prueba_2

Unnamed: 0,gmap_id,rating,text,business_name,sentiment_analysis
0,0x887d579ef372c2f7:0x1f347e0e964cd5a4,5.0,My sweet lady went in to get us a six pack of ...,Dollar General,1
1,0x887d579ef372c2f7:0x1f347e0e964cd5a4,5.0,Very friendly and organized. Most organized I ...,Dollar General,1
2,0x887d579ef372c2f7:0x1f347e0e964cd5a4,3.0,They are out of milk just about every time I'm...,Dollar General,0
3,0x887d579ef372c2f7:0x1f347e0e964cd5a4,4.0,"If you don't have a mask, they have them at th...",Dollar General,1
4,0x887d579ef372c2f7:0x1f347e0e964cd5a4,5.0,Brand new. Very spacious. Very well designed b...,Dollar General,1
...,...,...,...,...,...
695174,0x88557d23b4194627:0x486caa7e23bbf4e2,,,HOT SPOT,-1
695175,0x8644d007039a2617:0x4c81868f19f9023,,,H-E-B Convenience Store,-1
695176,0x89002c8ef436b093:0xa4dec6b69c91d61d,,,Speedway,-1
695177,0x87f6268d6493baa5:0xc5110d4f9e9dc644,,,Penn Super USA,-1


In [12]:
prueba_2.to_parquet('df_filtrado_polaridad.parquet')

<h4>Filtro de business(nombre del negocio) y sentiment_analysis ( positivo (1), neutral (0), negativo (-1) )<h4>

In [4]:
def filtro_business_sentiment(gmap_id,sentiment_value):

    # Se obtiene la data
    df_review = asignar_sentiment_analysis()

    # Se filtra la data por el valor del negocio y la polaridad de la reseña
    df_filtrado_polaridad = df_review[(df_review['gmap_id'] == gmap_id) & (df_review['sentiment_analysis'] == sentiment_value)]
    
    return df_filtrado_polaridad


TypeError: filtro_business_sentiment() missing 2 required positional arguments: 'gmap_id' and 'sentiment_value'

<h4>Aplicacion del modelo de ML <h4>

In [61]:
def model_ML_review(gmap_id,sentiment_value):

    # Trae la data filtrada
    df_modelo_ml = filtro_business_sentiment(gmap_id,sentiment_value)

    # Elimina palabras irrelevantes y aplica stemming
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.WordNetLemmatizer()

    def preprocess(text):
        tokens = nltk.word_tokenize(text.lower())
        filtered_tokens = [t for t in tokens if t not in stopwords and t.isalpha()]
        stemmed_tokens = [stemmer.lemmatize(t) for t in filtered_tokens]
        return stemmed_tokens


    # Aplica la función de preprocesamiento a tus datos
    processed_reviews = [preprocess(review) for review in df_modelo_ml['text']]

    #---------------------------------------------------------------------------------------------------------------------------
            # Aplicacion del modelo de ML parte 2 (Diccionario, Matriz, LDA)


    # Crea un diccionario y una matriz de términos-frecuencia
    dictionary = corpora.Dictionary(processed_reviews)
    corpus = [dictionary.doc2bow(review) for review in processed_reviews]
    # Aplica LDA a tus datos
    lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

    #------------------------------------------------------------------------------------------------------------------------
        
    # Get the document topics for each review
    doc_topics = [lda_model.get_document_topics(review) for review in corpus]

    # Group the reviews by topic ID
    grouped_docs = {}
    for doc_topic in doc_topics:
        for topic, prob in doc_topic:
            if topic not in grouped_docs:
                grouped_docs[topic] = []
            grouped_docs[topic].append(prob)

    # Calculate the average probability for each topic
    avg_probs = {}
    for topic, probs in grouped_docs.items():
        avg_probs[topic] = sum(probs) / len(probs)

    # Sort the topics by average probability
    sorted_topics = sorted(avg_probs.items(), key=lambda x: x[1], reverse=True)

    # Print the top topics
    for i, (topic, prob) in enumerate(sorted_topics[:5]):
        print(f"Tema {i+1}: {prob:.4f}")
        print("Palabras más importantes:", [term[0] for term in lda_model.show_topic(topic, topn=5)])
        print()


# prueba del codigo





<h4>Ejemplo de como se usa<h4>

In [62]:
model_ML_review("Walgreens",-1)

Tema 1: 0.3908
Palabras más importantes: ['pharmacy', 'prescription', 'time', 'get', 'hour']

Tema 2: 0.3083
Palabras más importantes: ['customer', 'rude', 'service', 'pharmacy', 'walgreens']

Tema 3: 0.1821
Palabras más importantes: ['said', 'told', 'went', 'would', 'walgreens']

Tema 4: 0.1777
Palabras más importantes: ['store', 'one', 'line', 'help', 'people']

Tema 5: 0.1327
Palabras más importantes: ['sd', 'mask', 'google', 'la', 'original']



### Funcion completa

In [65]:
def review_analysis():
    obtener_data()
    asignar_sentiment_analysis()


In [66]:
review_analysis()

Tema 1: 0.3328
Palabras más importantes: ['customer', 'service', 'rude', 'pharmacy', 'store']

Tema 2: 0.2939
Palabras más importantes: ['prescription', 'pharmacy', 'get', 'time', 'ready']

Tema 3: 0.2338
Palabras más importantes: ['pharmacy', 'line', 'time', 'wait', 'drive']

Tema 4: 0.1974
Palabras más importantes: ['walgreens', 'store', 'said', 'item', 'went']

Tema 5: 0.1385
Palabras más importantes: ['sd', 'photo', 'mask', 'picture', 'shot']



In [13]:
df_pruebita = pd.read_parquet('df_filtrado_polaridad.parquet')

In [14]:
df_pruebita

Unnamed: 0,gmap_id,rating,text,business_name,sentiment_analysis
0,0x887d579ef372c2f7:0x1f347e0e964cd5a4,5.0,My sweet lady went in to get us a six pack of ...,Dollar General,1
1,0x887d579ef372c2f7:0x1f347e0e964cd5a4,5.0,Very friendly and organized. Most organized I ...,Dollar General,1
2,0x887d579ef372c2f7:0x1f347e0e964cd5a4,3.0,They are out of milk just about every time I'm...,Dollar General,0
3,0x887d579ef372c2f7:0x1f347e0e964cd5a4,4.0,"If you don't have a mask, they have them at th...",Dollar General,1
4,0x887d579ef372c2f7:0x1f347e0e964cd5a4,5.0,Brand new. Very spacious. Very well designed b...,Dollar General,1
...,...,...,...,...,...
695174,0x88557d23b4194627:0x486caa7e23bbf4e2,,,HOT SPOT,-1
695175,0x8644d007039a2617:0x4c81868f19f9023,,,H-E-B Convenience Store,-1
695176,0x89002c8ef436b093:0xa4dec6b69c91d61d,,,Speedway,-1
695177,0x87f6268d6493baa5:0xc5110d4f9e9dc644,,,Penn Super USA,-1


In [17]:
df_pruebita = df_pruebita[(df_pruebita['business_name'] == "Walgreens") 
                          | (df_pruebita['business_name'] == "7-Eleven") 
                          | (df_pruebita['business_name'] == "Circle K") 
                          | (df_pruebita['business_name'] == "Casey's General Store")]

In [18]:
df_pruebita

Unnamed: 0,gmap_id,rating,text,business_name,sentiment_analysis
126,0x88896d9a53db1229:0x9d59442b288dd900,5.0,I moved to Moody and had to switch to this pha...,Walgreens,1
127,0x88896d9a53db1229:0x9d59442b288dd900,1.0,This is for the pharmacy staff. The retail sta...,Walgreens,-1
128,0x88896d9a53db1229:0x9d59442b288dd900,1.0,If I could give a negative star rating I would...,Walgreens,-1
129,0x88896d9a53db1229:0x9d59442b288dd900,1.0,Ordered online got email to go collect and onl...,Walgreens,-1
130,0x88896d9a53db1229:0x9d59442b288dd900,1.0,Worst pharmacy in the world. Employees can not...,Walgreens,-1
...,...,...,...,...,...
695147,0x80ea6af33838ce2b:0x3a1996055d301b49,,,Walgreens,-1
695161,0x88e6da413d09bcb7:0x8c31b1ac32e1fc2a,,,Walgreens,-1
695165,0x88db39443c074c89:0xb20f9cd6006f0bc2,,,Circle K,-1
695166,0x88db6a843ca61173:0xb38f733cbe406d75,,,Circle K,-1


In [19]:
df_pruebita.to_parquet('dataset_filter_CC.parquet')