In [None]:
import json
from collections import Counter

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [4]:
# Step 1: load the venue records (each carries the venue name and its gmap_id)
# from the JSON Lines file, decoding one JSON object per line.
local_data = []
with open(r'strict-filtered-bars.jsonl', mode='r', encoding='utf-8') as bars_file:
    local_data.extend(json.loads(raw_line) for raw_line in bars_file)

# Step 2: containers for the selected venue's review texts and ratings.
# They are filled later, once the venue's gmap_id is known.
reviews = []
ratings = []

local_name = input("Ingrese el nombre del local: ")

# Normalise the query once instead of on every loop iteration.
normalized_query = local_name.strip().lower()

# Resolve the typed name to a gmap_id: the first venue whose name matches after
# trimming surrounding whitespace and ignoring case wins; None when nothing
# matches. Records whose 'name' is missing or not a string are skipped so a
# single incomplete record cannot abort the lookup.
gmap_id = next(
    (
        entry['gmap_id']
        for entry in local_data
        if isinstance(entry.get('name'), str)
        and entry['name'].strip().lower() == normalized_query
    ),
    None,
)

# Without a venue there is nothing to analyse: report it and stop here.
# SystemExit is raised directly rather than calling exit(): exit() is an
# interactive-shell helper injected by the `site` module, and inside a Jupyter
# kernel it requests a kernel shutdown (NOTE(review): confirm on your front
# end), which would wipe the session instead of merely stopping this cell.
if gmap_id is None:
    print("No se encontró el local especificado.")
    raise SystemExit

# Scan the reviews JSON Lines file and keep, for the selected venue only, each
# review's preprocessed text and its rating (appended to two parallel lists).
with open(r'strict-filtered-reviews.jsonl', mode='r', encoding='utf-8') as reviews_file:
    for record in map(json.loads, reviews_file):
        if record['gmap_id'] != gmap_id:
            continue  # review belongs to a different venue
        reviews.append(record['processed_text'])
        ratings.append(record['rating'])

# Load the category vocabularies from lista-categorias.jsonl. Each line is a
# JSON object; only its first key is read and taken as the category name, and
# the value stored under that key is split on ", " into the category's words.
categories_data = []
with open(r'lista-categorias.jsonl', mode='r', encoding='utf-8') as categories_file:
    for raw_line in categories_file:
        record = json.loads(raw_line)
        first_key = next(iter(record))
        categories_data.append(
            {"category": first_key, "words": record[first_key].split(", ")}
        )


# Tunable values for the sentiment model, named so they are easy to find.
NEGATIVE_RATING_THRESHOLD = 3  # ratings strictly below this are labelled 0
TEST_FRACTION = 0.2            # share of the reviews held out for evaluation
SPLIT_SEED = 42                # fixed seed so the split is reproducible

# Binary target derived from the rating: 0 below the threshold, 1 otherwise.
labels = np.array(
    [0 if rating < NEGATIVE_RATING_THRESHOLD else 1 for rating in ratings]
)

# With fewer than two reviews the split cannot yield a non-empty training part
# and a non-empty test part; fail here with a clear message instead of an
# error raised from inside scikit-learn.
if len(reviews) < 2:
    raise ValueError(
        f"Se necesitan al menos 2 reseñas para entrenar el modelo; "
        f"se encontraron {len(reviews)}."
    )

X_train, X_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# A classifier cannot be fitted when every training example carries the same
# label, which happens for venues whose reviews are all positive or all
# negative. Same exception type scikit-learn would raise, clearer message.
if len(set(y_train.tolist())) < 2:
    raise ValueError(
        "Las reseñas de entrenamiento pertenecen a una sola clase "
        "(todas positivas o todas negativas); no se puede entrenar el clasificador."
    )

# Learn the TF-IDF vocabulary on the training texts only, then reuse it
# unchanged for the held-out texts.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the logistic regression model on the vectorised training reviews.
logreg = LogisticRegression()
logreg.fit(X_train_vectorized, y_train)

def get_top_categories_for_local():
    """Summarise which categories the held-out negative reviews mention most.

    Reads module-level state instead of taking parameters: ``X_test`` and
    ``y_test`` (held-out review texts and their 0/1 labels),
    ``categories_data`` (a list of ``{"category": name, "words": [...]}``
    dicts) and the fitted ``vectorizer`` and ``logreg``.

    Only reviews whose label is 0 are analysed. Each is split on whitespace,
    and every token found in a category's word list counts as one hit for
    that category.

    Returns:
        tuple: ``(top_categories, top_words, local_reviews_predicted)``.

        * ``top_categories``: list of ``(category, hit_count)`` pairs sorted
          by hit count, highest first; ties keep the ``categories_data``
          order.
        * ``top_words``: dict mapping each category to at most three
          ``(word, occurrences)`` pairs, most frequent first.
        * ``local_reviews_predicted``: the model's predicted label for each
          analysed review (an empty integer array when there are none). The
          predictions are only returned; they do not affect the counts.
    """
    local_reviews = [review for review, label in zip(X_test, y_test) if label == 0]

    # Register every category up front so the result has the same shape
    # whether or not there is any review to analyse. Word lists become
    # frozensets so each membership test is a hash lookup, not a list scan.
    categories_count = {}
    matched_words = {}
    vocabularies = []
    for category in categories_data:
        category_name = category['category']
        categories_count.setdefault(category_name, 0)
        matched_words.setdefault(category_name, Counter())
        vocabularies.append((category_name, frozenset(category['words'])))

    if local_reviews:
        # Predict the labels for the analysed reviews with the fitted model.
        local_reviews_vectorized = vectorizer.transform(local_reviews)
        local_reviews_predicted = logreg.predict(local_reviews_vectorized)
    else:
        # The held-out split may contain no negative review at all.
        # scikit-learn's input validation rejects zero-sample input, so skip
        # the model and return an empty prediction array instead of crashing.
        local_reviews_predicted = np.array([], dtype=int)

    for review in local_reviews:
        words = review.split()
        for category_name, vocabulary in vocabularies:
            # Compute the matching tokens once; they feed both tallies.
            hits = [word for word in words if word in vocabulary]
            categories_count[category_name] += len(hits)
            matched_words[category_name].update(hits)

    # sorted() is stable, so equal counts keep their first-seen order.
    top_categories = sorted(
        categories_count.items(), key=lambda item: item[1], reverse=True
    )

    # Keep only the three most frequent matched words of each category.
    top_words = {
        category_name: sorted(
            word_counts.items(), key=lambda item: item[1], reverse=True
        )[:3]
        for category_name, word_counts in matched_words.items()
    }

    return top_categories, top_words, local_reviews_predicted


# Rank the categories by how many of their words were found, together with the
# three most repeated words of each category and the model's predictions.
top_categories, top_words, local_reviews_predicted = get_top_categories_for_local()

# Report only the categories that were actually mentioned: the hit total first,
# then each of that category's most repeated words with its own tally.
mentioned_categories = [pair for pair in top_categories if pair[1] > 0]

print(f"Categorías a mejorar en el local {local_name}:")
for category, count in mentioned_categories:
    print(f"{category} = {count}")
    print(f"Las palabras más repetidas en la categoría {category}:")
    for word, word_count in top_words[category]:
        print(f"{word} ({word_count})")
    print()

Categorías a mejorar en el local TGI Fridays:
Service = 2
Las palabras más repetidas en la categoría Service:
prices (1)
service (1)

Food = 1
Las palabras más repetidas en la categoría Food:
food (1)

Negative = 1
Las palabras más repetidas en la categoría Negative:
horrible (1)

