In [1]:
import jsonlines
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.sparse import vstack
from scipy.sparse import csr_matrix
import nltk
import numpy as np

from nltk.sentiment import SentimentIntensityAnalyzer



# FOR TESTING, IGNORE
def get_gmap_id_by_name(
    name,
    path='data/data filtrada/Google Maps filtrado/Google Maps json/strict-filtered-bars.jsonl',
):
    """Return the ``gmap_id`` of the first record whose ``name`` equals *name*.

    The JSON Lines file is scanned top to bottom and the search stops at the
    first exact (case-sensitive) match.

    Args:
        name: Place name to look for.
        path: Location of the JSON Lines file to scan. The default uses forward
            slashes: the previous backslash-separated literal relied on invalid
            escape sequences (``\\d``, ``\\G``, ``\\s``) and only resolved on
            Windows.

    Returns:
        The matching record's ``gmap_id``, or ``None`` when no record matches.
    """
    with jsonlines.open(path) as file:
        for line in file:
            if line['name'] == name:
                return line['gmap_id']
    return None

# Example usage: look up a single bar by name and report the outcome.
name = "Chili's Grill & Bar"
gmap_id = get_gmap_id_by_name(name)
# A falsy result (None when nothing matched) selects the "not found" message.
print(
    f"The 'gmap_id' for '{name}' is '{gmap_id}'."
    if gmap_id
    else f"No 'gmap_id' found for '{name}'."
)

In [2]:
def get_gmap_ids_by_name(name):
    """Collect the ``gmap_id`` of every record whose ``name`` equals *name*.

    Unlike a first-match lookup, the whole JSON Lines file is read, so every
    record sharing the same name contributes one id.

    Args:
        name: Place name to match exactly (case-sensitive).

    Returns:
        list: The matching ``gmap_id`` values in file order; empty when nothing
        matches.
    """
    source = r'data/data filtrada/Google Maps filtrado/Google Maps json/strict-filtered-bars.jsonl'
    with jsonlines.open(source) as records:
        return [record['gmap_id'] for record in records if record['name'] == name]

# Example usage: collect every gmap_id registered under this bar name.
# `gmap_ids` is what the following cells pass to get_reviews_by_gmap_ids().
gmap_ids = get_gmap_ids_by_name("Chili's Grill & Bar")
# Equivalent two-step form, kept for reference:
#name = "Chili's Grill & Bar"
#gmap_ids = get_gmap_ids_by_name(name)
# NOTE(review): the bare string below is not a comment. As the final expression
# of the cell it is evaluated and echoed as the cell's output. It looks like a
# list of other bar names to try -- confirm, then turn it into a real comment.
''' ///  Hooters  ///  Chili's Grill & Bar  ///  Cheesecake Bistro by Copeland's   /// XTC Cabaret Dallas   '''

" ///  Hooters  ///  Chili's Grill & Bar  ///  Cheesecake Bistro by Copeland's   /// XTC Cabaret Dallas   "

In [3]:
def get_reviews_by_gmap_ids(gmap_ids):
    """Load the reviews of the given places and split them by star rating.

    The filtered reviews file is scanned once. Only reviews whose ``gmap_id``
    is one of *gmap_ids* are considered:

    * rating <= 2 -> tagged ``rating_category = 0`` and added to the low list;
    * rating >= 4 -> tagged ``rating_category = 1`` and added to the high list;
    * anything in between (e.g. 3 stars) is dropped.

    Args:
        gmap_ids: A single id string, or any iterable of id strings (list,
            tuple, set, generator, ...).

    Returns:
        tuple[list[dict], list[dict]]: ``(reviews_low, reviews_high)``, each in
        file order. Every returned dict is the parsed review plus the added
        ``rating_category`` key.
    """
    # Normalise the ids to a set once: membership is then O(1) per review line
    # instead of a list scan, and one-shot iterables are not consumed by the
    # first lookup. A bare string is treated as one id, not as characters.
    if isinstance(gmap_ids, str):
        wanted_ids = {gmap_ids}
    else:
        wanted_ids = set(gmap_ids)

    reviews_low = []
    reviews_high = []

    with jsonlines.open(r'data/data filtrada/Google Maps filtrado/Google Maps json/google-strict-filtered-reviews.jsonl') as file:
        for line in file:
            if line['gmap_id'] not in wanted_ids:
                continue

            rating = line['rating']
            if rating <= 2:
                line['rating_category'] = 0
                reviews_low.append(line)
            elif rating >= 4:
                line['rating_category'] = 1
                reviews_high.append(line)

    return reviews_low, reviews_high

In [4]:
# Load the low-rated and high-rated reviews for the selected places.
# The function returns exactly two lists (low, high); no average score is returned.
filtered_reviews_low, filtered_reviews_high = get_reviews_by_gmap_ids(gmap_ids)


In [5]:
def create_tfidf_matrix(reviews_low, reviews_high):
    """Vectorise both review groups with a single, shared TF-IDF vectorizer.

    One ``TfidfVectorizer`` is fitted on the ``processed_text`` of the low
    reviews followed by the high reviews, and then used to transform each
    group separately.

    Args:
        reviews_low: Sequence of review dicts, each with a ``processed_text`` key.
        reviews_high: Sequence of review dicts, each with a ``processed_text`` key.

    Returns:
        tuple: ``(tfidf_matrix_low, tfidf_matrix_high)`` -- the transform of the
        low texts and of the high texts, in input order.
    """
    texts_low = [entry['processed_text'] for entry in reviews_low]
    texts_high = [entry['processed_text'] for entry in reviews_high]

    # Fit once on low + high so both transforms come from the same vectorizer.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(texts_low + texts_high)

    return vectorizer.transform(texts_low), vectorizer.transform(texts_high)

# Usage example
# Cell 4 already loaded exactly these reviews (same function, same gmap_ids),
# so reuse that result instead of scanning the whole reviews file a second time.
reviews_low, reviews_high = filtered_reviews_low, filtered_reviews_high
tfidf_matrix_low, tfidf_matrix_high = create_tfidf_matrix(reviews_low, reviews_high)

# Using an SVM instead (see the next cell).

In [6]:
## Original note: TURN BACK TO PYTHON SCRIPT TO RUN

# Feature matrix: low-rated rows stacked on top of high-rated rows.
tfidf_matrix = vstack([tfidf_matrix_low, tfidf_matrix_high])

# Labels in the same row order: 0.0 for every low review, 1.0 for every high one.
target_variable_low = np.zeros(tfidf_matrix_low.shape[0])
target_variable_high = np.ones(tfidf_matrix_high.shape[0])
target_variable = np.concatenate([target_variable_low, target_variable_high])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    target_variable,
    test_size=0.2,
    random_state=42,
)

# Linear-kernel support vector regressor fitted on the 0/1 labels.
# NOTE(review): SVR is a regressor, so its predictions are continuous values,
# not class labels -- confirm this is intended rather than a classifier.
svm_model = SVR(kernel='linear').fit(X_train, y_train)

# Score the held-out rows and report the root mean squared error.
predictions = svm_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")


Root Mean Squared Error (RMSE): 0.24001852100357135


In [9]:
# Star ratings of the low-rated reviews, one per entry of reviews_low.
original_ratings = [review['rating'] for review in reviews_low]
# Model output for each low-rated review; this notebook treats it as a rating increase.
# NOTE(review): svm_model was fitted on 0/1 category labels, so these values are on
# that scale rather than in stars -- confirm that adding them to star ratings is intended.
predicted_increases = svm_model.predict(tfidf_matrix_low)
# Per-review rating after adding the predicted increase to the original rating.
updated_ratings = [original + increase for original, increase in zip(original_ratings, predicted_increases)]
# Mean of the updated per-review ratings.
updated_rate = np.mean(updated_ratings)
# NOTE(review): despite its name, this is the mean of the predicted increases,
# not a mean rating. The name is kept because a later cell reads it.
average_updated_rating = np.mean(predicted_increases)


In [12]:
# Current average star rating of the low-rated reviews.
ratings = [review['rating'] for review in reviews_low]
average_rating = np.mean(ratings)

# Use f-strings for interpolation. The previous `print("label", {value})` form
# built a one-element set, so every number was printed wrapped in braces.
# Lines, in order: current average, mean predicted increase, modified average.
print(f"puntaje promedio actual: {average_rating}")
print(f"incremento en puntaje promedio: {average_updated_rating}")
print(f"puntaje promedio modificado: {updated_rate}")

puntaje promedio actual: {1.4133016627078385}
incremento en puntaje promedio: {0.20397236654390022}
puntaje promedio modificado: {1.6172740292517387}
