In [1]:
import nltk
import pyarrow as pa
import pyarrow.parquet as pq
import sklearn
from sklearn.preprocessing import StandardScaler
import seaborn as sns
    
from sklearn.linear_model import LinearRegression

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

In [2]:
# Load the input dataset from a Parquet file addressed by a gs:// URL.
df = pd.read_parquet('gs://data_henry_proyecto_final/google_analisis_sentimen.parquet')

In [3]:
# Bare last expression: renders the freshly loaded frame as the cell output.
df

Unnamed: 0,name_y,text,latitude,longitude,num_of_reviews,avg_rating
0,"""TACOS"" MICHOACAN",26,34.193687,-118.531962,88,4.1
1,#8:46,26,30.057229,-95.494884,57,4.4
2,$,35,26.348095,-80.084331,158,3.3
3,'Essen,30,40.759743,-73.973728,118,4.1
4,.,19,40.111530,-75.336822,56,4.2
...,...,...,...,...,...,...
23094,马师傅面馆,3,32.951999,-96.727767,38,3.9
23095,魔饭 Morefan,8,33.067249,-96.696350,28,4.0
23096,옛날통닭 & 스시 | Chicken and Sushi Flushing NY,29,40.761944,-73.803939,124,4.2
23097,초원가든 Chowon Garden,7,40.763264,-73.808759,35,3.8


In [4]:
def calcular_puntaje(fila):
    """Return the weighted score of a single row.

    The score is the linear combination
    ``-2 * fila['text'] + 1 * fila['avg_rating'] + 1 * fila['num_of_reviews']``.

    Parameters
    ----------
    fila : mapping-like
        Any object supporting ``fila[columna]`` (for example a pandas Series
        produced by ``DataFrame.apply(..., axis=1)`` or a plain dict) that
        contains the keys ``'text'``, ``'avg_rating'`` and ``'num_of_reviews'``.

    Returns
    -------
    number
        The weighted sum of the three columns.

    Raises
    ------
    KeyError
        If ``fila`` lacks one of the weighted columns.
    """
    # Each column is listed exactly once. The previous literal repeated
    # 'num_of_reviews'; Python silently keeps a single entry for a duplicated
    # key, so the repetition never affected the result but was misleading.
    pesos = {'text': -2, 'avg_rating': 1, 'num_of_reviews': 1}

    puntaje = 0
    # Accumulate in dict order so floating-point results stay unchanged.
    for columna, peso in pesos.items():
        puntaje += peso * fila[columna]
    return puntaje

In [5]:
# Apply calcular_puntaje to every row (axis=1) and store the result in a new 'puntaje' column.
df['puntaje'] = df.apply(calcular_puntaje, axis=1)

In [6]:
# Bare last expression: renders the frame, now including the 'puntaje' column.
df

Unnamed: 0,name_y,text,latitude,longitude,num_of_reviews,avg_rating,puntaje
0,"""TACOS"" MICHOACAN",26,34.193687,-118.531962,88,4.1,40.1
1,#8:46,26,30.057229,-95.494884,57,4.4,9.4
2,$,35,26.348095,-80.084331,158,3.3,91.3
3,'Essen,30,40.759743,-73.973728,118,4.1,62.1
4,.,19,40.111530,-75.336822,56,4.2,22.2
...,...,...,...,...,...,...,...
23094,马师傅面馆,3,32.951999,-96.727767,38,3.9,35.9
23095,魔饭 Morefan,8,33.067249,-96.696350,28,4.0,16.0
23096,옛날통닭 & 스시 | Chicken and Sushi Flushing NY,29,40.761944,-73.803939,124,4.2,70.2
23097,초원가든 Chowon Garden,7,40.763264,-73.808759,35,3.8,24.8


In [7]:
# Single input feature (the 'puntaje' score) and two regression targets
# (the latitude/longitude columns).
X = df[['puntaje']]
y = df[['latitude', 'longitude']]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 3-nearest-neighbours regressor on the training split and report the
# mean squared error on the held-out rows.
modelo = KNeighborsRegressor(n_neighbors=3)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Error cuadrático medio: {mse}')

# Model smoke test: predict coordinates for one hand-picked score.
nuevo_puntaje = 90
# The model was fitted on a DataFrame whose column is named 'puntaje', so the
# query is passed as a one-row DataFrame with the same column name. Passing a
# bare ndarray makes scikit-learn warn that X has no valid feature names.
prediccion = modelo.predict(pd.DataFrame({'puntaje': [nuevo_puntaje]}))
print(f'Predicción de latitud y longitud para el nuevo restaurante: {prediccion}')

Error cuadrático medio: 223.95590539962112
Predicción de latitud y longitud para el nuevo restaurante: [[ 40.6077766 -74.3530109]]




In [8]:
# Run the fitted model over every row of X (not only the held-out split) and
# wrap the two predicted output columns in a labelled DataFrame.
predicciones = modelo.predict(X)
df_predicciones = pd.DataFrame(
    data=predicciones,
    columns=['latitud_predicha', 'longitud_predicha'],
)

In [9]:
# Bare last expression: renders the predicted-coordinates frame.
df_predicciones

Unnamed: 0,latitud_predicha,longitud_predicha
0,36.062192,-97.015322
1,37.270520,-106.735997
2,35.921441,-75.871005
3,41.486506,-75.579128
4,32.681518,-110.530677
...,...,...
23094,28.610678,-93.246688
23095,37.016697,-104.463729
23096,41.525705,-77.591831
23097,33.578022,-103.697119


In [11]:

# Give both frames a fresh default index (old index discarded) so the column
# copy below pairs rows by position.
df_predicciones = df_predicciones.reset_index(drop=True)
df = df.reset_index(drop=True)

# Attach the two predicted coordinate columns to the main frame.
df['latitud_predicha'], df['longitud_predicha'] = (
    df_predicciones['latitud_predicha'],
    df_predicciones['longitud_predicha'],
)

In [12]:
# Bare last expression: renders the frame with the predicted columns attached.
df

Unnamed: 0,name_y,text,latitude,longitude,num_of_reviews,avg_rating,puntaje,latitud_predicha,longitud_predicha
0,"""TACOS"" MICHOACAN",26,34.193687,-118.531962,88,4.1,40.1,36.062192,-97.015322
1,#8:46,26,30.057229,-95.494884,57,4.4,9.4,37.270520,-106.735997
2,$,35,26.348095,-80.084331,158,3.3,91.3,35.921441,-75.871005
3,'Essen,30,40.759743,-73.973728,118,4.1,62.1,41.486506,-75.579128
4,.,19,40.111530,-75.336822,56,4.2,22.2,32.681518,-110.530677
...,...,...,...,...,...,...,...,...,...
23094,马师傅面馆,3,32.951999,-96.727767,38,3.9,35.9,28.610678,-93.246688
23095,魔饭 Morefan,8,33.067249,-96.696350,28,4.0,16.0,37.016697,-104.463729
23096,옛날통닭 & 스시 | Chicken and Sushi Flushing NY,29,40.761944,-73.803939,124,4.2,70.2,41.525705,-77.591831
23097,초원가든 Chowon Garden,7,40.763264,-73.808759,35,3.8,24.8,33.578022,-103.697119


In [15]:
import pyarrow as pa
import pyarrow.parquet as pq

In [16]:
# Convert the pandas DataFrame into a pyarrow Table for writing.
table = pa.Table.from_pandas(df)

In [17]:
# Keep the table's schema; it is passed to the Parquet writer below.
schema = table.schema

In [None]:
# Write the table to a Parquet file at the gs:// destination; the `with`
# block closes the writer when the body finishes.
# NOTE(review): this cell shows no execution count, so the write was not
# confirmed to succeed here — verify that the installed pyarrow resolves
# gs:// paths in this environment.
with pq.ParquetWriter('gs://data_henry_proyecto_final/google_analisis_prediccion.parquet', schema) as writer:
    writer.write_table(table)