In [33]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

class LinearKernelTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return linear_kernel(X, X)
    
    def score(self, X, y=None):
        # Ejemplo: retornar la media de los valores de la matriz de similitud
        return np.mean(self.transform(X))

# Cargar el dataframe
df1 = pd.read_csv('merged.csv')

# Preprocesamiento de datos (eliminar duplicados, manejar datos faltantes, etc.)
df1.drop_duplicates(inplace=True)
df1.dropna(subset=['name', 'category'], inplace=True)
df = df1[df1['category'].str.contains('restaurant', case=False)]


In [34]:

# Separar datos en entrenamiento y prueba
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# Modelo de recomendación
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LinearKernelTransformer())
])

# Definir los hiperparámetros para Randomized Search
parameters = {
    'tfidf__max_df': (0.25, 0.5),
    'tfidf__ngram_range': [(1, 1), (1, 2)],
}

# Randomized Search y Validación Cruzada
random_search = RandomizedSearchCV(pipeline, parameters, cv=5, n_jobs=-1, n_iter=4)
random_search.fit(X_train['category'], X_train['business_id'])

def recommend_business(business_id, df, model):
    business_index = df[df['business_id'] == business_id].index
    if len(business_index) == 0:
        print("Business ID not found in DataFrame")
        return None  # or any other default value you prefer
    cosine_similarities = model.best_estimator_.named_steps['clf'].transform(
        model.best_estimator_.named_steps['tfidf'].transform([df.loc[business_index[0]]['category']])
    )[0]
    related_business_indices = cosine_similarities.argsort()[:-6:-1]
    return df.iloc[related_business_indices]['business_id'].values.tolist()

# Ejemplo de uso
recommended_businesses = recommend_business('0x88e665ef4e6d51d9:0x15ba4a3671abf5d3', df, random_search)
if recommended_businesses:
    print(recommended_businesses)
else:
    print("No se encontraron negocios similares.")




['0x8894b5a7a7909725:0xe4687299fd288188']
