# **SISTEMAS DE RECOMENDACIÓN**

## **Filtrado Basado en Contenido**


Miembros del Grupo:
- Paula Arias Fernández
- Jorge del Castillo Gómez
- Anny Álvarez Nogales

In [None]:
### BORRAR ESTE COMENTARIO CUANDO LO LEÁIS
#Sigo la estructura de la presentación 
# (dejo los huequitos antes de mi código pero si queréis cambiar 
# el orden de algo o moverlo sin problema que no sabia como ponerlo)


In [None]:
# Importing libraries
import pandas as pd
import re
from joblib import Parallel, delayed

# **Pruebas Datos Textuales** 

## Word2Vec Embedding Model

In [None]:
# DATA
# Read the three CSV inputs from the working directory.
negocios_df = pd.read_csv('negocios.csv')
test_reviews = pd.read_csv('test_reviews.csv')
train_reviews = pd.read_csv('train_reviews.csv', sep=',')

# Attach the business columns to each training review. The inner join keeps
# only reviews whose business_id is present in negocios_df.
df = pd.merge(train_reviews, negocios_df, how='inner', on='business_id')
df

In [None]:
# PREPROCESS DATA

# Append the business categories to the review text so both are tokenized
# together. Missing values are replaced with '' first: astype(str) alone turns
# NaN into the literal string "nan", which would then be emitted as a token.
# NOTE: re-running this cell appends the categories to 'text' again.
df['text'] = df['text'].fillna('').astype(str) + " " + df['categories'].fillna('').astype(str)

def preprocess_text_parallel(text):
    """Lowercase *text* and return its alphabetic tokens.

    A token is a run of ASCII letters delimited by word boundaries, so digits,
    punctuation, and letter runs glued to digits or underscores are dropped.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lowercase tokens. Non-string input (for example the float
        NaN pandas uses for a missing value) yields an empty list instead of
        raising AttributeError on ``.lower()``.
    """
    if not isinstance(text, str):
        return []
    return re.findall(r'\b[a-zA-Z]+\b', text.lower())

# Tokenize the merged training rows (review text plus business categories).
# df is the result of an inner merge, so its row count and index are not
# guaranteed to match train_reviews: assigning the raw list positionally could
# raise a length error or attach tokens to the wrong review. The tokens are
# therefore keyed by review_id and mapped back.
merged_tokens = Parallel(n_jobs=-1)(delayed(preprocess_text_parallel)(text) for text in df['text'])
tokens_by_review = pd.Series(merged_tokens, index=df['review_id'].to_numpy(), dtype=object)
# Keep a single entry per review in case the merge duplicated rows.
tokens_by_review = tokens_by_review[~tokens_by_review.index.duplicated()]
train_reviews['tokens'] = train_reviews['review_id'].map(tokens_by_review)

# Reviews dropped by the inner merge have no entry; tokenize their own text.
unmatched = train_reviews['tokens'].isna()
if unmatched.any():
    train_reviews.loc[unmatched, 'tokens'] = (
        train_reviews.loc[unmatched, 'text'].fillna('').astype(str).apply(preprocess_text_parallel)
    )

# Missing test texts become '' so the tokenizer always receives a string.
test_reviews['tokens'] = Parallel(n_jobs=-1)(
    delayed(preprocess_text_parallel)(text) for text in test_reviews['text'].fillna('').astype(str)
)


In [None]:
# TEXT MODEL WORD2VEC
# The model is trained once and saved; later runs load the saved file instead
# of retraining. Previously the model was trained on every run and then
# immediately replaced by a load from a file that was never written.

import os

import gensim
from gensim.models import Word2Vec

WORD2VEC_MODEL_PATH = "word2vec_model.model"

if os.path.exists(WORD2VEC_MODEL_PATH):
    # Reuse the embeddings saved by an earlier run.
    model = Word2Vec.load(WORD2VEC_MODEL_PATH)
else:
    model = Word2Vec(
        sentences=train_reviews['tokens'],
        vector_size=50,
        window=5,
        min_count=5,
        workers=4,
        epochs=7,
    )
    # Persist the freshly trained model so the next run can load it.
    model.save(WORD2VEC_MODEL_PATH)

In [None]:
#APPLY TEXT EMBEDDING MODEL
def get_review_vector(tokens, model):
    """Return the mean embedding of the tokens that *model* knows.

    Args:
        tokens: Iterable of word strings for one review.
        model: Object exposing ``wv`` (supporting ``word in wv`` and
            ``wv[word]``) and ``vector_size``.

    Returns:
        A 1-D NumPy array of length ``model.vector_size``: the element-wise
        mean of the vectors of the in-vocabulary tokens, or all zeros when no
        token is in the vocabulary.
    """
    import numpy as np  # local import keeps the function self-contained

    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        # Return an array (not a list of ints) so every review vector has the
        # same type regardless of which branch produced it.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Embed every tokenized training review with the Word2Vec model.
train_reviews['vector'] = train_reviews['tokens'].apply(
    lambda review_tokens: get_review_vector(review_tokens, model)
)

# Star labels and per-review feature vectors consumed by the classifiers.
y_train = train_reviews['stars']
X_train = train_reviews['vector'].tolist()

In [None]:
# CLASSIFIER LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression

# Fixed hyperparameters for the L2-penalized model.
# NOTE(review): the C value looks like the output of a tuning run - confirm.
logreg_params = {
    'max_iter': 2500,
    'C': 14.867708330182724,
    'solver': 'newton-cg',
    'penalty': 'l2',
}
classifier = LogisticRegression(**logreg_params)
classifier.fit(X_train, y_train)

# Embed the test reviews with the same embedding model and predict their stars.
test_reviews['vector'] = test_reviews['tokens'].apply(
    lambda review_tokens: get_review_vector(review_tokens, model)
)
X_test = test_reviews['vector'].tolist()
predicted_stars = classifier.predict(X_test)

# One row per test review: its id and the predicted rating.
submission_df = pd.DataFrame(
    {'review_id': test_reviews['review_id'], 'stars': predicted_stars}
)

submission_df

In [None]:
# CLASSIFIER RANDOMFOREST

from sklearn.ensemble import RandomForestClassifier

# 100 trees; the fixed seed makes the fitted forest reproducible.
forest_params = {'n_estimators': 100, 'random_state': 42}
classifier = RandomForestClassifier(**forest_params)
classifier.fit(X_train, y_train)

# Recompute the test embeddings so this cell can run on its own.
test_reviews['vector'] = test_reviews['tokens'].apply(
    lambda review_tokens: get_review_vector(review_tokens, model)
)
X_test = test_reviews['vector'].tolist()
predicted_stars = classifier.predict(X_test)

# One row per test review: its id and the predicted rating.
submission_df = pd.DataFrame(
    {'review_id': test_reviews['review_id'], 'stars': predicted_stars}
)

submission_df

In [None]:
# XGBOOST CLASSIFIER

from xgboost import XGBClassifier

# Shift the labels down by one before fitting; the shift is undone on the
# predictions below. Presumably XGBClassifier needs 0-based class ids - confirm.
y_train_adj = y_train - 1

classifier = XGBClassifier(
    random_state=42,
    eval_metric='mlogloss',
    use_label_encoder=False,
)
classifier.fit(X_train, y_train_adj)

# Uses the test vectors already stored in test_reviews['vector'].
X_test = test_reviews['vector'].tolist()
zero_based_predictions = classifier.predict(X_test)
predicted_stars = zero_based_predictions + 1

submission_df = pd.DataFrame(
    {'review_id': test_reviews['review_id'], 'stars': predicted_stars}
)

In [None]:
#OPTUNA WITH LOGISTIC REGRESSION
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Optimización para el mejor modelo de regresión logística

# Hold out 20% of the training data for tuning. The split is stored under its
# own names: unpacking it back into X_train / X_test / y_train would overwrite
# the full training set and the real test features used by the other cells,
# and would shrink the data again every time this cell is re-run.
X_tune_train, X_tune_val, y_tune_train, y_tune_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)


def objective(trial):
    """Fit a logistic regression with sampled hyperparameters and score it.

    Args:
        trial: The Optuna trial used to sample C, solver, max_iter and penalty.

    Returns:
        The accuracy of the fitted model on the tuning hold-out split.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    C = trial.suggest_float('C', 1e-5, 100, log=True)
    solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs', 'newton-cg', 'saga'])
    max_iter = trial.suggest_int('max_iter', 1000, 5000, step=500)
    penalty = trial.suggest_categorical('penalty', ['l2'])

    classifier = LogisticRegression(C=C, solver=solver, max_iter=max_iter, penalty=penalty)
    classifier.fit(X_tune_train, y_tune_train)

    predicted_stars = classifier.predict(X_tune_val)
    return accuracy_score(y_tune_val, predicted_stars)


# Maximize accuracy
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Refit with the best hyperparameters on the tuning training split.
best_classifier = LogisticRegression(**best_params)
best_classifier.fit(X_tune_train, y_tune_train)

# Evaluation. This is the same hold-out split the study optimized against, so
# the reported accuracy is an optimistic estimate.
final_predictions = best_classifier.predict(X_tune_val)
final_accuracy = accuracy_score(y_tune_val, final_predictions)
print(f"Final Accuracy: {final_accuracy}")


## Fast Text Embedding Model

In [None]:
# TEXT MODEL FAST TEXT
from gensim.models import FastText

# Train FastText embeddings on the tokenized training reviews and save them.
fasttext_params = dict(vector_size=50, window=5, min_count=5, epochs=10)
fasttext_model = FastText(sentences=train_reviews['tokens'], **fasttext_params)
fasttext_model.save("fasttext_model.model")

In [None]:
def get_review_vector(tokens, model):
    """Return the mean embedding of the tokens that *model* knows.

    Args:
        tokens: Iterable of word strings for one review.
        model: Object exposing ``wv`` (supporting ``word in wv`` and
            ``wv[word]``) and ``vector_size``.

    Returns:
        A 1-D NumPy array of length ``model.vector_size``: the element-wise
        mean of the vectors of the in-vocabulary tokens, or all zeros when no
        token is in the vocabulary.
    """
    import numpy as np  # local import keeps the function self-contained

    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        # Return an array (not a list of ints) so every review vector has the
        # same type regardless of which branch produced it.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Embed every tokenized training review with the FastText model.
train_reviews['vector'] = train_reviews['tokens'].apply(
    lambda review_tokens: get_review_vector(review_tokens, fasttext_model)
)

# Star labels and per-review feature vectors consumed by the classifier.
y_train = train_reviews['stars']
X_train = train_reviews['vector'].tolist()

In [None]:
# XGBOOST CLASSIFIER

from xgboost import XGBClassifier

# Shift the labels down by one before fitting; the shift is undone on the
# predictions below. Presumably XGBClassifier needs 0-based class ids - confirm.
y_train_adj = y_train - 1

classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
classifier.fit(X_train, y_train_adj)

# Re-embed the test reviews with the FastText model. The vectors stored in
# test_reviews['vector'] by the earlier cells come from the Word2Vec model and
# live in a different embedding space than the FastText training vectors.
test_reviews['vector'] = test_reviews['tokens'].apply(
    lambda review_tokens: get_review_vector(review_tokens, fasttext_model)
)
X_test = list(test_reviews['vector'])
predicted_stars = classifier.predict(X_test) + 1


submission_df = pd.DataFrame({
    'review_id': test_reviews['review_id'],
    'stars': predicted_stars
})


# **Pruebas Datos No Textuales**

In [None]:
# FEATURES SELECTION
usuarios = pd.read_csv('usuarios.csv', sep=',')

# Attach the user columns to the review/business rows.
df = pd.merge(df, usuarios, how='inner', on='user_id')

# Identifier, free-text and descriptive columns removed from the feature table.
columns_to_drop = [
    'review_id', 'user_id', 'business_id', 'text', 'date', 'name_x',
    'address', 'city', 'state', 'attributes', 'categories', 'name_y',
    'elite', 'friends', 'hours', 'yelping_since', 'postal_code',
]
df2 = df.drop(columns=columns_to_drop)

# Strip the merge suffixes. 'name_y' is dropped above, so its entry here
# matches no column.
column_renames = {
    'stars_x': 'stars',
    'name_y': 'user_name',
    'useful_x': 'useful',
    'funny_x': 'funny',
    'cool_x': 'cool',
}
df2 = df2.rename(columns=column_renames)
df2

In [None]:
# NO TEXTUAL DATA NORMALIZATION

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Keep only the three vote counts and the star rating.
df2 = df2[['funny', 'stars', 'cool', 'useful']]
label_encoder = LabelEncoder()  # created here but not used in this cell

# Features are every remaining column except the target; the target is the
# star rating rounded to an integer class.
X_train = df2.drop(columns=['stars'])
y_train = df2['stars'].round().astype(int)

# Normalization: fit the scaler on the training features and scale them.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)

# Training
classifier = LogisticRegression(max_iter=3000)
classifier.fit(X_train_scaled, y_train)

In [None]:
# CLASSIFIER

# Select the test features in exactly the column order used for training.
# A set intersection has no defined order, so it could feed the columns to the
# classifier in a different order than it was fitted on. A column missing
# from test_reviews now fails here with a KeyError.
feature_columns = list(X_train.columns)
X_test = test_reviews[feature_columns]

# Reuse the scaler fitted on the training data. Calling fit_transform here
# would re-fit it on the test set and scale the two sets differently.
X_test_scaled = scaler.transform(X_test)

predicted_stars = classifier.predict(X_test_scaled)

submission_df = pd.DataFrame({
    'review_id': test_reviews['review_id'],
    'stars': predicted_stars
})

submission_df.to_csv('submission.csv', index=False)