# Filtrado basado en contenido

In [None]:
pip install lightgbm --quiet

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from scipy.sparse import hstack

In [3]:
df_users = pd.read_csv("../../usuarios.csv", sep=",", dtype={"elite": str})
#df_users["elite"] = df_users["elite"].apply(lambda x: list(map(int, str(x).split(","))) if pd.notnull(x) else [])

df_businesses = pd.read_csv("../../negocios.csv", sep=",")
df_train = pd.read_csv("../../train_reviews.csv", sep=",")
df_test = pd.read_csv("../../test_reviews.csv", sep=",")

In [4]:
import itertools

dfs = {'users': df_users, 'businesses': df_businesses, 'train': df_train, 'test': df_test}
columnas_comunes = {}

for (name1, df1), (name2, df2) in itertools.combinations(dfs.items(), 2):
    if (name1 == 'train' and name2 =='test') or (name1 == 'test' and name2 =='train'):
        continue
    comunes = set(df1.columns).intersection(df2.columns)
    if comunes:
        columnas_comunes[(name1, name2)] = sorted(comunes)
        print(f"Columnas comunes entre {name1} y {name2}: {sorted(comunes)}")
        for col in comunes:
            rename1, rename2 = name1, name2
            if name1 == 'train' or name1 == 'test':
                rename1 = 'reviews'
            elif name2 == 'train' or name2 == 'test':
                rename2 = 'reviews'
            if col[-3:]== '_id':
                continue
            dfs[name1] = dfs[name1].rename(columns={col: f'{col}_{rename1}'})
            dfs[name2] = dfs[name2].rename(columns={col: f'{col}_{rename2}'})


Columnas comunes entre users y businesses: ['name', 'review_count']
Columnas comunes entre users y train: ['cool', 'funny', 'useful', 'user_id']
Columnas comunes entre users y test: ['cool', 'funny', 'useful', 'user_id']
Columnas comunes entre businesses y train: ['business_id', 'stars']
Columnas comunes entre businesses y test: ['business_id']


In [5]:
df_users = dfs['users']
df_businesses = dfs['businesses']
df_train = dfs['train']
df_test = dfs['test']

In [6]:
df_train = df_train.merge(df_users, on="user_id", how="left")
df_train = df_train.merge(df_businesses, on="business_id", how="left")

df_test = df_test.merge(df_users, on="user_id", how="left")
df_test = df_test.merge(df_businesses, on="business_id", how="left")

In [7]:
text_col = "text"
target_col = "stars_reviews"
id_col = "review_id"

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
count.fit(df_train[text_col])
print(f"Vocabulario total: {len(count.vocabulary_)}")

Vocabulario total: 209368


In [9]:
X_train = df_train.drop(columns=[target_col])
y_train = df_train[target_col]

X_test = df_test.copy()

In [41]:
# TF-IDF del texto
tfidf = TfidfVectorizer(max_features=5000)

X_text_train = tfidf.fit_transform(X_train[text_col])
X_text_test = tfidf.transform(X_test[text_col])

In [25]:
print(type(X_text_train), type(X_text_train))
print(X_text_train.shape, X_text_test.shape)
print(type(X_train))

<class 'scipy.sparse._csr.csr_matrix'> <class 'scipy.sparse._csr.csr_matrix'>
(967784, 5000) (414765, 5000)
<class 'pandas.core.frame.DataFrame'>


In [26]:
# Modelo LightGBM
model = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=7)
model.fit(X_text_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 9.454706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 869492
[LightGBM] [Info] Number of data points in the train set: 967784, number of used features: 5000
[LightGBM] [Info] Start training from score 3.759882


In [28]:
# Predicción
y_pred = model.predict(X_text_test)

In [30]:
df_submission = pd.DataFrame({
    "review_id": df_test[id_col],
    "stars": y_pred
})
df_submission.to_csv("predictions_contenido/submission_lightgbm.csv", index=False)

Resultado inicial del MAE: `0.6922`

In [42]:
from sklearn.impute import SimpleImputer
from scipy.sparse import hstack
from lightgbm import LGBMRegressor


cols_estructura = [
    'review_count_users', 'useful_users', 'funny_users', 'cool_users', 'fans',
    'average_stars', 'compliment_hot', 'compliment_more', 'compliment_profile',
    'compliment_cute', 'compliment_list', 'compliment_note', 'compliment_plain',
    'compliment_cool', 'compliment_funny', 'compliment_writer', 'compliment_photos',
    'stars_businesses', 'review_count_businesses', 'is_open',
    'useful_reviews', 'funny_reviews', 'cool_reviews',
    'review_month', 'review_weekday', 'text_len'
]

df_train['review_month'] = pd.to_datetime(df_train['date']).dt.month
df_train['review_weekday'] = pd.to_datetime(df_train['date']).dt.weekday

df_test['review_month'] = pd.to_datetime(df_test['date']).dt.month
df_test['review_weekday'] = pd.to_datetime(df_test['date']).dt.weekday

cols_estructura += ['review_month', 'review_weekday']

df_train['text_len'] = df_train['text'].apply(len)
df_test['text_len'] = df_test['text'].apply(len)
cols_estructura += ['text_len']

imputer = SimpleImputer(strategy='mean')

X_struct_train = imputer.fit_transform(df_train[cols_estructura])
X_struct_test = imputer.transform(df_test[cols_estructura])

X_train_final = hstack([X_text_train, X_struct_train])
X_test_final = hstack([X_text_test, X_struct_test])
y_train = df_train["stars_reviews"]


model = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train_final, y_train)


y_pred = model.predict(X_test_final)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 8.092455 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 873741
[LightGBM] [Info] Number of data points in the train set: 967784, number of used features: 5029
[LightGBM] [Info] Start training from score 3.759882




In [None]:
df_output = pd.DataFrame({
    "review_id": df_test["review_id"],
    "stars": y_pred
})

df_output.to_csv("predictions_contenido/prediction_tfidf_lightgbm_29cols.csv", index=False)

Al añadir estas columnas extra de información al modelo, el resultado aumenta a un `0.5385`

Ahora vamos a proceder a añadir una característica que indica los años que el usuario ha estado en el programa elite de yelp.

In [43]:
df_train["elite_years"] = df_train["elite"].fillna("").apply(lambda x: len(x.split(',')) if x else 0)
df_test["elite_years"] = df_test["elite"].fillna("").apply(lambda x: len(x.split(',')) if x else 0)

In [44]:
cols_estructura += ['elite_years']

In [45]:
# Vectorizar 'categories'
cat_vect = CountVectorizer(max_features=100, token_pattern='[^,]+')  # separa por coma
X_cat_train = cat_vect.fit_transform(df_train['categories'].fillna(''))
X_cat_test = cat_vect.transform(df_test['categories'].fillna(''))

In [46]:
# Imputar valores faltantes en las columnas estructuradas
imputer = SimpleImputer(strategy='mean')
X_struct_train = imputer.fit_transform(df_train[cols_estructura])
X_struct_test = imputer.transform(df_test[cols_estructura])

# Concatenar texto + columnas
X_train_final = hstack([X_text_train, X_struct_train])
X_test_final = hstack([X_text_test, X_struct_test])
y_train = df_train["stars_reviews"]

In [46]:
#Entrenar el modelo
model = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train_final, y_train)

# Predicción
y_pred = model.predict(X_test_final)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 8.922104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 873759
[LightGBM] [Info] Number of data points in the train set: 967784, number of used features: 5030
[LightGBM] [Info] Start training from score 3.759882




In [47]:
# Guardar las predicciones en un archivo CSV
df_output = pd.DataFrame({
    "review_id": df_test["review_id"],
    "stars": y_pred
})
df_output.to_csv("predictions_contenido/prediction_tfidf_lightgbm_elite+categories.csv", index=False)

Vamos a hacer un RandomizedSearch para optimizar los parámetros

In [47]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error
import time

In [48]:
# Modelo base
model = LGBMRegressor(random_state=42, n_jobs=-1)

param_dist = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [7, 10, 12],
    'num_leaves': [31, 64, 128],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=40,
    scoring=mae_scorer,
    cv=3,
    verbose=2,
    random_state=42
)

start = time.time()

search.fit(X_train_final, y_train)

end = time.time()
elapsed_hours = round((end - start) / 3600, 2)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 5.121050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 872595
[LightGBM] [Info] Number of data points in the train set: 645189, number of used features: 5030
[LightGBM] [Info] Start training from score 3.761369
[CV] END colsample_bytree=1.0, learning_rate=0.03, max_depth=10, n_estimators=300, num_leaves=31, subsample=1.0; total time= 3.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 6.796422 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 872972
[LightGBM] [Info] Number of data points in the train set: 645189, number of used features: 5030
[LightGBM] [Info] Start training from score 3.758269
[CV] END colsample_bytree=1.0, learning_rate=0.03, max_depth=10, n_estimators=300, num_leaves=31, subsample=1.0;

In [49]:
print(f"Tiempo total de búsqueda: {elapsed_hours} horas")
print("Mejores parámetros encontrados:")
for k, v in search.best_params_.items():
    print(f"{k}: {v}")
print(f"Mejor MAE (negativo): {search.best_score_}")

Tiempo total de búsqueda: 7.26 horas
Mejores parámetros encontrados:
subsample: 0.8
num_leaves: 64
n_estimators: 500
max_depth: 12
learning_rate: 0.1
colsample_bytree: 1.0
Mejor MAE (negativo): -0.49150960937576565


In [50]:
best_model = search.best_estimator_
best_model.fit(X_train_final, y_train)

# Predicción
y_pred = best_model.predict(X_test_final)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 10.133734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 873759
[LightGBM] [Info] Number of data points in the train set: 967784, number of used features: 5030
[LightGBM] [Info] Start training from score 3.759882




In [51]:
df_output = pd.DataFrame({
    "review_id": df_test["review_id"],
    "stars": y_pred
})
df_output.to_csv("predictions_contenido/prediction_tfidf_lightgbm_gridsearch.csv", index=False)

Obtenemos unos resultados de `0.4906`

Randomized Search (versión 2)

In [52]:
# Modelo base
model = LGBMRegressor(random_state=42, n_jobs=-1)

param_dist = {
    'n_estimators': [400, 500, 600],
    'learning_rate': [0.08, 0.1, 0.12],
    'max_depth': [10, 12, 14],
    'num_leaves': [48, 64, 80],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.9, 1.0]
}

mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    scoring=mae_scorer,
    cv=3,
    verbose=2,
    random_state=42
)

start = time.time()

search.fit(X_train_final, y_train)

end = time.time()
elapsed_hours = round((end - start) / 3600, 2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 6.876454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 872595
[LightGBM] [Info] Number of data points in the train set: 645189, number of used features: 5030
[LightGBM] [Info] Start training from score 3.761369
[CV] END colsample_bytree=1.0, learning_rate=0.12, max_depth=10, n_estimators=500, num_leaves=80, subsample=0.7; total time= 5.5min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 9.599111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 872972
[LightGBM] [Info] Number of data points in the train set: 645189, number of used features: 5030
[LightGBM] [Info] Start training from score 3.758269
[CV] END colsample_bytree=1.0, learning_rate=0.12

In [53]:
print(f"\nTiempo total: {elapsed_hours} horas")
print("Mejores parámetros encontrados:")
for k, v in search.best_params_.items():
    print(f"{k}: {v}")
print(f"Mejor MAE (negativo): {search.best_score_}")


Tiempo total: 5.99 horas
Mejores parámetros encontrados:
subsample: 0.7
num_leaves: 64
n_estimators: 600
max_depth: 14
learning_rate: 0.12
colsample_bytree: 1.0
Mejor MAE (negativo): -0.4849501041408935


In [54]:
best_model = search.best_estimator_
best_model.fit(X_train_final, y_train)
y_pred = best_model.predict(X_test_final)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 12.390065 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 873759
[LightGBM] [Info] Number of data points in the train set: 967784, number of used features: 5030
[LightGBM] [Info] Start training from score 3.759882




In [55]:
df_output = pd.DataFrame({
    "review_id": df_test["review_id"],
    "stars": y_pred
})
df_output.to_csv("predictions_contenido/prediction_tfidf_lightgbm_gridsearch2.csv", index=False)

Resultado: `0.4837`

Probamos a procesar el texto con word2vec en vez de tfidf

In [36]:
import nltk
import re
import gensim
from gensim.models import Word2Vec
from scipy.sparse import hstack, csr_matrix
from tqdm import tqdm
from sklearn.impute import SimpleImputer

In [25]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paula\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
def tokenize(text):
    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # quitar símbolos
    return nltk.word_tokenize(text)

df_train["tokens"] = df_train[text_col].fillna("").apply(tokenize)
df_test["tokens"] = df_test[text_col].fillna("").apply(tokenize)

In [31]:
all_tokens = df_train["tokens"].tolist() + df_test["tokens"].tolist()
w2v_model = Word2Vec(sentences=all_tokens, vector_size=100, window=5, min_count=2, workers=4, sg=1, seed=42)

In [33]:
# Promediar vectores por review

def get_review_vector(tokens, model, vector_size=100):
    vecs = [model.wv[word] for word in tokens if word in model.wv]
    if len(vecs) == 0:
        return np.zeros(vector_size)
    return np.mean(vecs, axis=0)

X_vec_train = np.vstack([get_review_vector(tokens, w2v_model, 100) for tokens in tqdm(df_train["tokens"])])
X_vec_test = np.vstack([get_review_vector(tokens, w2v_model, 100) for tokens in tqdm(df_test["tokens"])])

# Convertir a sparse
X_vec_train = csr_matrix(X_vec_train)
X_vec_test = csr_matrix(X_vec_test)

100%|██████████| 967784/967784 [04:45<00:00, 3384.43it/s]
100%|██████████| 414765/414765 [02:20<00:00, 2960.24it/s]


In [37]:
# Añadir longitud del texto como feature

df_train["text_len"] = df_train[text_col].fillna("").apply(len)
df_test["text_len"] = df_test[text_col].fillna("").apply(len)

X_struct_train = df_train[["text_len"]].values
X_struct_test = df_test[["text_len"]].values

imputer = SimpleImputer(strategy="mean")
X_struct_train = imputer.fit_transform(X_struct_train)
X_struct_test = imputer.transform(X_struct_test)

X_struct_train = csr_matrix(X_struct_train)
X_struct_test = csr_matrix(X_struct_test)

In [38]:
# Añadir Word2Vec

X_train_final = hstack([X_vec_train, X_struct_train])
X_test_final = hstack([X_vec_test, X_struct_test])
y_train = df_train["stars_reviews"]

In [39]:
# Entrenar LightGBM

model = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train_final, y_train)

# Predicción
print("Generando predicciones...")
y_pred = model.predict(X_test_final)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.423514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25755
[LightGBM] [Info] Number of data points in the train set: 967784, number of used features: 101
[LightGBM] [Info] Start training from score 3.759882
Generando predicciones...


In [40]:
# Guardar las predicciones en un archivo CSV
df_output = pd.DataFrame({
    "review_id": df_test["review_id"],
    "stars": y_pred
})
df_output.to_csv("predictions_contenido/prediction_word2vec_lightgbm.csv", index=False)