In [3]:
import os
from urllib.parse import quote_plus

import joblib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sqlalchemy import create_engine

# Read the DB credentials from the .env file / process environment.
load_dotenv()

user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("HOST", "localhost")

# Fail early with a clear message instead of building a URL that contains
# the literal text "None" and only failing later at connection time.
if user is None or password is None:
    raise RuntimeError(
        "DB_USER and DB_PASSWORD must be defined in the environment or in the .env file"
    )

# Build the database connection URL. The credentials are percent-encoded so
# that characters such as '@', '/' or ':' cannot corrupt the URL.
database_url = (
    f"mysql+mysqlconnector://{quote_plus(user)}:{quote_plus(password)}@{host}/meneame"
)

# Show where we are connecting WITHOUT the password: cell outputs are stored
# in the .ipynb file and would otherwise leak the secret when it is shared.
print(f"mysql+mysqlconnector://{quote_plus(user)}:***@{host}/meneame")
engine = create_engine(database_url)

# Load the news table from the database into a DataFrame.
query_news = "SELECT * FROM news_info_table"
df_news = pd.read_sql(query_news, engine)


mysql+mysqlconnector://root:***@localhost/meneame


In [5]:
# Summarize the loaded frame: row count, column names, non-null counts and dtypes.
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287563 entries, 0 to 287562
Data columns (total 17 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   news_id          287563 non-null  int64         
 1   title            287563 non-null  object        
 2   content          287563 non-null  object        
 3   category_id      287563 non-null  int64         
 4   meneos           287563 non-null  int64         
 5   clicks           287563 non-null  int64         
 6   karma            287563 non-null  int64         
 7   positive_votes   287563 non-null  int64         
 8   anonymous_votes  287563 non-null  int64         
 9   negative_votes   287563 non-null  int64         
 10  comments         287563 non-null  int64         
 11  published_date   287563 non-null  datetime64[ns]
 12  scraped_date     287563 non-null  datetime64[ns]
 13  user_id          287563 non-null  int64         
 14  source_id        287

In [None]:
# Quick look at the candidate model columns next to the `clicks` column.
features = [
    'clicks',
    'category_id',
    'meneos',
    'karma',
    'positive_votes',
    'negative_votes',
    'comments',
    'published_date',
]
df_selected = df_news.loc[:, features]

df_selected



Unnamed: 0,clicks,category_id,meneos,karma,positive_votes,negative_votes,comments,published_date
0,0,7,246,11,246,0,21,2005-12-07 05:07:36
1,0,1,61,11,61,0,5,2005-12-07 09:25:02
2,0,1,34,13,34,0,0,2005-12-07 10:30:01
3,0,1,18,22,18,0,0,2005-12-07 11:30:01
4,0,1,35,32,35,0,0,2005-12-07 13:25:01
...,...,...,...,...,...,...,...,...
287558,1300,2,91,473,48,0,102,2025-03-04 11:45:02
287559,867,7,122,433,65,0,27,2025-03-04 12:50:03
287560,2282,7,302,406,137,4,250,2025-03-04 10:55:03
287561,810,7,191,501,90,0,48,2025-03-04 12:50:02


In [6]:
# Predict `clicks` from engagement counters and publication-date features
# using a random forest whose hyperparameters are tuned by randomized search.

# --- Settings a reader may want to tune -----------------------------------
SEED = 42
N_ITER = 50                # hyperparameter combinations sampled by the search
CV_FOLDS = 5               # cross-validation folds per combination
# The search trains N_ITER * CV_FOLDS forests. Set this to a row count
# (e.g. 50_000) to tune on a random subsample of the training set; the best
# configuration is then refitted on the full training set. None = use all rows.
SEARCH_SAMPLE_SIZE = None

# Select the relevant features
features = ['category_id', 'meneos', 'karma', 'positive_votes', 'negative_votes', 'comments', 'published_date']

# 1. Data preprocessing
# Built as one chain from `df_news`, so re-running the cell always starts from
# the raw frame. Column order matches the step-by-step construction it
# replaces: calendar columns first, then the derived vote features.
df_selected = (
    df_news[features + ['clicks']]  # 'clicks' is included as the target variable
    .dropna()
    .assign(published_date=lambda d: pd.to_datetime(d['published_date']))
    .assign(
        day_of_week=lambda d: d['published_date'].dt.dayofweek,  # 0: Monday, 6: Sunday
        month=lambda d: d['published_date'].dt.month,            # 1: January, 12: December
        year=lambda d: d['published_date'].dt.year,
    )
    # The raw timestamp is dropped once its parts have been extracted.
    .drop(columns='published_date')
    .assign(
        # +1 in the denominator avoids division by zero
        votes_ratio=lambda d: d['positive_votes'] / (d['negative_votes'] + 1),
        votes_diff=lambda d: d['positive_votes'] - d['negative_votes'],
        interaction=lambda d: d['meneos'] * d['karma'],
    )
)

# Check the data types
print(df_selected.dtypes)

# 2. Split the data
X = df_selected.drop('clicks', axis=1)
y = df_selected['clicks']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# 3. Hyperparameter tuning with RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']  # 'auto' was replaced by 'sqrt'
}

model = RandomForestRegressor(random_state=SEED)
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=N_ITER,
    cv=CV_FOLDS,
    scoring='neg_mean_squared_error',
    random_state=SEED,
    n_jobs=-1,
    verbose=1,  # report progress: the full search can run for a very long time
)

# Optionally tune on a reproducible subsample of the training rows.
use_subsample = SEARCH_SAMPLE_SIZE is not None and SEARCH_SAMPLE_SIZE < len(X_train)
if use_subsample:
    X_search = X_train.sample(n=SEARCH_SAMPLE_SIZE, random_state=SEED)
    y_search = y_train.loc[X_search.index]
else:
    X_search, y_search = X_train, y_train

random_search.fit(X_search, y_search)

# Best hyperparameters found
print("Mejores hiperparámetros:", random_search.best_params_)

# 4. Model with the best hyperparameters
# With the default refit=True, RandomizedSearchCV has already refitted
# `best_estimator_` on everything passed to `fit`, so another fit is only
# needed when the search saw a subsample rather than the full training set.
best_model = random_search.best_estimator_
if use_subsample:
    best_model.fit(X_train, y_train)

# 5. Model evaluation on the held-out test split
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Error Cuadrático Medio (MSE): {mse}')
print(f'Error Absoluto Medio (MAE): {mae}')
print(f'Coeficiente de Determinación (R²): {r2}')

# 6. Residual analysis
residuals = y_test - y_pred
print("Residuales (media):", np.mean(residuals))
print("Residuales (desviación estándar):", np.std(residuals))

# 7. Save the tuned model
joblib.dump(best_model, 'modelo_entrenado_mejorado.pkl')


category_id         int64
meneos              int64
karma               int64
positive_votes      int64
negative_votes      int64
comments            int64
clicks              int64
day_of_week         int32
month               int32
year                int32
votes_ratio       float64
votes_diff          int64
interaction         int64
dtype: object


KeyboardInterrupt: 