In [1]:
# Importaciones necesarias
import os
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


In [2]:
# Location of the processed dataset. Set the FAKE_NEWS_CSV environment variable
# to point at the file on another machine; when it is unset, the original
# author's local path is used so existing behaviour is unchanged.
DATA_PATH = Path(
    os.environ.get(
        "FAKE_NEWS_CSV",
        "/Users/isaromobru/Desktop/FakeNews 10.57.24/proyecto_machine_learning_Fake_News/data/processed/archivo.csv",
    )
)

# Load the processed dataset into the working DataFrame.
df = pd.read_csv(DATA_PATH)

In [3]:
# Turn the categorical `subject_grouped` column into integer codes and store
# them in a new `subject_encoded` column. The fitted encoder is kept in `le`.
# NOTE(review): integer codes impose an arbitrary ordering on the subjects;
# confirm this is acceptable for the models trained on this feature.
le = LabelEncoder()
subject_codes = le.fit_transform(df['subject_grouped'])
df['subject_encoded'] = subject_codes


In [4]:
# Build the sparse design matrix: three numeric columns followed by the TF-IDF
# features of the article body and of the title.
# (csr_matrix and hstack come from the notebook's import cell; the duplicate
# import that used to sit here was removed.)

# Replace missing text with empty strings so the vectorizers never receive NaN
df['text'] = df['text'].fillna("")
df['title'] = df['title'].fillna("")

# TF-IDF vectorizers with a capped vocabulary per field; outputs stay sparse.
# NOTE(review): both vectorizers are fitted on the whole frame. If a train/test
# split is taken from X_final afterwards, the vocabulary and IDF weights have
# already seen the held-out rows. Consider fitting on the training rows only
# (for example inside a Pipeline).
tfidf_text = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_title = TfidfVectorizer(max_features=1000, stop_words='english')

X_text = tfidf_text.fit_transform(df['text'])
X_title = tfidf_title.fit_transform(df['title'])

# Numeric features, wrapped as a sparse matrix so they can be stacked
X_other_features = csr_matrix(df[['subject_encoded', 'title_length', 'title_word_count']].values)

# Stack column-wise without densifying. CSR is requested explicitly: when
# `format` is omitted SciPy chooses the output layout itself (COO in some
# versions), whereas the row slicing done by splitting and cross-validation
# works directly on CSR.
X_final = hstack([X_other_features, X_text, X_title], format="csr")

# Target variable
y = df['label']

In [5]:

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed random_state so it is repeatable.
split_parts = train_test_split(
    X_final,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Show the shape of each partition as the cell's output
tuple(part.shape for part in split_parts)

((35751, 6003), (8938, 6003), (35751,), (8938,))

In [9]:
# Hyperparameter grids to explore, one per model family
param_grid_xgb = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.7, 0.8, 1.0],
)

param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

param_grid_gb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 1.0],
)

# Base estimators.
# NOTE(review): xgb is the only estimator created without an explicit
# random_state; confirm that relying on the library's default seed is intended.
xgb = XGBClassifier(eval_metric="logloss")
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Settings shared by all three searches: 3-fold CV scored on accuracy,
# four parallel workers, progress summary printed.
search_options = dict(cv=3, scoring="accuracy", n_jobs=4, verbose=1)

# One grid search per model (nothing is fitted in this cell)
grid_xgb = GridSearchCV(xgb, param_grid_xgb, **search_options)
grid_rf = GridSearchCV(rf, param_grid_rf, **search_options)
grid_gb = GridSearchCV(gb, param_grid_gb, **search_options)


In [10]:
# Fit each grid search on the training split, in order, and print its best
# cross-validated score and the parameter combination that achieved it.
searches = [
    ("XGBoost", grid_xgb),
    ("Random Forest", grid_rf),
    ("Gradient Boosting", grid_gb),
]

for position, (model_name, search) in enumerate(searches):
    # Every banner after the first is preceded by a blank line
    banner_prefix = "\n" if position else ""
    print(f"{banner_prefix}🔍 Optimizando {model_name}...")
    search.fit(X_train, y_train)
    print(f"✅ Mejor precisión {model_name}: {search.best_score_}")
    print(f"📌 Mejores parámetros {model_name}: {search.best_params_}")


🔍 Optimizando XGBoost...
Fitting 3 folds for each of 54 candidates, totalling 162 fits


✅ Mejor precisión XGBoost: 0.9977063578641158
📌 Mejores parámetros XGBoost: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}

🔍 Optimizando Random Forest...
Fitting 3 folds for each of 54 candidates, totalling 162 fits
