In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import glob
import os
import joblib


import sys
sys.path.append("/Users/lucija/Projects/Analisis-de-noticias/src")  # Ensure Python finds "utils"

# Import the class from text_processing.py
from utils.text_processing import NewsProcessor

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier

pd.set_option("display.max_colwidth", None)


In [2]:
folder_path = "../00.data/scraped"

csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")), key=os.path.getmtime)

df_list = [pd.read_csv(file) for file in csv_files]
meneame = pd.concat(df_list, ignore_index=True)

In [3]:
processor = NewsProcessor()

In [4]:
meneame = processor.categorize_news(meneame)

In [5]:
# buscando mas instancias de noticias de las categorias subrepresentadas

meneame.loc[(meneame['category']=='Otros') & (meneame['content'].str.contains('fútbol', case=False, na=False)), 'category'] = 'Deportes'
meneame.loc[(meneame['category']=='Otros') & (meneame['content'].str.contains('crim', case=False, na=False)) & (meneame['content'].str.contains('polic', case=False, na=False)), 'category'] = 'Crimen'
meneame.loc[(meneame['category']=='Otros') & (meneame['content'].str.contains('econom', case=False, na=False)) & (meneame['content'].str.contains('financ', case=False, na=False)), 'category'] = 'Negocios y Economía'
meneame.loc[(meneame['category']=='Otros') & (meneame['content'].str.contains('escuela', case=False, na=False)) & (meneame['content'].str.contains('currícu', case=False, na=False)), 'category'] = 'Educación'

In [6]:
meneame['category'].value_counts()

category
Otros                        135113
Política y Sociedad           95499
Entretenimiento y Cultura     40280
Tecnología y Ciencia          14537
Deportes                        632
Historia y Humanidades          294
Crimen                          254
Negocios y Economía             246
Humor y Memes                   246
Transporte                      221
Medioambiente y Energía         200
Salud y Medicina                137
Cuestiones Sociales              69
Educación                        27
Name: count, dtype: int64

In [7]:
nltk.download("stopwords")
spanish_stopwords = stopwords.words("spanish")

meneame = meneame.dropna(subset=["content"])

meneame_labeled = meneame[meneame['category'] != "Otros"]

label_encoder = LabelEncoder()
meneame_labeled["category_encoded"] = label_encoder.fit_transform(meneame_labeled["category"])

X = meneame_labeled["content"] 
y = meneame_labeled["category_encoded"]

vectorizer = TfidfVectorizer(stop_words=spanish_stopwords, max_features=50000)
X_tfidf = vectorizer.fit_transform(X)

print("Before Sampling:", Counter(y))

# combinacion de under y oversampling
undersampling_strategy = {9: 3000, 4: 3000, 11: 3000}
oversampling_strategy = {5: 1000, 6: 1000, 12: 1000, 7: 1000, 10: 800, 
                         0: 1000, 1: 800, 8: 1000, 3: 800, 2: 1000}  

undersampler = RandomUnderSampler(sampling_strategy=undersampling_strategy, random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_tfidf, y)

oversampler = SMOTE(sampling_strategy=oversampling_strategy, random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_resampled, y_resampled)

print("After Sampling:", Counter(y_resampled))

meneame_balanced = pd.DataFrame({"category": label_encoder.inverse_transform(y_resampled)})

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# usamos Random Forest classifier para entrenar el modelo
model = RandomForestClassifier(
    bootstrap=False, 
    max_depth=None, 
    max_features='log2', 
    min_samples_leaf=1, 
    min_samples_split=5, 
    n_estimators=300, 
    random_state=42  
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucija/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meneame_labeled["category_encoded"] = label_encoder.fit_transform(meneame_labeled["category"])


Before Sampling: Counter({9: 95499, 4: 40273, 11: 14536, 2: 632, 5: 294, 0: 254, 8: 246, 6: 240, 12: 221, 7: 200, 10: 137, 1: 69, 3: 27})
After Sampling: Counter({4: 3000, 9: 3000, 11: 3000, 0: 1000, 2: 1000, 5: 1000, 6: 1000, 7: 1000, 8: 1000, 12: 1000, 1: 800, 3: 800, 10: 800})
Model Accuracy: 0.8299
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       200
           1       1.00      0.98      0.99       160
           2       0.99      0.84      0.91       200
           3       1.00      1.00      1.00       160
           4       0.62      0.60      0.61       600
           5       1.00      0.91      0.95       200
           6       0.84      0.94      0.89       200
           7       1.00      0.98      0.99       200
           8       1.00      0.98      0.99       200
           9       0.70      0.75      0.72       600
          10       0.99      0.98      0.99       160
          11       0.72      0.76      0.74    

In [11]:
joblib.dump(model, "random_forest_model.pkl", compress=4)

joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

joblib.dump(label_encoder, "label_encoder.pkl")


['label_encoder.pkl']

In [None]:
import joblib
import os

def save_model_in_chunks(model, base_filename="random_forest_model", folder="model_chunks", chunk_size_mb=50):
    """
    Saves a large model in chunks smaller than chunk_size_mb MB.

    Parameters:
    - model: The trained model object (e.g., RandomForestClassifier).
    - base_filename: The base name for the output files.
    - folder: The directory to save model chunks.
    - chunk_size_mb: Maximum size (MB) per file.
    """
    # Ensure folder exists
    os.makedirs(folder, exist_ok=True)

    # Serialize the model into separate numpy arrays
    model_parts = joblib.dump(model, None, compress=4)  # Dump to memory first

    # Save each part separately
    for i, part in enumerate(model_parts):
        file_path = os.path.join(folder, f"{base_filename}_part{i}.pkl")
        joblib.dump(part, file_path)
        print(f"Saved: {file_path} ({os.path.getsize(file_path) / (1024 * 1024):.2f} MB)")

    print("\n✅ Model saved in multiple parts!")

# Example usage:
save_model_in_chunks(model)
