In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import glob
import os
import joblib
import sys

# Do not truncate long text columns when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)


In [5]:
from dotenv import load_dotenv
import os
import sys
import warnings

# Pull variables from a local .env file into the process environment.
load_dotenv()

# `user` names the account under /Users that holds the project checkout.
user = os.getenv("user")
if user:
    src_path = f"/Users/{user}/Projects/Analisis-de-noticias/src"
    # Re-running this cell must not stack duplicate entries on sys.path.
    if src_path not in sys.path:
        sys.path.append(src_path)
else:
    # Without this guard the path would silently become "/Users/None/..." and the
    # import below would fail with a ModuleNotFoundError that hides the real cause.
    warnings.warn(
        "Environment variable 'user' is not set (checked the environment and .env); "
        "the project src directory was not added to sys.path.",
        stacklevel=2,
    )

from utils.text_processing import NewsProcessor


In [6]:
folder_path = "../00.data/scraped"

# Order the files by modification time (oldest first) so that, after concatenation,
# rows from earlier-modified files come before rows from later ones.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")), key=os.path.getmtime)

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# fail early and name the folder that was searched instead.
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in {os.path.abspath(folder_path)!r}"
    )

# Read every file and stack them into one frame with a fresh 0..n-1 index.
df_list = [pd.read_csv(file) for file in csv_files]
meneame = pd.concat(df_list, ignore_index=True)


In [7]:
# Keep one row per news_id: the first occurrence in concatenation order wins.
meneame = meneame.drop_duplicates(subset=["news_id"], keep="first")

In [8]:
# Shared NewsProcessor instance; the cells below call its
# assign_province_and_community and categorize_news methods.
processor = NewsProcessor()

## Procesamiento de 'clicks'

In [9]:
# Replace click counts of exactly 0 with NaN so they are handled as missing values.
# NOTE(review): presumably 0 means "clicks not recorded" rather than a real zero — confirm.
meneame['clicks'] = meneame['clicks'].replace(0, np.nan)

## Procesamiento de 'comunidad' y 'provincia'

In [10]:
# Run the processor's province/community assignment on the de-duplicated frame.
# NOTE(review): presumably this adds the 'provincia' and 'comunidad' columns that
# change_type casts later — confirm in utils.text_processing.
meneame_prov = processor.assign_province_and_community(meneame)


# Procesamiento de categorías


In [11]:
# Run the processor's categorisation step; the result exposes a 'category'
# column, which the next cell inspects.
meneame_prov_categ = processor.categorize_news(meneame_prov)

In [15]:
# Number of news items per category. Rows counted under "Otros" are the ones
# handed to the classifier further down.
meneame_prov_categ['category'].value_counts()

category
Otros                        135107
Política y Sociedad           95417
Entretenimiento y Cultura     40231
Tecnología y Ciencia          14524
Deportes                        632
Historia y Humanidades          294
Crimen                          254
Negocios y Economía             246
Humor y Memes                   246
Transporte                      221
Medioambiente y Energía         200
Salud y Medicina                137
Cuestiones Sociales              69
Educación                        27
Name: count, dtype: int64

### Usando el modelo de ../03_01/RFC_categorias para predecir categorías

In [None]:

# Rows tagged "Otros" are treated as unlabeled; all other rows keep their category.
is_otros = meneame_prov_categ['category'] == "Otros"
meneame_unlabeled = meneame_prov_categ[is_otros]
meneame_labeled = meneame_prov_categ[~is_otros]

# Load the fitted classifier, text vectorizer and label encoder from ../03_01.ML.
# joblib.load unpickles its input, so only load artifact files you trust.
model, vectorizer, label_encoder = (
    joblib.load("../03_01.ML/random_forest_model.pkl"),
    joblib.load("../03_01.ML/tfidf_vectorizer.pkl"),
    joblib.load("../03_01.ML/label_encoder.pkl"),
)

In [23]:
# Drop unlabeled rows whose 'content' is missing before they reach the vectorizer.
# NOTE(review): these rows are not added back before the final concat, so they are
# absent from meneame_final — confirm that this is intended.
meneame_unlabeled = meneame_unlabeled.dropna(subset=["content"])


In [24]:
# Minimum top-class probability required to accept the model's prediction.
MIN_CONFIDENCE = 0.30

# Vectorize the text of the unlabeled rows with the previously fitted vectorizer.
X_unlabeled = vectorizer.transform(meneame_unlabeled["content"])

# Per-row class probabilities.
y_proba = model.predict_proba(X_unlabeled)

# Confidence of the most likely class for every row.
max_probs = np.max(y_proba, axis=1)

# Column position of the most likely class, mapped back to its label.
# NOTE(review): assumes column i of predict_proba corresponds to encoded label i —
# confirm against model.classes_.
y_pred_indices = np.argmax(y_proba, axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred_indices)

# If the model is less than 30% confident about a row, the row stays "Otros".
final_categories = np.where(max_probs >= MIN_CONFIDENCE, y_pred_labels, "Otros")

# .assign builds a new frame instead of writing into one derived from a slice of
# meneame_prov_categ, which avoids pandas' SettingWithCopyWarning.
meneame_unlabeled = meneame_unlabeled.assign(category=final_categories)


In [25]:
# Stack the originally labeled rows and the model-labeled rows into a single
# frame with a fresh 0..n-1 index.
meneame_final = pd.concat([meneame_labeled, meneame_unlabeled], ignore_index=True)

In [26]:
def change_type(df):
    """Return a copy of ``df`` with compact dtypes for storage and analysis.

    Conversions applied:
      * meneos, karma, positive_votes, negative_votes, anonymous_votes,
        comments -> uint16
      * category, provincia, comunidad -> category
      * clicks -> float32 (a float type, so NaN values are preserved)
      * published_date, scraped_date -> datetime; values that cannot be parsed
        become NaT (``errors="coerce"``)

    The caller's frame is not modified: ``astype`` returns a new frame and the
    remaining columns are assigned on that new frame.

    Raises:
        ValueError: if a numeric counter column holds a value outside the
            uint16 range 0..65535. A plain ``astype("uint16")`` would silently
            wrap such values around instead of failing.
        KeyError: if one of the expected columns is missing.
    """
    counter_columns = [
        "meneos",
        "karma",
        "positive_votes",
        "negative_votes",
        "anonymous_votes",
        "comments",
    ]
    categorical_columns = ["category", "provincia", "comunidad"]

    # Integer-to-integer astype does not check bounds, so verify the range first.
    uint16_max = np.iinfo("uint16").max
    for column in counter_columns:
        values = df[column]
        # Non-numeric columns (e.g. digit strings) are left for astype to handle.
        if not pd.api.types.is_numeric_dtype(values):
            continue
        lowest, highest = values.min(), values.max()
        # Comparisons with NaN (empty or all-missing column) are False, so such
        # columns pass through here and are left for astype to handle.
        if lowest < 0 or highest > uint16_max:
            raise ValueError(
                f"Column '{column}' has values in [{lowest}, {highest}], "
                f"which do not fit in uint16 (0..{uint16_max})"
            )

    dtype_map = {column: "uint16" for column in counter_columns}
    dtype_map.update({column: "category" for column in categorical_columns})
    df = df.astype(dtype_map)

    df["clicks"] = df["clicks"].astype("float32")

    df["published_date"] = pd.to_datetime(df["published_date"], errors="coerce")
    df["scraped_date"] = pd.to_datetime(df["scraped_date"], errors="coerce")

    return df

# Apply the dtype conversions to the combined frame.
meneame_final = change_type(meneame_final)

# Show column dtypes and non-null counts to check the result of the conversion.
meneame_final.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287602 entries, 0 to 287601
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   news_id          287602 non-null  int64         
 1   title            287602 non-null  object        
 2   content          287588 non-null  object        
 3   full_story_link  287602 non-null  object        
 4   meneos           287602 non-null  uint16        
 5   clicks           206937 non-null  float32       
 6   karma            287602 non-null  uint16        
 7   positive_votes   287602 non-null  uint16        
 8   anonymous_votes  287602 non-null  uint16        
 9   negative_votes   287602 non-null  uint16        
 10  category         287602 non-null  category      
 11  comments         287602 non-null  uint16        
 12  published_date   287602 non-null  datetime64[ns]
 13  user             287602 non-null  object        
 14  source           287

In [27]:
chunk_size = 100000
# Ceiling division: `len // chunk_size + 1` would produce a trailing empty chunk
# (and an empty file) whenever the row count is an exact multiple of chunk_size,
# and one empty file for an empty frame.
num_chunks = (len(meneame_final) + chunk_size - 1) // chunk_size

for i in range(num_chunks):
    # Row window of this chunk; iloc clips end_row at the end of the frame.
    start_row = i * chunk_size
    end_row = start_row + chunk_size
    meneame_chunk = meneame_final.iloc[start_row:end_row]

    # Files are numbered from 1.
    file_name = f"../00.data/preprocesado/meneame_procesado_{i+1}.pkl"
    with open(file_name, "wb") as f:
        pickle.dump(meneame_chunk, f, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Guardado: {file_name} con {len(meneame_chunk)} filas")

Guardado: ../00.data/preprocesado/meneame_procesado_1.pkl con 100000 filas
Guardado: ../00.data/preprocesado/meneame_procesado_2.pkl con 100000 filas
Guardado: ../00.data/preprocesado/meneame_procesado_3.pkl con 87602 filas
