In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import glob
import joblib
import sys
from dotenv import load_dotenv
import os
import sys

# Pull settings from a local .env file into the process environment
# (load_dotenv comes from the `dotenv` package imported above).
load_dotenv()

# The project's `src` directory is expected under the home folder of the
# account named by the "user" environment variable; add it to the import path
# so the `utils` package imported below can be resolved.
# NOTE(review): if "user" is unset the path contains the literal "None" and the
# import below will most likely fail; the absolute /Users/... layout also ties
# this notebook to one machine layout.
user = os.environ.get("user")
project_src = f"/Users/{user}/Projects/Analisis-de-noticias/src"
sys.path.append(project_src)

from utils.text_processing import NewsProcessor

# Do not truncate long text columns when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)


In [2]:
folder_path = "../00.data/scraped"

# Gather every scraped CSV in the folder, ordered from oldest to newest
# modification time.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")), key=os.path.getmtime)

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty
# list; fail early with a message that names the folder instead.
if not csv_files:
    raise FileNotFoundError(f"No se encontraron archivos CSV en {folder_path!r}")

# Read each file and stack them into a single frame with a fresh 0..n-1 index.
df_list = [pd.read_csv(file) for file in csv_files]
meneame = pd.concat(df_list, ignore_index=True)


In [3]:
# Collapse repeated scrapes of the same story: only the first row for each
# news_id survives. The source files were concatenated in modification-time
# order, so "first" means the copy coming from the oldest file.
# NOTE(review): confirm that keeping the oldest scrape (not the latest) is intended.
meneame = meneame.drop_duplicates(subset=["news_id"], keep="first")

In [4]:
# Helper from utils.text_processing; the cells below call its
# assign_province_and_community, categorize_news and change_type methods.
processor = NewsProcessor()

## Procesamiento de 'clicks'

In [6]:
# Turn click counts of exactly 0 into NaN so they are handled as missing values.
# NOTE(review): presumably 0 means "not recorded" in the scrape — confirm.
clicks_with_gaps = meneame['clicks'].replace(to_replace=0, value=np.nan)
meneame['clicks'] = clicks_with_gaps

## Procesamiento de 'comunidad' y 'provincia'

In [7]:
# Delegates to NewsProcessor.assign_province_and_community (defined outside
# this notebook). Presumably adds province/community columns, as the section
# heading suggests — TODO confirm against utils.text_processing.
meneame_prov = processor.assign_province_and_community(meneame)


# Procesamiento de Categorías


In [8]:
# Delegates to NewsProcessor.categorize_news (defined outside this notebook).
# The following cells read a 'category' column from the result and treat the
# value "Otros" as "not yet categorized".
meneame_prov_categ = processor.categorize_news(meneame_prov)

### Usando el modelo de ../03_01.ML/RFC_categorias para predecir categorías

In [None]:

# Split on the category produced by categorize_news: rows tagged "Otros" are
# handed to the classifier, every other row keeps its existing label.
is_otros = meneame_prov_categ['category'] == "Otros"
meneame_unlabeled = meneame_prov_categ[is_otros]
meneame_labeled = meneame_prov_categ[~is_otros]

# Load the persisted classifier, its text vectorizer and the label encoder.
# NOTE: joblib.load unpickles arbitrary objects — only load trusted artifacts.
model, vectorizer, label_encoder = (
    joblib.load(artifact_path)
    for artifact_path in (
        "../03_01.ML/random_forest_model.pkl",
        "../03_01.ML/tfidf_vectorizer.pkl",
        "../03_01.ML/label_encoder.pkl",
    )
)

In [23]:
# Discard uncategorized rows that have no content, presumably because the
# vectorizer in the next cell needs text to work on.
# NOTE(review): these rows are not added back later, so they are absent from
# meneame_final as well.
meneame_unlabeled = meneame_unlabeled.dropna(axis="index", subset=["content"])


In [24]:
# Minimum top-class probability required to accept a predicted category.
MIN_CATEGORY_CONFIDENCE = 0.30

# Vectorize the article text with the loaded vectorizer and get the
# per-class probabilities from the loaded model.
X_unlabeled = vectorizer.transform(meneame_unlabeled["content"])
y_proba = model.predict_proba(X_unlabeled)

# Most likely class for each row and the probability assigned to it.
# NOTE(review): assumes the column order of predict_proba matches the label
# encoder's integer codes — confirm against the training notebook.
max_probs = np.max(y_proba, axis=1)
y_pred_indices = np.argmax(y_proba, axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred_indices)

# If the model is less than 30% sure about the category, keep the row as "Otros".
final_categories = np.where(max_probs >= MIN_CATEGORY_CONFIDENCE, y_pred_labels, "Otros")

# meneame_unlabeled is derived from a boolean filter of another frame, so
# writing a column into it in place can raise SettingWithCopyWarning;
# assign() builds a new frame with the replaced column instead.
meneame_unlabeled = meneame_unlabeled.assign(category=final_categories)


In [25]:
# Recombine the rows that already had a category with the newly classified
# ones; the index is rebuilt from 0.
meneame_final = pd.concat(
    objs=[meneame_labeled, meneame_unlabeled],
    axis=0,
    ignore_index=True,
)

In [None]:
# Delegates to NewsProcessor.change_type (defined outside this notebook);
# presumably converts column dtypes — TODO confirm against utils.text_processing.
meneame_final = processor.change_type(meneame_final)

# Print column dtypes and non-null counts as a sanity check.
meneame_final.info()


<class 'pandas.core.frame.DataFrame'>
Index: 287730 entries, 0 to 287879
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   news_id          287730 non-null  int64         
 1   title            287730 non-null  object        
 2   content          287713 non-null  object        
 3   full_story_link  287730 non-null  object        
 4   meneos           287730 non-null  uint16        
 5   clicks           207065 non-null  float32       
 6   karma            287730 non-null  uint16        
 7   positive_votes   287730 non-null  uint16        
 8   anonymous_votes  287730 non-null  uint16        
 9   negative_votes   287730 non-null  uint16        
 10  category         287730 non-null  category      
 11  comments         287730 non-null  uint16        
 12  published_date   287730 non-null  datetime64[ns]
 13  user             287730 non-null  object        
 14  source           287730 n

In [27]:
# Maximum number of rows stored in each pickle file.
chunk_size = 100000

# Ceiling division: a plain `len // chunk_size + 1` would produce an extra,
# empty file whenever the row count is an exact multiple of chunk_size.
# max(1, ...) still writes one (empty) file when the frame has no rows.
num_chunks = max(1, -(-len(meneame_final) // chunk_size))

for i in range(num_chunks):
    # iloc clips the slice at the end of the frame, so the last chunk is
    # simply shorter than chunk_size.
    start_row = i * chunk_size
    end_row = start_row + chunk_size
    meneame_chunk = meneame_final.iloc[start_row:end_row]

    # Output files are numbered starting at 1.
    file_name = f"../00.data/preprocesado/meneame_procesado_{i+1}.pkl"
    with open(file_name, "wb") as f:
        pickle.dump(meneame_chunk, f, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Guardado: {file_name} con {len(meneame_chunk)} filas")

Guardado: ../00.data/preprocesado/meneame_procesado_1.pkl con 100000 filas
Guardado: ../00.data/preprocesado/meneame_procesado_2.pkl con 100000 filas
Guardado: ../00.data/preprocesado/meneame_procesado_3.pkl con 87602 filas
