In [52]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import glob
import os
import joblib

import sys
sys.path.append("/Users/lucija/Projects/Analisis-de-noticias/src")  # Ensure Python finds "utils"

# Import the class from text_processing.py
from utils.text_processing import NewsProcessor


from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier

pd.set_option("display.max_colwidth", None)


In [53]:
folder_path = "../00.data/scraped"

csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")), key=os.path.getmtime)

df_list = [pd.read_csv(file) for file in csv_files]
meneame = pd.concat(df_list, ignore_index=True)


In [54]:
meneame = meneame.drop_duplicates(subset=["news_id"])

In [55]:
processor = NewsProcessor()

## Procesamiento de 'clicks'

In [56]:
meneame['clicks'] = meneame['clicks'].replace(0, np.nan)

## Procesamiento de 'comunidad' y 'provincia'

In [57]:
meneame_prov = processor.assign_province_and_community(meneame)


# Procesamiento de Categorias


In [58]:
meneame_prov_categ = processor.categorize_news(meneame_prov)

In [59]:
# buscando mas instancias de noticias de las categorias subrepresentadas

df.loc[(df['category']=='Otros') & (df['content'].str.contains('fútbol', case=False, na=False)), 'category'] = 'Deportes'
df.loc[(df['category']=='Otros') & (df['content'].str.contains('crim', case=False, na=False)) & (df['content'].str.contains('polic', case=False, na=False)), 'category'] = 'Crimen'
df.loc[(df['category']=='Otros') & (df['content'].str.contains('econom', case=False, na=False)) & (df['content'].str.contains('financ', case=False, na=False)), 'category'] = 'Negocios y Economía'
df.loc[(df['category']=='Otros') & (df['content'].str.contains('escuela', case=False, na=False)) & (df['content'].str.contains('currícu', case=False, na=False)), 'category'] = 'Educación'

### Usando modelo de ../03_01/RFC_categorias para predecir categorias

In [60]:
meneame = meneame.dropna(subset=["content"])

meneame_unlabeled = meneame[meneame['category'] == "Otros"]
meneame_labeled = meneame[meneame['category'] != "Otros"]

model = joblib.load("../03_01.ML/random_forest_model.pkl")
vectorizer = joblib.load("../03_01.ML/tfidf_vectorizer.pkl")
label_encoder = joblib.load("../03_01.ML/label_encoder.pkl")

In [61]:
X_unlabeled = vectorizer.transform(meneame_unlabeled["content"])

y_proba = model.predict_proba(X_unlabeled)

max_probs = np.max(y_proba, axis=1)

y_pred_indices = np.argmax(y_proba, axis=1)

y_pred_labels = label_encoder.inverse_transform(y_pred_indices)

#si modelo esta <30% cierto sobre la categorisacion, lo clasifica como "otros"
final_categories = np.where(max_probs >= 0.30, y_pred_labels, "Otros")

meneame_unlabeled["category"] = final_categories


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meneame_unlabeled["category"] = final_categories


In [45]:
meneame_final = pd.concat([meneame_labeled, meneame_unlabeled], ignore_index=True)

In [47]:
def change_type(df):
    df = df.astype({
        "meneos": "uint16",
        "karma": "uint16",
        "positive_votes": "uint16",
        "negative_votes": "uint16",
        "anonymous_votes": "uint16",
        "comments": "uint16",
        "category": "category",
        "provincia": "category",
        "comunidad": "category"
    })

    df["clicks"] = df["clicks"].astype("float32")

    df["published_date"] = pd.to_datetime(df["published_date"], errors="coerce")
    df["scraped_date"] = pd.to_datetime(df["scraped_date"], errors="coerce")

    return df

meneame_final = change_type(meneame_final)

meneame_final.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287588 entries, 0 to 287587
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   news_id          287588 non-null  int64         
 1   title            287588 non-null  object        
 2   content          287588 non-null  object        
 3   full_story_link  287588 non-null  object        
 4   meneos           287588 non-null  uint16        
 5   clicks           206923 non-null  float32       
 6   karma            287588 non-null  uint16        
 7   positive_votes   287588 non-null  uint16        
 8   anonymous_votes  287588 non-null  uint16        
 9   negative_votes   287588 non-null  uint16        
 10  category         287588 non-null  category      
 11  comments         287588 non-null  uint16        
 12  published_date   287588 non-null  datetime64[ns]
 13  user             287588 non-null  object        
 14  source           287

In [51]:
chunk_size = 100000
num_chunks = (len(meneame_final) // chunk_size) + 1

for i in range(num_chunks):
    start_row = i * chunk_size
    end_row = start_row + chunk_size
    meneame_chunk = meneame_final.iloc[start_row:end_row]

    file_name = f"../00.data/preprocesado/meneame_procesado_{i+1}.pkl"
    with open(file_name, "wb") as f:
        pickle.dump(meneame_chunk, f, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Guardado: {file_name} con {len(meneame_chunk)} filas")

Guardado: ../00.data/preprocesado/meneame_procesado_1.pkl con 100000 filas
Guardado: ../00.data/preprocesado/meneame_procesado_2.pkl con 100000 filas
Guardado: ../00.data/preprocesado/meneame_procesado_3.pkl con 87588 filas
