In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import glob
import os

import sys
sys.path.append("../src")
from utils import text_processing
from text_processing import asignar_provincia_comunidad, categorize_news

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier

pd.set_option("display.max_colwidth", None)


In [11]:
folder_path = "../00.data/scraped"

csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")), key=os.path.getmtime)

df_list = [pd.read_csv(file) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)


In [12]:
df = df.drop_duplicates(subset=["news_id"])

## Procesamiento de 'comunidad' y 'provincia'

In [13]:
asignar_provincia_comunidad(df)

# Procesamiento de Categorias


In [14]:
categorize_news(df)

In [79]:
# buscando mas instancias de noticias de las categorias subrepresentadas

df.loc[(df['category']=='Otros') & (df['content'].str.contains('fútbol', case=False, na=False)), 'category'] = 'Deportes'
df.loc[(df['category']=='Otros') & (df['content'].str.contains('crim', case=False, na=False)) & (df['content'].str.contains('polic', case=False, na=False)), 'category'] = 'Crimen'
df.loc[(df['category']=='Otros') & (df['content'].str.contains('econom', case=False, na=False)) & (df['content'].str.contains('financ', case=False, na=False)), 'category'] = 'Negocios y Economía'
df.loc[(df['category']=='Otros') & (df['content'].str.contains('escuela', case=False, na=False)) & (df['content'].str.contains('currícu', case=False, na=False)), 'category'] = 'Educación'

### categorisando "otros" usando ML

In [81]:
nltk.download("stopwords")
spanish_stopwords = stopwords.words("spanish")

df = df.dropna(subset=["content"])

df_unlabeled = df[df['category'] == "Otros"]
df_labeled = df[df['category'] != "Otros"]

label_encoder = LabelEncoder()
df_labeled["category_encoded"] = label_encoder.fit_transform(df_labeled["category"])

X = df_labeled["content"] 
y = df_labeled["category_encoded"]

vectorizer = TfidfVectorizer(stop_words=spanish_stopwords, max_features=50000)
X_tfidf = vectorizer.fit_transform(X)

print("Before Sampling:", Counter(y))

# combinacion de under y oversampling
undersampling_strategy = {9: 3000, 4: 3000, 11: 3000}
oversampling_strategy = {5: 1000, 6: 1000, 12: 1000, 7: 1000, 10: 800, 
                         0: 1000, 1: 800, 8: 1000, 3: 800, 2: 1000}  

undersampler = RandomUnderSampler(sampling_strategy=undersampling_strategy, random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_tfidf, y)

oversampler = SMOTE(sampling_strategy=oversampling_strategy, random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_resampled, y_resampled)

print("After Sampling:", Counter(y_resampled))

df_balanced = pd.DataFrame({"category": label_encoder.inverse_transform(y_resampled)})

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# usamos Random Forest classifier para entrenar el modelo
model = RandomForestClassifier(
    bootstrap=False, 
    max_depth=None, 
    max_features='log2', 
    min_samples_leaf=1, 
    min_samples_split=5, 
    n_estimators=300, 
    random_state=42  
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucija/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labeled["category_encoded"] = label_encoder.fit_transform(df_labeled["category"])


Before Sampling: Counter({9: 95400, 4: 40217, 11: 14522, 2: 632, 5: 294, 0: 254, 8: 246, 6: 240, 12: 221, 7: 200, 10: 137, 1: 69, 3: 27})
After Sampling: Counter({4: 3000, 9: 3000, 11: 3000, 0: 1000, 2: 1000, 5: 1000, 6: 1000, 7: 1000, 8: 1000, 12: 1000, 1: 800, 3: 800, 10: 800})
Model Accuracy: 0.8318
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       200
           1       1.00      0.98      0.99       160
           2       0.98      0.84      0.91       200
           3       1.00      1.00      1.00       160
           4       0.63      0.62      0.63       600
           5       0.99      0.92      0.96       200
           6       0.86      0.94      0.90       200
           7       1.00      0.99      0.99       200
           8       0.99      0.98      0.98       200
           9       0.71      0.76      0.74       600
          10       0.99      0.97      0.98       160
          11       0.71      0.74      0.72    

In [82]:
X_unlabeled = vectorizer.transform(df_unlabeled["content"])

y_proba = model.predict_proba(X_unlabeled)

max_probs = np.max(y_proba, axis=1)

y_pred_indices = np.argmax(y_proba, axis=1)

y_pred_labels = label_encoder.inverse_transform(y_pred_indices)

#si modelo esta <30% cierto sobre la categorisacion, lo clasifica como "otros"
final_categories = np.where(max_probs >= 0.30, y_pred_labels, "Otros")

df_unlabeled["category"] = final_categories


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unlabeled["category"] = final_categories


In [83]:
df_final = pd.concat([df_labeled, df_unlabeled], ignore_index=True)

In [84]:
df_final['category'].value_counts()

category
Política y Sociedad          143466
Entretenimiento y Cultura     65832
Tecnología y Ciencia          40355
Otros                         32472
Humor y Memes                  3023
Deportes                        890
Negocios y Economía             306
Historia y Humanidades          306
Crimen                          256
Transporte                      222
Medioambiente y Energía         200
Salud y Medicina                139
Cuestiones Sociales              69
Educación                        27
Name: count, dtype: int64

In [None]:
def change_type(df):
    df = df.astype({
        "meneos": "uint16",
        "karma": "uint16",
        "positive_votes": "uint16",
        "negative_votes": "uint16",
        "anonymous_votes": "uint16",
        "comments": "uint16",
        "clicks": "int32",
        "category": "category",
        "provincia": "category",
        "comunidad": "category"
    })

    df["published_date"] = pd.to_datetime(df["published_date"], errors="coerce")
    df["scraped_date"] = pd.to_datetime(df["scraped_date"], errors="coerce")

    return df


In [None]:
chunk_size = 100000
num_chunks = (len(df_final) // chunk_size) + 1

for i in range(num_chunks):
    start_row = i * chunk_size
    end_row = start_row + chunk_size
    df_chunk = df_final.iloc[start_row:end_row]

    file_name = f"../00.data/preprocesado/meneame_procesado_{i+1}.pkl"
    with open(file_name, "wb") as f:
        pickle.dump(df_chunk, f, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Guardado: {file_name} con {len(df_chunk)} filas")

In [None]:
#with gzip.open("../00.data/preprocesado/meneame_procesado.pkl.gz", "wb") as f:
#    pickle.dump(df_final, f, protocol=pickle.HIGHEST_PROTOCOL)
