In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Read the labelled headlines from 'categories.csv' into `df`; the cells below
# read its 'titulo', 'subtitular' and 'categoria' columns.
df = pd.read_csv(filepath_or_buffer='categories.csv')
# Preview the first five rows as the cell's displayed output.
df.head(n=5)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,id,titulo,subtitular,categoria
0,010222nacional,celebracion lo lleva a la muerte,"ya no llego a darles el abrazo a sus papas, en...",Comunidades
1,010322nacional,campeon historico,los toros suman su primer titulo de liga nacional,Deporte
2,010322occidente,invierten en obras,cuatro comunidades de san pedro sacatepequez s...,Comunidades
3,010322centro,abarrotan balneario,los aposentos recibe a cientos de chimaltecos ...,Comunidades
4,010322sur,caen en hondonada,percance vial en santa maria el naranjo coatep...,Accidentes Viales


In [None]:
# Load the BETO tokenizer and a BERT sequence-classification model whose
# output layer has one label per distinct value found in df['categoria']
# (missing values, if any, count as their own label, exactly like unique()).
# NOTE(review): the checkpoint name looks like the base Spanish BERT model, so
# the classification head is presumably newly initialised; no fine-tuning step
# is visible in these cells — confirm before trusting predictions.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
model = BertForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-uncased',
    num_labels=df['categoria'].nunique(dropna=False),
)

# Preprocess the title and subtitle into the format the model expects
def preprocess_text(row):
    """Tokenize one row's headline and sub-headline as a single text.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) that can be indexed
            with 'titulo' and 'subtitular'.

    Returns:
        Whatever the module-level `tokenizer` returns for the combined text
        when asked for PyTorch tensors ("pt"), truncated to at most 512 tokens.
    """
    pieces = []
    for column in ('titulo', 'subtitular'):
        value = row[column]
        # Missing values (NaN/None) become an empty string instead of 'nan'.
        pieces.append('' if pd.isnull(value) else str(value))

    # Title and subtitle are always separated by exactly one space, even when
    # one of them is empty.
    combined = " ".join(pieces)
    return tokenizer(
        combined,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

In [None]:
# Apply preprocessing to your data: run preprocess_text on every row (axis=1)
# and keep each row's tokenizer output in a new 'inputs' column.
# NOTE(review): this mutates df in place, and none of the cells visible here
# reads df['inputs'] afterwards (classify_newspaper tokenizes its own input) —
# confirm whether a later training step is meant to consume it.
df['inputs'] = df.apply(preprocess_text, axis=1)

def classify_newspaper(text):
    """Predict the category label for a piece of headline text.

    Args:
        text: The text to classify (e.g. a title and subtitle joined by a
            space); it is passed straight to the module-level `tokenizer`.

    Returns:
        The entry of df['categoria'].unique() whose position equals the index
        of the highest logit produced by the module-level `model`.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: no gradients are needed for a forward pass.
    with torch.no_grad():
        scores = model(**encoded).logits

    # .item() requires exactly one prediction, i.e. a single input text.
    best_index = scores.argmax(dim=1).item()

    # Labels are looked up by order of first appearance in the 'categoria' column.
    categories = df['categoria'].unique()
    return categories[best_index]

In [None]:
# Smoke-test the classifier on a made-up headline: build the input the same
# way as the training rows (title, one space, subtitle) and print the label.
titulo = "Carro choca en media via"
subtitulo = "15 personas mueren en golpe de autobus luego de pasar por la carretera las fuentes"
new_text = f"{titulo} {subtitulo}"
print(classify_newspaper(new_text))