In [33]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [34]:
# Check whether PyTorch can see a CUDA device; the cell displays the boolean result.
torch.cuda.is_available()

True

In [35]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [36]:
# Load the news dataset from a CSV file located in the working directory.
# NOTE(review): the rendered frame has an "Unnamed: 0" column, which usually means
# the CSV was written together with its index -- consider index_col=0; confirm.
df = pd.read_csv('dataset_final_actualizacion_mayus2.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Periódico,Hipervínculo,Fecha publicación,Titular,Subtítulo,Cuerpo,Categoría,Target
0,ElDiario.es,https://www.eldiario.es/politica/pp-recupera-l...,2023-07-11,El PP se recupera ligeramente pero sigue sin o...,La estimación de escaños no se mueve en la enc...,El partido popular se recupera muy ligeramente...,politica,0
1,ElDiario.es,https://www.eldiario.es/politica/58-ciudadanos...,2023-07-11,Un 58% de los ciudadanos critican los pactos c...,"El 17,8% de quienes apuestan por Feijóo ven “m...",El PP ha necesitado a Vox para recuperar buena...,politica,0
2,ElDiario.es,https://www.eldiario.es/politica/diputado-ayus...,2023-07-11,Un diputado de Ayuso reúne a un grupo de vícti...,Daniel Portero trata con urgencia de contrarre...,El diputado del PP en la Asamblea de Madrid Da...,politica,0
3,ElDiario.es,https://www.eldiario.es/andalucia/almeria/psoe...,2023-07-11,El PSOE de Mójacar recurre ante el Constitucio...,"Ya con nuevo alcalde del PP, los socialistas r...",Mojácar retomó el ritmo político con la procla...,andalucia,0
4,ElDiario.es,https://www.eldiario.es/castilla-la-mancha/pol...,2023-07-11,La exalcaldesa de Toledo pide a Feijóo que fir...,"La socialista, también número dos al Congreso,...",La exalcaldesa de Toledo y 'número dos' de la ...,castilla-la-mancha,0
...,...,...,...,...,...,...,...,...
4974,HayNoticia.es,https://haynoticia.es/una-empresa-catalana-tri...,2018-11-06,Una empresa catalana triunfa con su papel higi...,,La empresa catalana Banys Nous de Barcelona es...,curiosidades,1
4975,HayNoticia.es,https://haynoticia.es/hospitalizado-tras-inten...,2018-10-31,Hospitalizado tras intentar sacarse una muela ...,,Sucedió el pasado lunes poco después de las 20...,curiosidades,1
4976,HayNoticia.es,https://haynoticia.es/lleva-300-bolsas-de-plas...,2018-10-30,Lleva 300 bolsas de plástico a Mercadona para ...,,Un joven ha sido noticia ayer tras presentarse...,curiosidades,1
4977,HayNoticia.es,https://haynoticia.es/la-dgt-multara-a-los-con...,2018-10-28,La DGT multará a los conductores que no hayan ...,,Este domingo nos ha tocado hacer por segunda v...,curiosidades,1


In [37]:
# Build the model input by joining headline and body with a single space.
# Missing values are replaced with "" first: in pandas, str + NaN yields NaN, so a
# row lacking either field would end up with a float NaN in "Texto" and the
# tokenizer (which expects strings) would fail on it later.
df["Texto"] = df["Titular"].fillna("") + " " + df["Cuerpo"].fillna("")
df.head()

Unnamed: 0,Periódico,Hipervínculo,Fecha publicación,Titular,Subtítulo,Cuerpo,Categoría,Target,Texto
0,ElDiario.es,https://www.eldiario.es/politica/pp-recupera-l...,2023-07-11,El PP se recupera ligeramente pero sigue sin o...,La estimación de escaños no se mueve en la enc...,El partido popular se recupera muy ligeramente...,politica,0,El PP se recupera ligeramente pero sigue sin o...
1,ElDiario.es,https://www.eldiario.es/politica/58-ciudadanos...,2023-07-11,Un 58% de los ciudadanos critican los pactos c...,"El 17,8% de quienes apuestan por Feijóo ven “m...",El PP ha necesitado a Vox para recuperar buena...,politica,0,Un 58% de los ciudadanos critican los pactos c...
2,ElDiario.es,https://www.eldiario.es/politica/diputado-ayus...,2023-07-11,Un diputado de Ayuso reúne a un grupo de vícti...,Daniel Portero trata con urgencia de contrarre...,El diputado del PP en la Asamblea de Madrid Da...,politica,0,Un diputado de Ayuso reúne a un grupo de vícti...
3,ElDiario.es,https://www.eldiario.es/andalucia/almeria/psoe...,2023-07-11,El PSOE de Mójacar recurre ante el Constitucio...,"Ya con nuevo alcalde del PP, los socialistas r...",Mojácar retomó el ritmo político con la procla...,andalucia,0,El PSOE de Mójacar recurre ante el Constitucio...
4,ElDiario.es,https://www.eldiario.es/castilla-la-mancha/pol...,2023-07-11,La exalcaldesa de Toledo pide a Feijóo que fir...,"La socialista, también número dos al Congreso,...",La exalcaldesa de Toledo y 'número dos' de la ...,castilla-la-mancha,0,La exalcaldesa de Toledo pide a Feijóo que fir...


In [38]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split on the target: 25% of the rows go to the test set.
# The seed is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df["Texto"].tolist(),
    df["Target"].tolist(),
    stratify=df["Target"],
    test_size=0.25,
    random_state=42,
)

In [39]:
# Load the tokenizer of the same checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained('dccuchile/albert-tiny-spanish')

# Author's note: NOW THE MODEL DOES DISTINGUISH UPPERCASE FROM LOWERCASE

Downloading (…)okenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [40]:
# Tokenize both splits with identical settings: truncate over-long texts and pad
# every sequence of a split to that split's longest sequence.
train_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (X_train, X_test)
)

In [41]:
# Custom PyTorch dataset wrapping the tokenizer output and the labels
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized texts with their labels.

    Args:
        encodings: mapping that provides 'input_ids' and 'attention_mask',
            each indexable by example position.
        labels: sequence of labels, aligned with ``encodings``.

    Each item is a dict of tensors with the keys 'input_ids',
    'attention_mask' and 'labels'.
    """

    # Only these encoding fields are exposed to the model.
    _FEATURE_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is driven by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(self.encodings[key][idx])
            for key in self._FEATURE_KEYS
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [42]:
train_dataset = MyDataset(train_encodings, y_train)
test_dataset = MyDataset(test_encodings, y_test)


In [43]:
def compute_metrics(pred):
    """Compute accuracy from an evaluation result.

    Args:
        pred: a pair that unpacks into ``(logits, labels)``; ``logits`` holds
            one row of class scores per example.

    Returns:
        dict with a single key, "accuracy": the fraction of examples whose
        highest-scoring class equals the label.
    """
    logits, labels = pred
    predicted_classes = np.argmax(logits, axis=1)
    hits = predicted_classes == labels
    return {"accuracy": hits.mean()}


In [65]:
# Training configuration: 4 epochs, batches of 8 for training and 16 for
# evaluation, with an evaluation pass at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
)

In [66]:
# Release GPU memory cached by PyTorch before the model is loaded in the next cell.
torch.cuda.empty_cache()

In [67]:
# Load the pretrained checkpoint with a 2-class classification head, then move
# it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    'dccuchile/albert-tiny-spanish',
    num_labels=2,
)
model = model.to(device)

Some weights of the model checkpoint at dccuchile/albert-tiny-spanish were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.bias', 'sop_classifier.classifier.bias', 'predictions.LayerNorm.weight', 'predictions.dense.bias', 'sop_classifier.classifier.weight', 'predictions.dense.weight', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.decoder.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at dccuc

In [68]:
# Create the trainer; training itself is launched by calling trainer.train().
# The held-out test split doubles as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [16]:
# NOTE(review): bare module reference -- it only displays the module repr and
# configures nothing; looks like leftover exploration that can be removed.
torch.cuda.amp

<module 'torch.cuda.amp' from 'C:\\Users\\Usuario\\anaconda3\\lib\\site-packages\\torch\\cuda\\amp\\__init__.py'>

In [69]:
# Fine-tune with the settings of the TrainingArguments cell in this notebook.
trainer.train() # batch size = 8, epochs = 4



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.093696,0.980723
2,0.139100,0.072183,0.985542
3,0.045800,0.047083,0.989558
4,0.019000,0.042895,0.989558


TrainOutput(global_step=1868, training_loss=0.05624395889694737, metrics={'train_runtime': 258.2766, 'train_samples_per_second': 57.829, 'train_steps_per_second': 7.233, 'total_flos': 60151750410240.0, 'train_loss': 0.05624395889694737, 'epoch': 4.0})

In [63]:
# NOTE(review): the output below was kept from a run with other settings
# (batch size 16, 3 epochs) than the TrainingArguments cell of this notebook
# (batch size 8, 4 epochs). Re-running this cell as-is just calls train() again
# on the same trainer object.
trainer.train() # batch size = 16, epochs = 3



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.097129,0.971888
2,No log,0.074418,0.982329
3,0.103900,0.069552,0.982329


TrainOutput(global_step=702, training_loss=0.08451372334080884, metrics={'train_runtime': 157.8143, 'train_samples_per_second': 70.982, 'train_steps_per_second': 4.448, 'total_flos': 45113812807680.0, 'train_loss': 0.08451372334080884, 'epoch': 3.0})

In [48]:
# NOTE(review): the output below was kept from a run with other settings
# (batch size 6, 3 epochs) than the TrainingArguments cell of this notebook
# (batch size 8, 4 epochs). Re-running this cell as-is just calls train() again
# on the same trainer object.
trainer.train() # batch size = 6, epochs = 3



Epoch,Training Loss,Validation Loss,Accuracy
1,0.1713,0.084252,0.983133
2,0.0569,0.074454,0.984739
3,0.0326,0.057849,0.987149


TrainOutput(global_step=1869, training_loss=0.07367581795600087, metrics={'train_runtime': 169.3547, 'train_samples_per_second': 66.145, 'train_steps_per_second': 11.036, 'total_flos': 45113812807680.0, 'train_loss': 0.07367581795600087, 'epoch': 3.0})

In [18]:
# Predict the class of every held-out text, one example at a time.
# X_test is a plain Python list (the split is built from .tolist()), so it has no
# .iloc accessor; iterating over it directly works for a list and a Series alike.
model.eval()  # inference mode: disables dropout so predictions are deterministic
test_preds = []
with torch.no_grad():  # no gradients are needed for prediction
    for text in X_test:
        val_encoding = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
        outputs = model(**val_encoding)
        logits = outputs.logits.cpu().numpy()
        # A single example yields one row of logits, so argmax over the flattened
        # array is the predicted class index.
        test_preds.append(np.argmax(logits))

In [70]:
# batch size = 8, epochs = 4
# Accuracy on the training split, read from the metrics dict returned by evaluate().
print('accuracy del train:')
trainer.evaluate(train_dataset)['eval_accuracy']

0.9989287627209427

In [49]:
# batch size = 6, epochs = 3
# Accuracy on the training split, read from the metrics dict returned by evaluate().
print('accuracy del train:')
trainer.evaluate(train_dataset)['eval_accuracy']

accuracy del train:


0.9970540974825924

In [50]:
# Accuracy on the held-out test split, read from the metrics dict returned by evaluate().
print('accuracy del test:')
trainer.evaluate(test_dataset)['eval_accuracy']

accuracy del test:


0.98714859437751

# Loguearse en HuggingFace y guardar mi modelo

In [52]:
# Authenticate against the Hugging Face Hub through the interactive notebook widget.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [53]:
# Upload the fine-tuned model to the Hub repository "albert-tiny-spanish-fakenews".
model.push_to_hub("albert-tiny-spanish-fakenews")

pytorch_model.bin:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/natsanchezc/albert-tiny-spanish-fakenews/commit/959349d438034bfc71c80a703e995f6e4b1fac63', commit_message='Upload AlbertForSequenceClassification', commit_description='', oid='959349d438034bfc71c80a703e995f6e4b1fac63', pr_url=None, pr_revision=None, pr_num=None)

In [54]:
# Upload the tokenizer to the same Hub repository so it can be loaded alongside the model.
tokenizer.push_to_hub("albert-tiny-spanish-fakenews")

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/natsanchezc/albert-tiny-spanish-fakenews/commit/2193c8e18e7fcfd37ee469a387c6fbe50e11e621', commit_message='Upload tokenizer', commit_description='', oid='2193c8e18e7fcfd37ee469a387c6fbe50e11e621', pr_url=None, pr_revision=None, pr_num=None)

# Cargar el modelo desde HuggingFace 

In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
# This section is meant to run on its own (it repeats the imports), so the target
# device is derived here instead of relying on a variable left over from the
# training part of the notebook.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Download the fine-tuned classifier from the Hub and move it to the device.
model = AutoModelForSequenceClassification.from_pretrained("natsanchezc/albert-tiny-spanish-fakenews").to(device)