In [1]:
import numpy as np
import pandas as pd
from skmultilearn.model_selection import IterativeStratification
from auxiliar_functions import process_folds, build_report, load_dataset, predict_deep, load_embedding
from IPython.display import clear_output
import pickle
import torch 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import BCEWithLogitsLoss
from torch.utils.tensorboard import SummaryWriter
from AsymmetricLoss import AsymmetricLossOptimized
from torch.utils.data import Dataset
from sklearn.metrics import precision_score, recall_score, accuracy_score
import gc
import nltk


# Download the NLTK "stopwords" corpus so the Spanish word list can be read.
nltk.download('stopwords')

# Spanish stop words from NLTK with one extra entry, "UNK", appended.
# NOTE(review): the preprocessed text displayed further down in this notebook
# contains "[UNK]" tokens; whether this bare, upper-case "UNK" entry matches
# them depends on how the consumer tokenizes/lowercases its input -- confirm.
spanish_stopwords = [*nltk.corpus.stopwords.words('spanish'), "UNK"]

# Clear whatever this cell has printed so far (e.g. the download log).
clear_output()

## Indicar Fold a trabajar - Debe ser un número de 1 a 10.

In [4]:
# Fold to work on. The value is interpolated into the dataset file names
# used by this notebook (datasets/fold_{FOLD}_train.npy and
# datasets/fold_{FOLD}_test.npy).
FOLD = 1

# Fail fast with a readable message. A plain `FOLD in range(1, 11)` check also
# accepts values such as True or 1.0, which compare equal to 1 but would be
# formatted into the wrong file name ("fold_True_..." / "fold_1.0_..."), so
# require a genuine integer as well.
assert isinstance(FOLD, int) and not isinstance(FOLD, bool) and 1 <= FOLD <= 10, (
    f"FOLD must be an integer between 1 and 10, got {FOLD!r}"
)

In [5]:
# Column names applied, in order, to the arrays stored in the fold files.
columns = [
    "original_text",
    "preprocess_text",
    "encoded",
    "frames",
    "conflicto",
    "economico",
    "humanidad",
    "moral",
    "fasttext",
    "elmo",
    "beto_embedding_mean",
    "beto_embedding_cls",
    "Bi-LSTM",
    "Bi-LSTM_AsymetricLoss",
    "Beto-finetunning_cross_entropy",
    "Beto-finetunning_asymetric",
    "tf-idf",
]

# Read the training split of the selected fold and label its columns.
# allow_pickle=True makes np.load unpickle object arrays, which can execute
# arbitrary code -- only point this at trusted files.
df_train = pd.DataFrame(
    np.load(f"datasets/fold_{FOLD}_train.npy", allow_pickle=True),
    columns=columns,
)

# Show the first two rows as a quick sanity check.
df_train.head(2)

Unnamed: 0,original_text,preprocess_text,encoded,frames,conflicto,economico,humanidad,moral,fasttext,elmo,beto_embedding_mean,beto_embedding_cls,Bi-LSTM,Bi-LSTM_AsymetricLoss,Beto-finetunning_cross_entropy,Beto-finetunning_asymetric,tf-idf
0,Japón registró un nuevo déficit comercial réco...,japón registró un nuevo déficit comercial réco...,"[8759, 8914, 9989, 9898, 6584, 8773, 8428, 999...","[0, 1, 0, 0]",0,1,0,0,"[-0.10552764, -0.27450845, -0.04605328, -0.244...","[0.0596153, -0.49882528, -0.41697934, 0.367517...","[-0.23091996, -0.10888031, -0.41678736, 0.6554...","[0.36867273, -0.11450332, -0.70039237, 1.38560...","[0.06928335, 0.43591627, 0.09116903, 0.6328583...","[0.00010779064, -0.005758822, 0.15181892, 0.00...","[0.113271594, -0.13349481, -0.17376488, -0.254...","[-0.2676692, 0.06710381, -0.028014764, -0.1380...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"UDI acusa ""mala memoria"" de la Nueva Mayoría f...",udi acusa mala memoria de la nueva mayoría fre...,"[9610, 8486, 8448, 7205, 10001, 9999, 9927, 97...","[1, 0, 0, 1]",1,0,0,1,"[-0.13006762, -0.27700594, -0.06630208, -0.186...","[0.2213747, -0.7235562, -0.367401, 0.35953364,...","[-0.34605438, 0.023352358, -0.32479692, 0.4963...","[-0.40992618, -0.3073466, 0.21681017, 1.268957...","[0.06715743, 0.27809557, 0.07517977, 0.5410913...","[0.16733605, 0.26489878, 0.8423751, 0.06611008...","[0.24531865, 0.5975589, -0.4633671, -0.7076185...","[-0.16413306, 0.9792285, -0.8462009, -0.353912...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [6]:
# Read the test split of the selected fold, reusing the column names defined
# for the training split.
# allow_pickle=True makes np.load unpickle object arrays, which can execute
# arbitrary code -- only point this at trusted files.
df_test = pd.DataFrame(
    np.load(f"datasets/fold_{FOLD}_test.npy", allow_pickle=True),
    columns=columns,
)

# Show the first two rows as a quick sanity check.
df_test.head(2)

Unnamed: 0,original_text,preprocess_text,encoded,frames,conflicto,economico,humanidad,moral,fasttext,elmo,beto_embedding_mean,beto_embedding_cls,Bi-LSTM,Bi-LSTM_AsymetricLoss,Beto-finetunning_cross_entropy,Beto-finetunning_asymetric,tf-idf
0,Emotiva historia sobre baterista de jazz gana...,[UNK] historia sobre baterista de [UNK] gana t...,"[1, 9746, 9965, 3077, 10001, 1, 7881, 8889, 99...","[0, 0, 0, 0]",0,0,0,0,"[-0.17857021, -0.3521735, -0.12289284, -0.1905...","[0.33685637, -0.50221187, -0.4782673, 0.339598...","[-0.25053567, -0.022042425, -0.13752246, 0.278...","[0.17801946, 0.00994021, 0.57248706, 0.7987496...","[0.029036798, 0.23114203, 0.080079846, 0.53790...","[0.02488855, -0.006599463, 0.5742614, 0.051586...","[-0.15388533, -0.2834073, -0.4078992, 0.291033...","[0.16847402, -0.2925244, -0.53301275, -0.12231...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Daniela Seguel alcanzó el mejor ranking de su ...,daniela seguel alcanzó el mejor ranking de su ...,"[8100, 2774, 8471, 9998, 9872, 7795, 10001, 99...","[0, 0, 1, 0]",0,0,1,0,"[-0.1077514, -0.24662022, -0.042856704, -0.235...","[0.011217682, -0.34421003, -0.63198024, 0.1278...","[-0.22911884, -0.10708143, -0.24772839, 0.5958...","[0.3702164, -0.10387488, 0.15107627, 1.2201008...","[0.022602014, 0.325852, 0.07890084, 0.67056113...","[0.011398845, -0.005230801, 0.11592025, -0.005...","[-0.47976056, -0.6112696, 0.5419494, 0.0219888...","[-0.06476429, -0.8697158, -0.1499849, 0.392780...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
