# Configuración (importar dependencias, librerías, ...)

In [None]:
# Hiperparámetros BETO
BATCH_SIZE = 32
NUM_TRAIN_EPOCHS = 10
LEARNING_RATE = 0.00003
MAX_LENGTH = 256
WEIGHT_DECAY = 0.01

model_checkpoint = 'dccuchile/bert-base-spanish-wwm-uncased'

In [None]:
# Hiperparámetros RoBERTa
BATCH_SIZE = 32
NUM_TRAIN_EPOCHS = 10
LEARNING_RATE = 0.00003
MAX_LENGTH = 128
WEIGHT_DECAY = 0.01

model_checkpoint = 'bertin-project/bertin-roberta-base-spanish'

In [None]:
# Hiperparámetros MdeBERTa
BATCH_SIZE = 16
NUM_TRAIN_EPOCHS = 10
LEARNING_RATE = 0.00003
MAX_LENGTH = 128
WEIGHT_DECAY = 0.1

model_checkpoint = 'microsoft/mdeberta-v3-base'

In [None]:
# Hiperparámetros XML
BATCH_SIZE = 16
NUM_TRAIN_EPOCHS = 10
LEARNING_RATE = 0.00003
MAX_LENGTH = 256
WEIGHT_DECAY = 0.01

model_checkpoint = 'xlm-roberta-base'

In [None]:
# Set the seed value all over the place to make this reproducible.
# esto hay que ponerlo justo antes de importar para que los experimentos
# sean reproducible

!pip install pytorch-lightning
import random
import torch
import numpy as np
import os
from pytorch_lightning import seed_everything

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)# Store the average loss after eachepoch so we can plot them.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
os.environ["TF_DETERMINISTIC_OPS"] = "1" # See:https://github.com/NVIDIA/tensorflow-determinism#confirmed-current-gpu-specific-sources-of-non-determinism-with-solutions
seed_everything(42, workers=True)

!pip install transformers datasets
!pip install sentencepiece
!pip install contractions
!pip install textblob
from google.colab import drive
from datasets import Dataset, DatasetDict, load_metric
import pandas as pd
import sklearn as sk
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, f1_score
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
 TrainingArguments, Trainer, pipeline, EarlyStoppingCallback

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.0.2-py3-none-any.whl (719 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.0/719.0 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.7.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.8.0-py3-none-any.whl (20 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]>2021.06.0->pytorch-lightning)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting multidict

INFO:lightning_fabric.utilities.seed:Global seed set to 42


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m106.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m8

In [None]:
# Check that pyTorch is identifying the GPU
if torch.cuda.device_count() > 0:
  print(f'GPU detected. Currently using: "{torch.cuda.get_device_name(0)}"')
else:
  print('Currently using CPU, change the type of the runtime in the \'runtime\' tab')

GPU detected. Currently using: "Tesla T4"


# Preparación de los datos

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


## Lectura de los ficheros

In [None]:
# Cargamos los datos de entrenamiento y test
train_data_path = '/content/drive/MyDrive/Datasets/homomex_training.csv'

# Los pasamos a dataframes
train_df_full = pd.read_csv(train_data_path, encoding = 'UTF-8', sep=',')

# test_df = pd.read_csv(test_data_path, encoding = 'UTF-8', sep='\t')
print(train_df_full)

'''
nombre_etiqueta = 'label'
campo_texto = 'tweet'

print("distribución original - train completo: ",train_df_full.value_counts(nombre_etiqueta))

df_0 = train_df_full[train_df_full[nombre_etiqueta]==0][:] #[2000:10000]
df_1 = train_df_full[train_df_full[nombre_etiqueta]==1][:]

# Si se quiere hacer undersampling de la clase mayoritaria
# guardamos en df_0 el número de filas de clase 0 que queremos mantener y en df_1 todas las filas de clase 1
#train_df_full = pd.concat([df_0,df_1])
#print("distribuci'n despues del undersampling: ",train_df_full.value_counts(nombre_etiqueta))

# Esto lo hacemos si tenemos un único fichero y hacemos train/valid/test
#train_df, auxiliar_df = train_test_split(train_df_full, test_size = 0.2, shuffle = True, stratify=train_df_full[[nombre_etiqueta]])
#valid_df, test_df = train_test_split(auxiliar_df, test_size = 0.3, shuffle = True, stratify=auxiliar_df[[nombre_etiqueta]])

# Esto lo hacemos si tenemos ficheros de train y test independientes y solo necesitamos train/valid
train_df, valid_df = train_test_split(train_df_full, test_size = 0.2, shuffle = True, stratify=train_df_full[[nombre_etiqueta]])

print("Ejemplos del conjunto completo de entrenamiento ", len(train_df_full))
print("Ejemplos usados para entrenar: ", len(train_df))
print("Ejemplos usados para validar: ", len(valid_df))
print("Ejemplos usados para test: ", len(test_df))

#train_df_full
'''

      index                                             tweets label
0         0  Me quise ligar a una chava ayer y no me pelo, ...     P
1         1           @papaya_rockera eres un puñal, Papayita.     P
2         2  Magnate ofrece 130 mdd al hombre que conquiste...     P
3         3  Los trolebuses del desgobierno de @EPN son idi...     P
4         4  En época de Hitler no se decía "eres gay" y, s...     P
...     ...                                                ...   ...
6995     56  (Igual y cachamos a un transformer con este tu...     P
6996     57  Acabé una temporada de RuPaul's Drag Race en 2...     P
6997     58  @ArielURosas @nuxsilva Ayuññ. Pos es que me di...     P
6998     59  @LaTortilleriaQ si vamos a hacer un vagón "seg...     P
6999     60  @Dana_Corres @ytzmaya @sof_j @laura_lecuona @l...     P

[7000 rows x 3 columns]


'\nnombre_etiqueta = \'label\'\ncampo_texto = \'tweet\'\n\nprint("distribución original - train completo: ",train_df_full.value_counts(nombre_etiqueta))\n\ndf_0 = train_df_full[train_df_full[nombre_etiqueta]==0][:] #[2000:10000]\ndf_1 = train_df_full[train_df_full[nombre_etiqueta]==1][:]\n\n# Si se quiere hacer undersampling de la clase mayoritaria\n# guardamos en df_0 el número de filas de clase 0 que queremos mantener y en df_1 todas las filas de clase 1\n#train_df_full = pd.concat([df_0,df_1])\n#print("distribuci\'n despues del undersampling: ",train_df_full.value_counts(nombre_etiqueta))\n\n# Esto lo hacemos si tenemos un único fichero y hacemos train/valid/test\n#train_df, auxiliar_df = train_test_split(train_df_full, test_size = 0.2, shuffle = True, stratify=train_df_full[[nombre_etiqueta]])\n#valid_df, test_df = train_test_split(auxiliar_df, test_size = 0.3, shuffle = True, stratify=auxiliar_df[[nombre_etiqueta]])\n\n# Esto lo hacemos si tenemos ficheros de train y test independ

**Ficheros procesados**

In [None]:
nombre_etiqueta = 'label'
campo_texto = 'tuit'

# Cargar ficheros de datos de drive
datasets_path = '/content/drive/MyDrive/Datasets/'
train_df = pd.read_csv(datasets_path + 'train_df_mE.csv', encoding = 'UTF-8', sep=',')
test_df = pd.read_csv(datasets_path + 'test_df_mE.csv', encoding = 'UTF-8', sep=',')
valid_df = pd.read_csv(datasets_path + 'valid_df_mE.csv', encoding = 'UTF-8', sep=',')

#test_df.rename(columns = {'content' : 'tuit'},inplace = True)
#test_df.rename(columns={'labels':'labels_G'},inplace = True)
print()
# Para saber cuántas filas hay de cada clase en cada conjunto
print("DISTRIBUCIONES DE CADA UNA DE LAS ETIQUETAS EN TRAIN:")
print("Distribución Original - G ", train_df.value_counts('G'))
print("Distribución Original - L: ", train_df.value_counts('L'))
print("Distribución Original - B: ", train_df.value_counts('B'))
print("Distribución Original - T: ", train_df.value_counts('T'))
print("Distribución Original - O: ", train_df.value_counts('O'))

print("DISTRIBUCIONES DE CADA UNA DE LAS ETIQUETAS EN VALID:")
print("Distribución Original - G ", valid_df.value_counts('G'))
print("Distribución Original - L: ", valid_df.value_counts('L'))
print("Distribución Original - B: ", valid_df.value_counts('B'))
print("Distribución Original - T: ", valid_df.value_counts('T'))
print("Distribución Original - O: ", valid_df.value_counts('O'))


print("DISTRIBUCIONES DE CADA UNA DE LAS ETIQUETAS EN TEST:")
print("Distribución Original - G ", test_df.value_counts('G'))
print("Distribución Original - L: ", test_df.value_counts('L'))
print("Distribución Original - B: ", test_df.value_counts('B'))
print("Distribución Original - T: ", test_df.value_counts('T'))
print("Distribución Original - O: ", test_df.value_counts('O'))



DISTRIBUCIONES DE CADA UNA DE LAS ETIQUETAS EN TRAIN:
Distribución Original - G  G
1    575
0    114
dtype: int64
Distribución Original - L:  L
0    632
1     57
dtype: int64
Distribución Original - B:  B
0    681
1      8
dtype: int64
Distribución Original - T:  T
0    632
1     57
dtype: int64
Distribución Original - O:  O
0    641
1     48
dtype: int64
DISTRIBUCIONES DE CADA UNA DE LAS ETIQUETAS EN VALID:
Distribución Original - G  G
1    99
0    22
dtype: int64
Distribución Original - L:  L
0    112
1      9
dtype: int64
Distribución Original - B:  B
0    120
1      1
dtype: int64
Distribución Original - T:  T
0    105
1     16
dtype: int64
Distribución Original - O:  O
0    109
1     12
dtype: int64
DISTRIBUCIONES DE CADA UNA DE LAS ETIQUETAS EN TEST:
Distribución Original - G  G
1    40
0    12
dtype: int64
Distribución Original - L:  L
0    46
1     6
dtype: int64
Distribución Original - B:  B
0    51
1     1
dtype: int64
Distribución Original - T:  T
0    46
1     6
dtype: int

**Test no etiquetado**

In [None]:
datasets_path = '/content/drive/MyDrive/Datasets/Competicion/'

test_df = pd.read_csv(datasets_path + 'track2_test_no_labels.csv', encoding = 'UTF-8', sep=',')
test_df.rename(columns={'Unnamed: 0':'index'}, inplace=True)
test_df.rename(columns={'content':campo_texto}, inplace=True)
test_df

Unnamed: 0,index,tuit
0,2,"Ash !! La jotita del programa ""hoy"" no deja de..."
1,3,En una boda gay ¿Les avientan arroz con popote?
2,6,@ivalmal Hazlo mariquita jajajajajaja. Hazlo n...
3,10,“@chavezan: Y el que no habla de la Miss unive...
4,12,"Vamos Dinamarca, ya empieza la marica de CR7 a..."
...,...,...
472,3866,Deje la presidencia a una mujer. Que haya 7 mi...
473,3900,@ElizaSonrisas @ballartaexiste KHA?\n\nAhora q...
474,3939,Lia Thomson (mujer transgénero) logra su prime...
475,3941,"“La gente no binarie puede no usar reloj, usar..."


## Limpieza de datos

In [None]:
# Funciones de limpieza
import re
import string
def remove_links(tweet):
    """Takes a string and removes web links from it"""
    tweet = re.sub(r'http\S+', '', tweet)   # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet)  # remove bitly links
    tweet = re.sub(r'\[link\]', '', tweet )   # remove [link]
    tweet = re.sub(r'\[url\]', '', tweet )   # remove [url]
    tweet = re.sub(r'pic.twitter\S+','', tweet)
    tweet = re.sub('url','', tweet)
    return tweet

def remove_users(tweet):
    """Takes a string and removes retweet and @user information"""
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove re-tweet
    tweet = re.sub('(@[A-Za-z0-9-_]+[A-Za-z0-9-_]+)', '', tweet)  # remove tweeted at
    tweet = re.sub(r'\[user\]', '', tweet )   # remove [user]
    return tweet

def remove_hashtags(tweet):
    """Takes a string and removes any hash tags"""
    tweet = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove hash tags
    return tweet

def remove_hashtags_symb(tweet):
    tweet = re.sub('#', '', tweet)  # remove hash tags
    return tweet

def remove_av(tweet):
    """Takes a string and removes AUDIO/VIDEO tags or labels"""
    tweet = re.sub('VIDEO:', '', tweet)  # remove 'VIDEO:' from start of tweet
    tweet = re.sub('AUDIO:', '', tweet)  # remove 'AUDIO:' from start of tweet
    return tweet

def remove_emojis(tweet):
    emoj = re.compile("["
        u"\U00002700-\U000027BF"  # Dingbats
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U00002600-\U000026FF"  # Miscellaneous Symbols
        u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols And Pictographs
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00010000-\U0010FFFF"
        u"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\ufe0f"  # dingbats

                      "]+", re.UNICODE)
    return re.sub(emoj, '', tweet)

def remove_jumps(tweet):
    tweet = re.sub('\n', '', tweet)  # remove line jumps
    return tweet

#Quita signos de puntuacion
def remove_puntuacion(tweet):
    tabla = str.maketrans('','',string.punctuation)
    return tweet.translate(tabla)

def sinonimos_insultos(tweet):
    tweet = tweet.replace ('joto' , 'maricon')
    tweet = tweet.replace ('jotito' , 'maricon')
    tweet = tweet.replace ('jotos' , 'maricones')
    tweet = tweet.replace ('jotitos' , 'maricones')

    tweet = tweet.replace ('puñal' , 'maricon')
    tweet = tweet.replace ('puñales' , 'maricon')

    tweet = tweet.replace ('mariquin' , 'maricon')
    tweet = tweet.replace ('marolo' , 'maricon')
    tweet = tweet.replace ('mayate' , 'maricon')
    tweet = tweet.replace ('nandi' , 'maricon')
    tweet = tweet.replace ('puteque' , 'maricon')
    tweet = tweet.replace ('petete' , 'maricon')
    tweet = tweet.replace ('mayate' , 'maricon')
    tweet = tweet.replace ('mayate' , 'maricon')

    tweet = tweet.replace ('levis' , 'lesbiana')
    tweet = tweet.replace ('zapatona' , 'lesbiana')

    return tweet

# Función de eliminación de contracción
import contractions
def expand_contraction(tweet):
    tweet = contractions.fix(tweet)
    return tweet

# Función para corregir los typos
from textblob import TextBlob
def correct_spelling(tweet):
    tweetBlob = TextBlob(tweet)
    tweet = tweetBlob.correct()
    return tweet

**Palabras mas comunes en el dataset**

In [None]:
import nltk
from nltk.corpus import stopwords
from collections import Counter

# Cargar los datos
#dataset_path = '/content/drive/MyDrive/Datasets/multi_train_labels.csv'
df = train_df
#pd.read_csv(dataset_path , encoding = 'UTF-8', sep=',')
campo_texto = 'tuit'
"""
df[campo_texto] = df[campo_texto].str.lower()
df[campo_texto] = df[campo_texto].apply(remove_links)
df[campo_texto] = df[campo_texto].apply(remove_users)
df[campo_texto] = df[campo_texto].apply(remove_hashtags_symb)
df[campo_texto] = df[campo_texto].apply(remove_emojis)
df[campo_texto] = df[campo_texto].apply(remove_puntuacion)
"""
# Tokenizar el texto
nltk.download("punkt")
df["tokens"] = df[campo_texto].apply(nltk.word_tokenize)

# Eliminar las palabras irrelevantes
nltk.download("stopwords")
stop_words = set(stopwords.words("spanish"))
df["tokens"] = df["tokens"].apply(lambda x: [word for word in x if not word.lower() in stop_words])

# Contar la frecuencia de las palabras
word_freq = Counter()
for tokens in df["tokens"]:
    word_freq.update(tokens)

# Seleccionar las palabras más comunes
top_n_words = word_freq.most_common(20)

print(top_n_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[(',', 401), ('.', 299), ('!', 235), ('maricon', 174), ('?', 137), ('marica', 74), ('si', 68), ('``', 53), (':', 53), ("''", 51), ('maricons', 51), ('ser', 48), ('pinche', 40), ('gay', 39), ('puto', 39), ('bien', 31), ('jajaja', 30), ('maricas', 30), ('mariquita', 28), ('trans', 26)]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
print(df[campo_texto])
df[campo_texto] = df[campo_texto].apply(sinonimos_insultos)
print(df[campo_texto])

NameError: ignored

In [None]:
train_df[campo_texto] = train_df[campo_texto].str.lower()
valid_df[campo_texto] = valid_df[campo_texto].str.lower()
test_df[campo_texto] = test_df[campo_texto].str.lower()

train_df[campo_texto] = train_df[campo_texto].apply(remove_links)
valid_df[campo_texto] = valid_df[campo_texto].apply(remove_links)
test_df[campo_texto] = test_df[campo_texto].apply(remove_links)

train_df[campo_texto] = train_df[campo_texto].apply(remove_users)
valid_df[campo_texto] = valid_df[campo_texto].apply(remove_users)
test_df[campo_texto] = test_df[campo_texto].apply(remove_users)

train_df[campo_texto] = train_df[campo_texto].apply(remove_hashtags_symb)
valid_df[campo_texto] = valid_df[campo_texto].apply(remove_hashtags_symb)
test_df[campo_texto] = test_df[campo_texto].apply(remove_hashtags_symb)

train_df[campo_texto] = train_df[campo_texto].apply(remove_emojis)
valid_df[campo_texto] = valid_df[campo_texto].apply(remove_emojis)
test_df[campo_texto] = test_df[campo_texto].apply(remove_emojis)

train_df[campo_texto] = train_df[campo_texto].apply(remove_jumps)
valid_df[campo_texto] = valid_df[campo_texto].apply(remove_jumps)
test_df[campo_texto] = test_df[campo_texto].apply(remove_jumps)

train_df[campo_texto] = train_df[campo_texto].apply(sinonimos_insultos)
valid_df[campo_texto] = valid_df[campo_texto].apply(sinonimos_insultos)
test_df[campo_texto] = test_df[campo_texto].apply(sinonimos_insultos)

train_df
test_df

Unnamed: 0.1,Unnamed: 0,Index,tuit,G,L,B,T,O
0,688,689,estaban unos vatos gays enfrente de mi y de be...,1,0,0,0,0
1,848,849,no te preocupes yo no atiendo gays ni lesb...,1,1,0,0,0
2,81,82,tengo gustos bien maricas.,1,0,0,0,0
3,521,522,no los bebés son una bendición y tienen derec...,0,1,0,0,0
4,810,811,"ahhh ademas de chairo, mdejo y maricon ??? si...",1,0,0,0,0
5,394,395,o ya mejor me hago puto,1,0,0,0,0
6,753,754,toño es maricon,1,0,0,0,0
7,136,137,"no se conforman con ser jotas, tienen que ser...",1,0,0,0,0
8,499,500,osea eso que son dos maricones mejor hm,1,0,0,0,0
9,507,508,para mi que son los maricas amarillos que se ...,1,0,0,0,0


Guardar ficheros procesados


In [None]:
datasets_path = '/content/drive/MyDrive/Datasets/Procesados/'
train_df.to_csv(datasets_path + 'train_df_mE_procesado.csv', encoding = 'UTF-8', sep=',', header = True)
test_df.to_csv(datasets_path + 'test_df_mE_procesado.csv', encoding = 'UTF-8', sep=',', header = True)
valid_df.to_csv(datasets_path + 'valid_df_mE_procesado.csv', encoding = 'UTF-8', sep=',', header = True)

In [None]:
# Guardar test NO ETIQUETADO LIMPIO en drive
datasets_path = '/content/drive/MyDrive/Datasets/Procesados/'
train_df.to_csv(datasets_path + 'train_df_limpio.csv', encoding = 'UTF-8', sep=',', header = True)
test_df.to_csv(datasets_path + 'test_df_NE_limpio.csv', encoding = 'UTF-8', sep=',', header = True)

# Preparación de los conjuntos para el entrenamiento

In [None]:
# Convertimos los dataframes en objetos datasets
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

# Mostramos los datos en formato pandas
train_dataset.set_format("pandas")
valid_dataset.set_format("pandas")
test_dataset.set_format("pandas")

print(train_df)

train_dataset.reset_format()
valid_dataset.reset_format()
test_dataset.reset_format()



     Unnamed: 0  Index                                               tuit  G  \
0           192    193  Nodal es la lesbiana intensa de la relación.\n...  0   
1           155    156  @fernandeznorona Ahí si, me dijo mariquita, me...  1   
2           319    320                    @GeraLetipichia7 Huele a marica  1   
3           101    102  @AlanPonce_29 @Nora_Ruvalcaba @RoyCampos @PGR_...  1   
4           368    369  @MaestroDoria Ese nombre suena a transexual ja...  0   
..          ...    ...                                                ... ..   
684          71     72  Vecinos! Esta noche proyectaremos con @FESTIVA...  0   
685         106    107            @url  Y esperabas tu arcoíris 🌈 marica?  1   
686         270    271   @EdgardBeltran @AlexBurruel1 no traen nada jotos  1   
687         435    436              Esta Brucita también es mariquita! 💃🏼  1   
688         102    103        @poqkeeet Porque Eres Un Vato 🙄 Puta Lencha  0   

     L  B  T  O  
0    1  0  0  0  
1  

In [None]:
# Asignamos una etiqueta numérica en función de la etiqueta principal
#   Label = 0 --> negativo
#   Label = 1 --> positivo
#   records['X'] --> representa el nombre de la columna a clasificar
def set_labels(records):
  if records['L'] == 0:
    label = 0
  else:
    label = 1
  return {'labels': label}

In [None]:
# Correct the labels of the test split and create a Dataset dict for all the splits (except test_dataset)
# Esto es para cuando tengamos fichero de train y de valid
# dataset = DatasetDict({'train': train_dataset, 'valid': valid_dataset})
dataset_train = train_dataset
dataset_valid = valid_dataset
dataset_test = test_dataset
#dataset = dataset.remove_columns('__index_level_0__')

columnas_bin = ['Unnamed: 0','Index','tuit','L'] #El ultimo parametro es la columna de clasficacion que queremos dejar
dataset_train_bin = dataset_train
dataset_valid_bin = dataset_valid
dataset_test_bin = dataset_test

for col in dataset_train_bin.column_names:
    if col in columnas_bin:
        continue
    else:
        dataset_train_bin = dataset_train_bin.remove_columns(col)

for col in dataset_valid_bin.column_names:
    if col in columnas_bin:
        continue
    else:
        dataset_valid_bin = dataset_valid_bin.remove_columns(col)

for col in dataset_test_bin.column_names:
    if col in columnas_bin:
        continue
    else:
        dataset_test_bin = dataset_test_bin.remove_columns(col)

print(dataset_train_bin,dataset_valid_bin,dataset_test_bin)

######

# Map the functions to the dataset
dataset_train = dataset_train_bin.map(set_labels)
dataset_valid = dataset_valid_bin.map(set_labels)
#dataset_test = dataset_test_bin.map(set_labels)

print(dataset_train,dataset_valid,dataset_test)

Dataset({
    features: ['Unnamed: 0', 'Index', 'tuit', 'L'],
    num_rows: 689
}) Dataset({
    features: ['Unnamed: 0', 'Index', 'tuit', 'L'],
    num_rows: 121
}) Dataset({
    features: ['Unnamed: 0', 'Index', 'tuit', 'L'],
    num_rows: 52
})


Map:   0%|          | 0/689 [00:00<?, ? examples/s]

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'Index', 'tuit', 'L', 'labels'],
    num_rows: 689
}) Dataset({
    features: ['Unnamed: 0', 'Index', 'tuit', 'L', 'labels'],
    num_rows: 121
}) Dataset({
    features: ['Unnamed: 0', 'Index', 'tuit', 'G', 'L', 'B', 'T', 'O'],
    num_rows: 52
})


In [None]:
# Reseteamos el formato para que no haya fallos
dataset_train.reset_format()
dataset_valid.reset_format()
dataset_test.reset_format()

# Proceso de clasificación

## Tokenización

In [None]:
#model_checkpoint = 'dccuchile/bert-base-spanish-wwm-uncased'
#model_checkpoint = 'davidmasip/racism'
#model_checkpoint = 'PlanTL-GOB-ES/roberta-base-bne'
#model_checkpoint = 'PlanTL-GOB-ES/roberta-base-biomedical-es'
#model_checkpoint = 'roberta-base'
#model_checkpoint = 'Jacinto/autotrain-i2c-edos-1988966268'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_auth_token='')
#tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [None]:
# Size of our one-hot-encoded vectors
tokenizer.vocab_size

31002

In [None]:
# Esto es para ver cómo queda el texto una vez tokenizado
# La función tokenizer() hace el tokenizado y devuelve los 'inputs_id' y los 'attention_mask'

print(dataset_train[1][campo_texto])
tokenizado = tokenizer.tokenize(dataset_train[1][campo_texto])
#tokenizado = tokenizer(dataset_train[1]["text"], truncation=True, max_length=MAX_LENGTH, padding=True)
print(tokenizado)

@fernandeznorona Ahí si, me dijo mariquita, me dijo La vestida! Jajajajja jajajaja que nena eres! #quenenaeresnoroña
['@', 'fern', '##and', '##ez', '##nor', '##ona', 'ahí', 'si', ',', 'me', 'dijo', 'mari', '##quita', ',', 'me', 'dijo', 'la', 'vestida', '!', 'ja', '##ja', '##ja', '##j', '##ja', 'ja', '##ja', '##ja', '##ja', 'que', 'nena', 'eres', '!', '[UNK]', 'que', '##nen', '##a', '##eres', '##nor', '##o', '##ña']


In [None]:
# Define the method to be mapped to the dataset to tokenize the data
def tokenize_data(examples):
  #return tokenizer(examples["text"], truncation=True, padding='longest')
  return tokenizer(examples[campo_texto], truncation=True, max_length=MAX_LENGTH, padding=True)

In [None]:
columns_train = dataset_train.column_names  # Coge todas las columnas
columns_valid = dataset_valid.column_names  # Coge todas las columnas
columns_test = dataset_test.column_names
columns_train.remove("labels") # Elimina la columna "labels"
columns_valid.remove("labels") # Elimina la columna "labels"
#columns_test.remove("labels")

# Hace el tokenizado y elimina todas las columnas que no se necesitan
encoded_dataset_train = dataset_train.map(tokenize_data, batched=True, remove_columns=columns_train)
encoded_dataset_valid = dataset_valid.map(tokenize_data, batched=True, remove_columns=columns_valid)
encoded_dataset_test = dataset_test.map(tokenize_data, batched=True, remove_columns=columns_test)
print(encoded_dataset_train,encoded_dataset_valid,encoded_dataset_test)

Map:   0%|          | 0/689 [00:00<?, ? examples/s]

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 689
}) Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 121
}) Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 52
})


In [None]:
len(encoded_dataset_train[3]['input_ids'])

256

In [None]:
encoded_dataset_train[4]['input_ids']

[4,
 985,
 4808,
 2436,
 1137,
 1510,
 1848,
 5103,
 1012,
 16887,
 30991,
 2446,
 2279,
 1431,
 1431,
 5,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 

In [None]:
encoded_dataset_train[4]

{'labels': 0,
 'input_ids': [4,
  985,
  4808,
  2436,
  1137,
  1510,
  1848,
  5103,
  1012,
  16887,
  30991,
  2446,
  2279,
  1431,
  1431,
  5,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


## Carga del modelo

In [None]:
# Cargamos el modelo
n_labels = 2
# cargamos el modelo preentrenado
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint,
                                                              num_labels=n_labels,
                                                              use_auth_token='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')

## Definición de la métricas

In [None]:
# Función para realizar distintas métricas en ejecución

def compute_metrics(eval_pred):
  """
  Compute metrics for Trainer
  """
  labels = eval_pred.label_ids
  print(labels)
  preds = eval_pred.predictions.argmax(-1)

  #preds = np.argmax(preds, axis=-1)
  precision, recall, f1, _ = sk.metrics.precision_recall_fscore_support(labels, preds, average="macro")
  # f1_minoritaria = f1_score(labels, preds, average="micro")
  # f1_mayoritaria = f1_score(labels, preds, average="micro")
  # acc = sk.metrics.accuracy_score(labels, preds, average="micro")
  # AUC = roc_auc_score(labels, preds, average="micro")
  # PREC_REC = average_precision_score(labels, preds, average="micro")
  # return { 'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'AUC': AUC,
           # 'f1_minoritaria': f1_minoritaria, 'f1_mayoritaria': f1_mayoritaria, 'PREC_REC': PREC_REC }
  return { 'f1': f1, 'precision': precision, 'recall': recall }

## Fine-tuning

In [None]:
model_name = model_checkpoint.split("/")[-1]
model_name

'bert-base-spanish-wwm-uncased'

In [None]:
# Seleccionamos el conjunto de entrenamiento y de evaluación balanceados
#encoded_dataset_train.set_format("pandas")
#encoded_dataset_valid.set_format("pandas")
#train, valid = train_test_split(encoded_dataset, test_size = 0.2, shuffle = True, stratify=encoded_dataset['labels'],random_state=42)
#train_dataset=Dataset.from_pandas(train)
#eval_dataset=Dataset.from_pandas(valid)

In [None]:
# Se definen los parámetros del Trainer()
num_train_samples = int(len(encoded_dataset_train))
num_evaluation= int(len(encoded_dataset_valid))
#Seleccionamos el conjunto de entrenamiento y de evaluación
  #num_train_samples = int(len(df_train)*0.8)
  #num_evaluation= int(len(df_train) * 0.2)
  #encodeData_shuffle=encoded_dataset.shuffle(seed=42)
  #train_dataset =encodeData_shuffle.select(range(num_train_samples))
  #eval_dataset = encodeData_shuffle.select(range(num_train_samples,num_train_samples+num_evaluation))
logging_steps = len(encoded_dataset_train) // (2 * BATCH_SIZE * NUM_TRAIN_EPOCHS)
print("********************** loggin_steps", logging_steps)
optim=["adamw_hf", "adamw_torch", "adamw_apex_fused","adafactor","adamw_torch_xla"]

training_args = TrainingArguments(
    output_dir = 'results',
    num_train_epochs = NUM_TRAIN_EPOCHS,
    learning_rate = LEARNING_RATE,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    load_best_model_at_end = True,
    metric_for_best_model = 'f1',
    #metric_for_best_model = 'eval_loss',
    weight_decay = WEIGHT_DECAY,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    logging_steps = logging_steps,
    save_total_limit = 3,
    optim = optim[1],
    push_to_hub=False
    #push_to_hub=True,
    #push_to_hub_model_id=f"{model_name}-finetuned-amazon_reviews_multi"
)

********************** loggin_steps 1


ImportError: ignored

In [None]:
# Creamos el objeto Trainer()
trainer = Trainer(
    model_init=model_init,
    #model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
    train_dataset = encoded_dataset_train,
    #eval_dataset=dataset['valid'],
    eval_dataset = encoded_dataset_valid,
    tokenizer = tokenizer
)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

In [None]:
trainer.train()

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.02,0.052272,0.497925,0.495868,0.5
2,0.0057,0.055581,0.497925,0.495868,0.5
3,0.0015,0.058779,0.497925,0.495868,0.5
4,0.0008,0.062938,0.497925,0.495868,0.5


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


  _warn_prf(average, modifier, msg_start, len(result))


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


  _warn_prf(average, modifier, msg_start, len(result))


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


  _warn_prf(average, modifier, msg_start, len(result))


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=88, training_loss=0.048234781076396095, metrics={'train_runtime': 63.4658, 'train_samples_per_second': 108.562, 'train_steps_per_second': 3.466, 'total_flos': 127464972991200.0, 'train_loss': 0.048234781076396095, 'epoch': 4.0})

In [None]:
eval = trainer.evaluate()
# Pasamos el resultado a dataframe
dfeval = pd.DataFrame(list(eval.items()), columns = ['Name','Value_Validation'])
dfeval

[1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0]


Unnamed: 0,Name,Value_Validation
0,eval_loss,0.468999
1,eval_f1,0.665376
2,eval_precision,0.836538
3,eval_recall,0.620413
4,eval_runtime,0.5739
5,eval_samples_per_second,210.856
6,eval_steps_per_second,6.97
7,epoch,8.0


# Guardar el modelo

In [None]:
trainer.save_model('/content/drive/MyDrive/Modelos/MultiEtsClases/modeloO_Beto')

In [None]:
import torch
torch.cuda.empty_cache()

## Hacer las predicciones

In [None]:
# Do not execute if you want to use the model just trained
model_path ='/content/drive/MyDrive/Modelos/MultiEtsClases/modeloO_Beto'

model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
# Predicción con pipeline
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
dataset_test


Dataset({
    features: ['index', 'tuit'],
    num_rows: 477
})

In [None]:
#Hacemos las prediciones
'''def get_predictions(records):
  p = pipe(str(records["text"]),truncation=True)

  return {"pred_label":int(p[0]["label"][-1])}  #Nos quedamos con el número de la etiqueta solo
'''
def get_predictions(records):
  result = pipe(records[campo_texto], truncation=True)
  pred_label = result[0]['label']
  score_label = result[0]['score']
  if pred_label == 'LABEL_0':
    pred_label = 0
  else:
    pred_label = 1

  return {'pred_label': pred_label, 'score_label': score_label}

In [None]:
test_dataset_predicted = dataset_test.map(get_predictions)
valid_dataset_predicted = dataset_valid.map(get_predictions)
test_dataset_predicted[0]
valid_dataset_predicted[0]

Map:   0%|          | 0/477 [00:00<?, ? examples/s]



Map:   0%|          | 0/121 [00:00<?, ? examples/s]

{'Unnamed: 0': 453,
 'Index': 454,
 'tuit': '  ayuññ. pos es que me dicen: oph, send nerdez trans.y yo de.. “ahora aplico mi carta arirosas y su poder” * música yugioh *',
 'O': 1,
 'labels': 1,
 'pred_label': 0,
 'score_label': 0.9983468055725098}

In [None]:
test_dataset_predicted.set_format('pandas')
df_test = test_dataset_predicted[:]

valid_dataset_predicted.set_format('pandas')
df_valid = valid_dataset_predicted[:]
df_test

Unnamed: 0,index,tuit,pred_label,score_label
0,2,"Ash !! La jotita del programa ""hoy"" no deja de...",0,0.999058
1,3,En una boda gay ¿Les avientan arroz con popote?,0,0.999275
2,6,@ivalmal Hazlo mariquita jajajajajaja. Hazlo n...,0,0.999234
3,10,“@chavezan: Y el que no habla de la Miss unive...,0,0.999337
4,12,"Vamos Dinamarca, ya empieza la marica de CR7 a...",0,0.999188
...,...,...,...,...
472,3866,Deje la presidencia a una mujer. Que haya 7 mi...,0,0.988978
473,3900,@ElizaSonrisas @ballartaexiste KHA?\n\nAhora q...,0,0.998996
474,3939,Lia Thomson (mujer transgénero) logra su prime...,0,0.985787
475,3941,"“La gente no binarie puede no usar reloj, usar...",0,0.941342


In [None]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
# Añadimos la función de evaluación

def compute_metrics(pred):

  labels = pred[0]
  preds = pred[1]
  precision, recall, f1, _ = sk.metrics.precision_recall_fscore_support(labels, preds, average="macro")
  #acc = sk.metrics.accuracy_score(labels, preds)
  #AUC = roc_auc_score(labels, preds)
  #PREC_REC = average_precision_score(labels, preds)
  #return { 'accuracy': acc, 'f1': f1, 'precision': precision,
         # 'recall': recall, 'AUC': AUC, 'PREC_REC': PREC_REC }
  return { 'f1': f1, 'precision': precision, 'recall': recall }

In [None]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
# Convert the pandas series to python list to apply the compute_metric function
test_labels = df_test['labels'].values.tolist()
test_predictions = df_test['pred_label'].values.tolist()
eval_pred_test = [test_labels, test_predictions]

valid_labels = df_valid['labels'].values.tolist()
valid_predictions = df_valid['pred_label'].values.tolist()
eval_pred_valid = [valid_labels, valid_predictions]

In [None]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
# Informe de resultados
print(f'Modelo entrenado: {model_name}')
print(f'Epochs: {NUM_TRAIN_EPOCHS}')
print(f'Tamaño de batch: {BATCH_SIZE}')
p_test = compute_metrics(eval_pred_test)
dftest = pd.DataFrame([[key, p_test[key]] for key in p_test.keys()], columns=['Name', 'Value'])
print('Test:')
print(dftest)

p_valid = compute_metrics(eval_pred_valid)
dfvalid = pd.DataFrame([[key, p_valid[key]] for key in p_valid.keys()], columns=['Name', 'Value'])
print('Valid:')
print(dfvalid)

Modelo entrenado: bert-base-multilingual-uncased
Epochs: 10
Tamaño de batch: 32
Test:
        Name     Value
0         f1  0.617084
1  precision  0.636054
2     recall  0.604167
Valid:
        Name     Value
0         f1  0.672973
1  precision  0.714602
2     recall  0.648318


In [None]:
guardarTest_data_path = '/content/drive/MyDrive/Predicciones/MultiEt/prediccionesO_Beto.csv'
df_test.to_csv(guardarTest_data_path)

In [None]:
#### Esto es cuando ya tengo las etiquetas de las predicciones y del test (gold standard)
predictions_data_path = '/content/drive/MyDrive/Predicciones/MultiEt/prediccionesT.csv'
predictions_df = pd.read_csv(predictions_data_path, encoding = 'UTF-8', sep=',')
test_df = test_df.replace({"not L": 0, "L": 1})
predictions_df = predictions_df.replace({"notL": 0, "L": 1})
test_labels = test_df['L'].values.tolist()
test_predictions = predictions_df['pred_label'].values.tolist()

In [None]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
print(f'AUC del test: {roc_auc_score(test_labels, test_predictions)}')
print(f'AUC del valid: {roc_auc_score(valid_labels, valid_predictions)}')
print('*********************************')
print(f'PREC_REC del test: {average_precision_score(test_labels, test_predictions)}')
print(f'PREC_REC del valid: {average_precision_score(valid_labels, valid_predictions)}')

AUC del test: 0.3458333333333333
AUC del valid: 0.9279761904761904
*********************************
PREC_REC del test: 0.755
PREC_REC del valid: 0.7821539256198347


In [None]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
### Resultados para el test
print("epoch ", NUM_TRAIN_EPOCHS)
print("batch size:", BATCH_SIZE)
print("max_len :", MAX_LENGTH)

print(classification_report(test_labels, test_predictions))

print('Matriz de confusión')
print(confusion_matrix(test_labels, test_predictions))

epoch  10
batch size: 32
max_len : 128
              precision    recall  f1-score   support

           0       0.17      0.67      0.27        12
           1       0.20      0.03      0.04        40

    accuracy                           0.17        52
   macro avg       0.19      0.35      0.16        52
weighted avg       0.19      0.17      0.10        52

Matriz de confusión
[[ 8  4]
 [39  1]]


In [None]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
### Resultados para el valid
print("epoch ", NUM_TRAIN_EPOCHS)
print("batch size:", BATCH_SIZE)
print("max_len :", MAX_LENGTH)

print(classification_report(valid_labels, valid_predictions))

print('Matriz de confusión')
print(confusion_matrix(valid_labels, valid_predictions))

epoch  10
batch size: 32
max_len : 128
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       105
           1       0.88      0.88      0.88        16

    accuracy                           0.97       121
   macro avg       0.93      0.93      0.93       121
weighted avg       0.97      0.97      0.97       121

Matriz de confusión
[[103   2]
 [  2  14]]


UNIR LAS PREDICCIONES DE CLASIFICACION BINARIA

In [None]:
df1tr = pd.read_csv('/content/drive/MyDrive/Predicciones/MultiEt/prediccionesL_Roberta.csv')
df1tr.rename(columns={'pred_label':'L'},inplace = True)

df2tr = pd.read_csv('/content/drive/MyDrive/Predicciones/MultiEt/prediccionesG_Roberta.csv')
df2tr.rename(columns={'pred_label':'G'},inplace = True)

df3tr = pd.read_csv('/content/drive/MyDrive/Predicciones/MultiEt/prediccionesB_Roberta.csv')
df3tr.rename(columns={'pred_label':'B'},inplace = True)

df4tr = pd.read_csv('/content/drive/MyDrive/Predicciones/MultiEt/prediccionesT_Roberta.csv')
df4tr.rename(columns={'pred_label':'T'},inplace = True)

df5tr = pd.read_csv('/content/drive/MyDrive/Predicciones/MultiEt/prediccionesO_Roberta.csv')
df5tr.rename(columns={'pred_label':'O'},inplace = True)

df_a = pd.concat([df1tr['index'],df1tr['tuit']],axis = 1)
df_g = pd.concat([df1tr['L']],axis = 1)
df_l = pd.concat([df2tr['G']],axis = 1)
df_b = pd.concat([df3tr['B']],axis = 1)
df_t = pd.concat([df4tr['T']],axis = 1)
df_o = pd.concat([df5tr['O']],axis = 1)

df_final_tr = pd.concat([df_a,df_l,df_g,df_b,df_t,df_o],axis = 1)

df_final_tr.to_csv('/content/drive/MyDrive/Predicciones/MultiEt/Competi/PrediccionesFinalesME_Roberta.csv' , index = False)

**Votacion final de las predicciones**

In [None]:
# Definición de la función votaPredicción. Es importante incluir en pred_1 el la predicción del mejor modelo
def get_pred_final(pred_1, pred_2, pred_3):
  nIstancias = len(pred_1)
  labels_final = []
  pred_final = pred_1
  for i in range(nIstancias):
    if pred_2[i] == pred_3[i]:
      labels_final.append(pred_2[i])
    else:
      labels_final.append(pred_1[i])
  lf = pd.DataFrame(labels_final)
  pred_final['pred_label_Vot'] = lf

  return pred_final

In [None]:
#Leer predicciones
pred1 = pd.read_csv('/content/drive/MyDrive/Predicciones/MultiEt/PrediccionesFinalesME2.0.csv', encoding = 'UTF-8', sep=',')
pred2 = pd.read_csv('/content/drive/MyDrive/Predicciones/MultiEt/PrediccionesFinalesME.csv', encoding = 'UTF-8', sep=',')
#pred_3 = pd.read_csv(pred_path + fin_path + 'PrediccionXML.csv', encoding = 'UTF-8', sep=',')
p1_l = pred1['pred_label_L']
p2_l = pred2['pred_label_L']

p1_g = pred1['pred_label_G']
p2_g = pred2['pred_label_G']

p1_b = pred1['pred_label_B']
p2_b = pred2['pred_label_B']

p1_t = pred1['pred_label_T']
p2_t = pred2['pred_label_T']

p1_o = pred1['pred_label_O']
p2_o = pred2['pred_label_O']

finalG = get_pred_final(p1_g,p2_g,p1_t)
finalB = get_pred_final(p1_b,p2_b,p1_t)
finalL = get_pred_final(p1_l,p2_l,p1_t)
finalT = get_pred_final(p1_t,p2_t,p1_t)
finalO = get_pred_final(p1_o,p2_o,p1_t)
#Nuevo .csv pero solo con las predicciones finales
df_a = pd.concat([pred1['Index'],pred1['tuit']],axis = 1)
df_preds = pd.concat([finalL,finalG,finalB,finalT,finalO],axis = 1)
df_preds.columns = ['predL','predG','predB','predT','predO']

df_final = pd.concat([df_a,df_preds],axis = 1)

df_final_tr.to_csv('/content/drive/MyDrive/FicherosFinales/PrediccionesFinalesMEVotacion.csv' , index = False)

df_final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_final['pred_label_Vot'] = lf
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_final['pred_label_Vot'] = lf
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_final['pred_label_Vot'] = lf
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_final['pred_label_Vot'] = lf
A value is trying to be set on a

Unnamed: 0,Index,tuit,predL,predG,predB,predT,predO
0,689.0,estaban unos vatos gays enfrente de mi y de be...,0,1,0,0,0
1,849.0,no te preocupes yo no atiendo gays ni lesb...,1,1,0,0,0
2,82.0,tengo gustos bien maricas.,0,1,0,0,0
3,522.0,no los bebés son una bendición y tienen derec...,1,0,0,0,0
4,811.0,"ahhh ademas de chairo, mdejo y maricon ??? si...",0,1,0,0,0
5,395.0,o ya mejor me hago puto,0,1,0,0,0
6,754.0,toño es maricon,0,1,0,0,0
7,137.0,"no se conforman con ser jotas, tienen que ser...",0,1,0,0,1
8,500.0,osea eso que son dos maricones mejor hm,0,1,0,0,0
9,508.0,para mi que son los maricas amarillos que se ...,0,1,0,0,0


**CONSTRUCCION DEL FICHERO DE COMPETICION**

In [None]:
"""
Formato del modelo

'LGBTphobiaDetectionMultiLabeled' 'index' 'L' 'G' 'B' 'T' 'O'

"""
df_final =  pd.read_csv('/content/drive/MyDrive/Predicciones/MultiEt/Competi/PrediccionesFinalesME_MDeBerta.csv', encoding = 'UTF-8', sep=',')

pred_path = '/content/drive/MyDrive/FicherosFinales/'
ficheroFin = df_final.drop(['tuit'],axis = 1)
ficheroFin = ficheroFin.drop(ficheroFin.index[-1])

ficheroFin = ficheroFin.replace({0 :'X'})
ficheroFin['L'] = ficheroFin['L'].replace({1: "L"})
ficheroFin['G'] = ficheroFin['G'].replace({1: "G"})
ficheroFin['B'] = ficheroFin['B'].replace({1: "B"})
ficheroFin['T'] = ficheroFin['T'].replace({1: "T"})
ficheroFin['O'] = ficheroFin['O'].replace({1: "O"})


with open(pred_path + "prediction_multilabel_file_M.txt", "w") as file:
    for i in range(len(ficheroFin)):
        c1 = ficheroFin.loc[i, 'L']
        c2 = ficheroFin.loc[i, 'G']
        c3 = ficheroFin.loc[i, 'B']
        c4 = ficheroFin.loc[i, 'T']
        c5 = ficheroFin.loc[i, 'O']
        file.write('"LGBTphobiaDetectionMultiLabeled"\t"' + str(i) + '"\t"' +
                   c1 + '"\t"' + c2 +  '"\t"' + c3 +  '"\t"' + c4 +
                   '"\t"' + c5 + '"\n')

ficheroFin

Unnamed: 0,index,G,L,B,T,O
0,2,G,X,X,X,X
1,3,G,X,X,X,X
2,6,G,X,X,X,X
3,10,G,X,X,X,X
4,12,G,X,X,X,X
...,...,...,...,...,...,...
471,3807,G,X,X,X,X
472,3866,G,X,X,X,X
473,3900,X,X,X,T,X
474,3939,X,L,X,T,X


In [None]:
# Construir el fichero de salida que pida cada competición

#predicciones = df.drop(['id','reply_to','sentence','stereotype','labels','score_label'], axis=1)
#predicciones = df.drop([campo_texto,'label_sexist','label_vector','label_category','__index_level_0__','labels','score_label'], axis=1)
predicciones = df_test.drop([campo_texto,'score_label'], axis=1)
predicciones.rename(columns={'pred_label':'label_pred'}, inplace=True)
predicciones

In [None]:
predicciones['label_pred'] = predicciones['label_pred'].map({0:'not sexist',
                             1:'sexist'},na_action=None)
predicciones

In [None]:
# Guardamos el fichero de predicciones
fichero_pred = '/content/drive/MyDrive/EDOS/Predicciones/pred_model4_autotrainFive.csv'
predicciones.to_csv(fichero_pred, index=False, encoding='utf-8',header=True, sep=',')