# Politic_ES

Creación del modelo

In [88]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from datasets import Dataset

In [35]:
df = pd.read_csv(os.path.join("practise_data", "politicES_phase_2_train_public.csv"))
df

Unnamed: 0,label,gender,profession,ideology_binary,ideology_multiclass,tweet
0,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,¡Feliz 28 de febrero a todas las andaluzas y a...
1,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,"Feliz año nuevo, feliz esperanza 💕. Querido 20..."
2,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,🇩🇪🇪🇸 ¡Un placer encontrarme con mi homólogo al...
3,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,El conflicto en Ucrania ha supuesto una dramát...
4,0008c4fab9e97623a60380ee9c88cb20,female,politician,left,left,La Academia de la Llingua Asturiana realiza un...
...,...,...,...,...,...,...
179995,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,Desde un inicio nos opusimos a la escalda mili...
179996,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,En menos de 4 minutos he tratado de analizar e...
179997,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,Un fantasma recorre Euskal Herria y el Estado....
179998,ffd89e81d6f6c783bfb72a4590db4304,male,politician,left,left,Aquí os dejo mis reflexiones hoy en el diario ...


In [36]:
model_name="bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file config.json from cache at /home/brandon/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/brandon/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt

In [37]:
sentence = "How many people did Randy Steven Craft murder?"
print(sentence)
encoding = tokenizer(sentence)
encoding

How many people did Randy Steven Craft murder?


{'input_ids': [101, 2129, 2116, 2111, 2106, 9744, 7112, 7477, 4028, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [38]:
df['num_chars'] = df.apply(lambda row: len(row['tweet']), axis=1)
df['num_tokens'] = df.apply(lambda row: len(tokenizer(row['tweet'])['input_ids']), axis=1)

In [39]:
MAX_LENGTH = df['num_tokens'].max()
MAX_LENGTH

161

In [40]:
df_clean = df[['tweet', 'ideology_binary']].rename(columns={'tweet': 'text', 'ideology_binary': 'label'})
df_clean

Unnamed: 0,text,label
0,¡Feliz 28 de febrero a todas las andaluzas y a...,left
1,"Feliz año nuevo, feliz esperanza 💕. Querido 20...",left
2,🇩🇪🇪🇸 ¡Un placer encontrarme con mi homólogo al...,left
3,El conflicto en Ucrania ha supuesto una dramát...,left
4,La Academia de la Llingua Asturiana realiza un...,left
...,...,...
179995,Desde un inicio nos opusimos a la escalda mili...,left
179996,En menos de 4 minutos he tratado de analizar e...,left
179997,Un fantasma recorre Euskal Herria y el Estado....,left
179998,Aquí os dejo mis reflexiones hoy en el diario ...,left


# Separación en train y test

In [104]:
# Separamos en training y en test
parte_test = 0.3
df_train, df_test = np.split(df_clean, [int((1 - parte_test) * len(df))])

In [105]:
df_train

Unnamed: 0,text,label
0,¡Feliz 28 de febrero a todas las andaluzas y a...,left
1,"Feliz año nuevo, feliz esperanza 💕. Querido 20...",left
2,🇩🇪🇪🇸 ¡Un placer encontrarme con mi homólogo al...,left
3,El conflicto en Ucrania ha supuesto una dramát...,left
4,La Academia de la Llingua Asturiana realiza un...,left
...,...,...
125994,Pero qué cosa tan hermosa es este episodio sob...,left
125995,@user vaya fichaje que han hecho estos maldito...,left
125996,"Por si quedaban dudas, Lula ganó en el 'Ohio' ...",left
125997,Las ausencias de los máximos responsables polí...,left


In [106]:
df_test

Unnamed: 0,text,label
125999,Qué petición tan simple la que hace la cantant...,left
126000,"He ido a ver Cyrano de Bergerac, de la compañí...",right
126001,Brutal esta entrevista de ⁦@user ⁩ a a una ucr...,right
126002,"Cuando llegué a Madrid, mi compañera de piso d...",right
126003,4/ Mientras algunos quieren presentar a Shakir...,right
...,...,...
179995,Desde un inicio nos opusimos a la escalda mili...,left
179996,En menos de 4 minutos he tratado de analizar e...,left
179997,Un fantasma recorre Euskal Herria y el Estado....,left
179998,Aquí os dejo mis reflexiones hoy en el diario ...,left


In [107]:
def tokker(x):
    res = tokenizer(x['text'], padding="max_length", truncation=True, max_length=MAX_LENGTH)
    input_ids, token_type_ids, attention_mask = res['input_ids'], res['token_type_ids'], res['attention_mask']
    return x['text'], x['label'], input_ids, token_type_ids, attention_mask

df_train = df_train.apply(tokker, axis='columns', result_type='expand')
df_train.columns = ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']
df_train

Unnamed: 0,text,label,input_ids,token_type_ids,attention_mask
0,¡Feliz 28 de febrero a todas las andaluzas y a...,left,"[101, 1067, 10768, 3669, 2480, 2654, 2139, 131...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"Feliz año nuevo, feliz esperanza 💕. Querido 20...",left,"[101, 10768, 3669, 2480, 2019, 2080, 22250, 10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,🇩🇪🇪🇸 ¡Un placer encontrarme con mi homólogo al...,left,"[101, 100, 1067, 4895, 2173, 2099, 4372, 8663,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,El conflicto en Ucrania ha supuesto una dramát...,left,"[101, 3449, 4736, 2080, 4372, 15384, 21578, 20...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,La Academia de la Llingua Asturiana realiza un...,left,"[101, 2474, 16926, 2139, 2474, 2222, 2075, 669...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...
125994,Pero qué cosa tan hermosa es este episodio sob...,left,"[101, 2566, 2080, 10861, 2522, 3736, 9092, 201...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
125995,@user vaya fichaje que han hecho estos maldito...,left,"[101, 1030, 5310, 12436, 3148, 10882, 7507, 64...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
125996,"Por si quedaban dudas, Lula ganó en el 'Ohio' ...",left,"[101, 18499, 9033, 10861, 2850, 8193, 4241, 88...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
125997,Las ausencias de los máximos responsables polí...,left,"[101, 5869, 17151, 27742, 2015, 2139, 3050, 20...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [108]:
df_test = df_test.apply(tokker, axis='columns', result_type='expand')
df_test.columns = ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']
df_test

Unnamed: 0,text,label,input_ids,token_type_ids,attention_mask
125999,Qué petición tan simple la que hace la cantant...,left,"[101, 10861, 9004, 27113, 2078, 9092, 3722, 24...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
126000,"He ido a ver Cyrano de Bergerac, de la compañí...",right,"[101, 2002, 8909, 2080, 1037, 2310, 2099, 2233...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
126001,Brutal esta entrevista de ⁦@user ⁩ a a una ucr...,right,"[101, 12077, 9765, 2050, 4372, 7913, 11365, 26...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
126002,"Cuando llegué a Madrid, mi compañera de piso d...",right,"[101, 12731, 28574, 2222, 13910, 5657, 1037, 6...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
126003,4/ Mientras algunos quieren presentar a Shakir...,right,"[101, 1018, 1013, 2771, 4765, 8180, 2632, 1273...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...
179995,Desde un inicio nos opusimos a la escalda mili...,left,"[101, 4078, 3207, 4895, 1999, 27113, 16839, 16...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
179996,En menos de 4 minutos he tratado de analizar e...,left,"[101, 4372, 2273, 2891, 2139, 1018, 8117, 1616...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
179997,Un fantasma recorre Euskal Herria y el Estado....,left,"[101, 4895, 5470, 10230, 2863, 28667, 2953, 28...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
179998,Aquí os dejo mis reflexiones hoy en el diario ...,left,"[101, 1037, 15549, 9808, 2139, 5558, 28616, 22...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [109]:
# Labels se cambian a números
label2num = {
    'left': 0,
    'right': 1,
}

df_train['label'] = df_train['label'].apply(lambda x: label2num[x])
df_test['label'] = df_test['label'].apply(lambda x: label2num[x])

### Guardar como Dataset

In [112]:
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)

In [113]:
dataset_test['label']

[0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


## Modelo

In [114]:
TARGET_LABELS = ['left', 'right']
NUM_LABELS = len(TARGET_LABELS)

print('TARGET_LABELS:', TARGET_LABELS, 'num_labels:', NUM_LABELS)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

TARGET_LABELS: ['left', 'right'] num_labels: 2


loading configuration file config.json from cache at /home/brandon/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/brandon/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891c

In [115]:
args = TrainingArguments(output_dir="./outputs")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [116]:
args.evaluation_strategy="epoch"
args.per_device_train_batch_size = 32
args.per_device_eval_batch_size = 32

In [117]:
def compute_metrics(pred):
    """recibe un lote prediciones inferidas por el modelo. """
    y_true = pred.label_ids # son las labels en el gold standard
    y_pred = pred.predictions.argmax(-1) # pred.predictions devuelve una lista con las predicciones
                                        # para casda clase. Debemos quedarnos con la de mayor probabilidad.

    # como son varias clases, utilizaremos la macro
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    acc = accuracy_score(y_true, y_pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [118]:
trainer = Trainer(
    model=model,            # modelo que será ajustado
    args = args,     # hiperparámetros
    train_dataset=dataset_train, # conjunto training
    eval_dataset=dataset_test,   # conjunto de validación
    compute_metrics=compute_metrics,    # función para computar las métricas
)

In [119]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 125999
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 11814
  Number of trainable parameters = 109483778


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
trainer.evaluate()