In [1]:
import pandas as pd
import numpy as np
import sys
import os
import json
import transformers
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


  from .autonotebook import tqdm as notebook_tqdm
2023-04-17 11:04:12.301594: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


##  Data

### Train data


In [2]:
# Load the EXIST 2023 training split into a dict.
# The context manager closes the file handle as soon as parsing finishes (the
# previous bare open() never closed it). The explicit UTF-8 encoding keeps the
# decoding of non-ASCII tweet text independent of the machine's locale.
with open('/u/athbagde/ML_CL/EXIST 2023 Dataset 2/training/EXIST2023_training.json', "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
class my_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with numeric class labels.

    Args:
        encodings: Mapping that provides ``'input_ids'`` and ``'attention_mask'``
            entries, each indexable by sample position.
        labels: Sequence of numeric labels, one per sample; stored as one tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        """Return the inputs and label of sample ``idx`` as a dict.

        Keys are ``"input_ids"``, ``"labels"`` and ``"attention_mask"``.
        """
        input_ids = self.encodings['input_ids'][idx]
        # self.labels is already a tensor; wrapping one of its elements in
        # torch.tensor() again triggers PyTorch's copy-construct UserWarning on
        # every access. clone().detach() returns the same independent copy
        # without the warning.
        target_ids = self.labels[idx].clone().detach()
        attention_masks = self.encodings['attention_mask'][idx]
        return {"input_ids": input_ids, "labels": target_ids, "attention_mask": attention_masks}

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

In [4]:
# Empty frame with one column per field that is extracted from each sample.
train_column_names = ['id', 'tweet', 'lang', 'sex', 'age', 't1_lb', 't2_lb', 't3_lb']
training_set = pd.DataFrame(columns=train_column_names)

In [5]:
# Collect every row first and build the frame in a single call.
# Growing the frame with `.loc[len(...)] = row` re-allocates it on each insert
# (quadratic overall) and appends duplicate rows whenever this cell is re-run.
# Iterating over the values also avoids shadowing the built-in `id`.
train_records = [
    {
        'id': sample['id_EXIST'],
        'tweet': sample['tweet'],
        'lang': sample['lang'],
        'sex': sample['gender_annotators'],
        'age': sample['age_annotators'],
        't1_lb': sample['labels_task1'],
        't2_lb': sample['labels_task2'],
        't3_lb': sample['labels_task3'],
    }
    for sample in data.values()
]
# `columns=` pins the column order and keeps the schema even when `data` is empty.
training_set = pd.DataFrame(
    train_records,
    columns=['id', 'tweet', 'lang', 'sex', 'age', 't1_lb', 't2_lb', 't3_lb'],
)

In [6]:
# Park the per-tweet fields in the index so they are repeated, explode the
# remaining list-valued columns in parallel (one row per list element), then
# turn the index fields back into ordinary columns.
tweet_key_columns = ['id', 'tweet', 'lang']
train_indexed = training_set.set_index(tweet_key_columns)
training_set = train_indexed.apply(pd.Series.explode).reset_index()

In [7]:
# Preview the first rows of the exploded training frame.
training_set.head()

Unnamed: 0,id,tweet,lang,sex,age,t1_lb,t2_lb,t3_lb
0,100001,"@TheChiflis Ignora al otro, es un capullo.El p...",es,F,18-22,YES,REPORTED,[OBJECTIFICATION]
1,100001,"@TheChiflis Ignora al otro, es un capullo.El p...",es,F,23-45,YES,JUDGEMENTAL,"[OBJECTIFICATION, SEXUAL-VIOLENCE]"
2,100001,"@TheChiflis Ignora al otro, es un capullo.El p...",es,F,46+,NO,-,[-]
3,100001,"@TheChiflis Ignora al otro, es un capullo.El p...",es,M,46+,YES,REPORTED,[STEREOTYPING-DOMINANCE]
4,100001,"@TheChiflis Ignora al otro, es un capullo.El p...",es,M,23-45,YES,JUDGEMENTAL,[SEXUAL-VIOLENCE]


In [8]:
def _age_lang_subset(age, lang):
    """Return the rows of ``training_set`` whose 'age' and 'lang' values both match."""
    matches = (training_set['age'] == age) & (training_set['lang'] == lang)
    return training_set.loc[matches]


# Name scheme: <language>_train_set_<y|m|o> for the '18-22', '23-45' and '46+'
# values of the 'age' column.
en_train_set_y = _age_lang_subset('18-22', 'en')
es_train_set_y = _age_lang_subset('18-22', 'es')
en_train_set_m = _age_lang_subset('23-45', 'en')
es_train_set_m = _age_lang_subset('23-45', 'es')
en_train_set_o = _age_lang_subset('46+', 'en')
es_train_set_o = _age_lang_subset('46+', 'es')

### Eval data


In [9]:
# Load the EXIST 2023 dev split into a dict (this rebinds `data`).
# The context manager closes the file handle as soon as parsing finishes (the
# previous bare open() never closed it). The explicit UTF-8 encoding keeps the
# decoding of non-ASCII tweet text independent of the machine's locale.
with open('/u/athbagde/ML_CL/EXIST 2023 Dataset 2/dev/EXIST2023_dev.json', "r", encoding="utf-8") as f:
    data = json.load(f)

In [10]:
# Empty frame for the dev split, using the same columns as the training frame.
val_column_names = ['id', 'tweet', 'lang', 'sex', 'age', 't1_lb', 't2_lb', 't3_lb']
val_training_set = pd.DataFrame(columns=val_column_names)

In [11]:
# Collect every dev row first and build the frame in a single call.
# Growing the frame with `.loc[len(...)] = row` re-allocates it on each insert
# (quadratic overall) and appends duplicate rows whenever this cell is re-run.
# Iterating over the values also avoids shadowing the built-in `id`.
val_records = [
    {
        'id': sample['id_EXIST'],
        'tweet': sample['tweet'],
        'lang': sample['lang'],
        'sex': sample['gender_annotators'],
        'age': sample['age_annotators'],
        't1_lb': sample['labels_task1'],
        't2_lb': sample['labels_task2'],
        't3_lb': sample['labels_task3'],
    }
    for sample in data.values()
]
# `columns=` pins the column order and keeps the schema even when `data` is empty.
val_training_set = pd.DataFrame(
    val_records,
    columns=['id', 'tweet', 'lang', 'sex', 'age', 't1_lb', 't2_lb', 't3_lb'],
)

In [12]:
# Same reshaping as the training frame: per-tweet fields go into the index, the
# remaining list-valued columns are exploded in parallel, and the index fields
# come back as ordinary columns.
val_key_columns = ['id', 'tweet', 'lang']
val_indexed = val_training_set.set_index(val_key_columns)
val_set = val_indexed.apply(pd.Series.explode).reset_index()

In [13]:
# Keep only the dev rows whose 'lang' value is 'en'.
val_is_english = val_set['lang'] == 'en'
en_val_set = val_set.loc[val_is_english]

In [24]:
# Preview the first rows of the English dev subset.
en_val_set.head()

Unnamed: 0,id,tweet,lang,sex,age,t1_lb,t2_lb,t3_lb
3294,400001,"@Mike_Fabricant “You should smile more, love. ...",en,F,18-22,NO,-,[-]
3295,400001,"@Mike_Fabricant “You should smile more, love. ...",en,F,23-45,NO,-,[-]
3296,400001,"@Mike_Fabricant “You should smile more, love. ...",en,F,46+,NO,-,[-]
3297,400001,"@Mike_Fabricant “You should smile more, love. ...",en,M,18-22,NO,-,[-]
3298,400001,"@Mike_Fabricant “You should smile more, love. ...",en,M,23-45,YES,REPORTED,"[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINANC..."


## Models

### RoBERTa -- training on English data

In [14]:
# Keep only the training rows whose 'lang' value is 'en'.
train_is_english = training_set['lang'] == 'en'
en_train_set = training_set.loc[train_is_english]

In [15]:
# check if we have cuda installed
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the 'roberta-large' checkpoint.
tokenizer_checkpoint = 'roberta-large'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)


In [17]:
# Plain Python lists of tweet texts and task-1 labels for the English subsets.
features = en_train_set['tweet'].tolist()
targets = en_train_set['t1_lb'].tolist()
val_features = en_val_set['tweet'].tolist()
val_targets = en_val_set['t1_lb'].tolist()

In [18]:
MAX_LEN = 128

# Encoding options for the training tweets: every sequence comes out exactly
# MAX_LEN tokens long.
train_encode_options = dict(
    add_special_tokens=True,     # add the tokenizer's special tokens
    padding='max_length',        # pad shorter sequences up to max_length
    truncation=True,             # cut longer sequences down to max_length
    max_length=MAX_LEN,
    return_attention_mask=True,  # also return the attention mask
)
tokenized_feature = tokenizer(features, **train_encode_options)

In [19]:

# Encode the validation tweets.
# Truncation is capped at the same MAX_LEN as the training encodings, so both
# splits are cut at the same length. Without an explicit max_length,
# truncation=True falls back to the model's maximum input length.
# No padding is requested here, so each sequence keeps its own length.
val_tokenized_feature = tokenizer(
    # Sentences to encode
    val_features,
    # Add the tokenizer's special tokens
    add_special_tokens=True,
    # Truncate to max_length
    truncation=True,
    # Same cap as the training split
    max_length=MAX_LEN,
)

In [20]:
# Fit the encoder on the training labels and map them to integer ids in one step.
le = LabelEncoder()
target_num = le.fit_transform(targets)

In [21]:
# Encode the validation labels with the encoder already fitted on the training
# labels. Fitting a second LabelEncoder on the validation labels (and rebinding
# `le` to it) derives the label -> id mapping from whichever classes occur in
# the validation split, which can disagree with the ids used for training.
# transform() raises ValueError for a label that was not seen during fitting.
val_target_num = le.transform(val_targets)

In [22]:
# Display the integer-encoded validation labels.
val_target_num

array([0, 0, 0, ..., 1, 1, 1])

In [27]:
# Aliases for the training token ids, integer labels and attention masks.
train_inputs = tokenized_feature['input_ids']
train_labels = target_num
train_masks = tokenized_feature['attention_mask']

In [28]:
# Aliases for the validation token ids, integer labels and attention masks.
val_inputs = val_tokenized_feature['input_ids']
val_labels = val_target_num
val_masks = val_tokenized_feature['attention_mask']

In [38]:
batch_size = 16
# Wrap the encodings and integer labels of each split in a my_Dataset.
train_data = my_Dataset(encodings=tokenized_feature, labels=target_num)
val_data = my_Dataset(encodings=val_tokenized_feature, labels=val_target_num)

In [39]:
from transformers import RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# One output class per distinct label value in the training targets.
n_classes = len(set(targets))
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-large",
    num_labels=n_classes,
    output_attentions=False,     # do not return attention weights
    output_hidden_states=False,  # do not return all hidden states
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'clas

In [40]:
# AdamW over all model parameters.
# NOTE(review): the Trainer constructed later in this notebook is not given
# this optimizer in the visible cells — confirm whether it is still needed.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [41]:
# Move the model to the device selected earlier. A bare model.cuda() fails on
# CPU-only machines even though `device` already falls back to the CPU.
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [42]:
import evaluate

# Accuracy metric object used by compute_metrics below.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the class with the highest
            score along axis 1 of ``predictions`` is taken as the prediction.

    Returns:
        The result of ``accuracy.compute`` for the predicted classes against
        the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)
# Fine-tuning configuration: evaluate and save a checkpoint after every epoch,
# and load the best model (by 'accuracy') once training ends.
args = TrainingArguments(
    # Plain string literal: the previous f-prefix had no placeholders.
    output_dir="RoBERTa-finetuned-task1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

In [43]:
# Bundle the model, training configuration, datasets, tokenizer and metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [44]:
# Start fine-tuning with the Trainer configured above.
# NOTE(review): the recorded output ends in KeyboardInterrupt, so this notebook
# holds no completed training results.
trainer.train()

  target_ids = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [103]:
# Display the training dataset object.
# NOTE(review): the recorded output shows a TensorDataset, but the visible
# cells build train_data as a my_Dataset, and this cell's execution count
# (103) is out of sequence. The output looks stale; re-run from a fresh kernel.
train_data

<torch.utils.data.dataset.TensorDataset at 0x7fbc2f48dd50>