## Classification with role description
This could be used after determining the first class (for which we have already achieved around 85% accuracy), to make a decision about fine-grained roles.
We will give BERT the next input: [CLS] text with [START_SPAN] named entity [END_SPAN] text [SEP] role description [SEP], 
and perform a binary classification task over the contextualized word embeddings whether a role describes named entity or not.

Intuitively this should be a better solution than just performing multilabel-classifiaction task over 12 or 6 classes, where we just present different classes with zeros and ones -> with given role description, we should know better if these classes describe the observed named entity.

### WIP

- potentially think of translating role descriptions into observed languages
- preprocessing text: ensure that there are no </s> tokens in the text input
- TODO: add token_type_id as parameter in XLMRobertaForSequenceClassification to make difference between context text and role text
- add ids to all inputs in testset for evaluation in the end

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# sub1 = 'drive/My Drive/Colab Notebooks/subtask1.parquet'
# print(sub1)

from pathlib import Path
wd = Path.cwd()
wd = wd.parent.parent
wd = wd / 'merged_data'
sub1 = str(wd) + '/subtask1.parquet'
sub2 = str(wd) + '/description_roles.csv'

In [2]:
import pandas as pd
df = pd.read_parquet(sub1)
df_roles = pd.read_csv(sub2)

In [8]:
df.where(df['art_name'] == 'BG_628.txt').dropna().loc[243].text

'Путин се оплака: Русия може да разчита само на себе си\n\nВладимир Путин се оплака, че никой няма да помогне на Русия, докато води война срещу Украйна.\nРуският президент каза, че страната му може да разчита "само на себе си", докато се обръщаше към военния персонал, събрал се в Георгиевската зала на Големия Кремълски дворец.\nВ забележки, заснети и публикувани в Telegram от държавната телевизия, той говори за необходимостта от стабилна икономика в Русия, която да подкрепя нейното министерство на отбраната.\n"Всеки има значение: и заплатата, и новото оборудване, и поръчките – това е много важно. Това е база", каза Путин.\n"В този смисъл можем да разчитаме само на себе си. Никой няма да дойде и да ни даде нещо с протегната ръка, всеки сам ще си го направи. Ние го можем", добави той.\nКоментарите на Путин идват, след като той подписа договор заедно със севернокорейския лидер Ким Чен Ун за обещание за взаимна военна помощ в случай на атака срещу Русия или Северна Корея.\nСпоразумението г

In [10]:
import re
def labelNum1(row):
    if row['class1'] == 'Antagonist':
        return int(0)
    if row['class1'] == 'Innocent':
        return int(1)
    if row['class1'] == 'Protagonist':
        return int(2)
def cleanText(row):
    text = str(row['text'])
    #text = re.sub(r'[^\w\s]', ' ', text)
    text = text.replace('\n',' ').replace('  ', ' ')
    return text

df['label1'] = df.apply(labelNum1,axis=1)
df['input'] = df.apply(cleanText,axis=1)

def labelNum2(row):
    labels2 = [0 for _ in range(12)]
    if row['label1'] == 2:
        if 'Guardian' in row['classes2']:
            labels2[0] = 1
        if 'Martyr' in row['classes2']:
            labels2[1] = 1
        if 'Peacemaker' in row['classes2']:
            labels2[2] = 1
        if 'Rebel' in row['classes2']:
            labels2[3] = 1
        if 'Underdog' in row['classes2']:
            labels2[4] = 1
        if 'Virtuous' in row['classes2']:
            labels2[5] = 1
    elif row['label1'] == 0:
        if 'Instigator' in row['classes2']:
           labels2[0] = 1
        if 'Conspirator' in row['classes2']:
            labels2[1] = 1
        if 'Tyrant' in row['classes2']:
            labels2[2] = 1
        if  'Foreign Adversary' in row['classes2']:
            labels2[3] = 1
        if 'Traitor' in row['classes2']:
            labels2[4] = 1
        if 'Spy' in row['classes2']:
            labels2[5] = 1
        if 'Saboteur' in row['classes2']:
            labels2[6] = 1
        if 'Corrupt' in row['classes2']:
            labels2[7] = 1
        if 'Incompetent' in row['classes2']:
            labels2[8] = 1
        if 'Terrorist' in row['classes2']:
            labels2[9] = 1
        if 'Deceiver' in row['classes2']:
            labels2[10] = 1
        if 'Bigot' in row['classes2']:
            labels2[11] = 1
    elif row['label1'] == 1:
        if 'Forgotten' in row['classes2']:
            labels2[0] = 1
        if 'Exploited' in row['classes2']:
            labels2[1] = 1
        if 'Victim' in row['classes2']:
            labels2[2] = 1
        if 'Scapegoat' in row['classes2']:
            labels2[3] = 1
    return labels2

df['label2'] = df.apply(labelNum2, axis=1)

In [11]:
#to clean and precise start and end positions of NEs

def find_all_substring_start_end(text, substring):
    matches = re.finditer(re.escape(substring), text)
    positions = [(match.start(), match.end()) for match in matches]

    return positions

def adjust_start_end(row):
    org_text,cl_text,start,end,entity = str(row['text']),str(row['input']),int(row['start']),int(row['end']),str(row['entity'])
    ss1 = find_all_substring_start_end(org_text,entity)
    ss2 = find_all_substring_start_end(cl_text,entity)
    a = 0
    for i in range(len(ss1)):
        if abs((ss1[i][0] - start) + (ss1[i][1] - end) ) <= 2:
            a = i
            break
    if org_text[ss1[a][0]:ss1[a][1]] != cl_text[ss2[a][0]:ss2[a][1]]:
        print("ERROR!")
    return ss2[a][0],ss2[a][1]

df['new_start_end'] = df.apply(adjust_start_end,axis=1)

# add SPAN_START and SPAN_END tokens to input
def addTokensToInput(row):
    inp = row['input']
    start,end = row['new_start_end']
    start = int(start)
    end = int(end)
    token_input = inp[:start] + "[SPAN_START] " + inp[start:end] + " [SPAN_END]" + inp[end:]
    return token_input

df['span_input'] = df.apply(addTokensToInput,axis=1)

# adjust again start and end positions
def upStartEnd(row):
    start,end = row['new_start_end']
    start += len("[SPAN_START] ")
    end += len("[SPAN_START] ")
    return start,end

df['new_start_end'] = df.apply(upStartEnd,axis = 1)

# prepare roles
def concatRoleWithDesc(row):
    return str(row['fine_grained_role']) + ': ' +  str(row['description'].replace('\n',' ').replace('  ', ' ').replace('\r', ' '))

df_roles["description_input"] = df_roles.apply(concatRoleWithDesc, axis=1)

descriptions_antagonist = df_roles.where(df_roles.main_role == 'Antagonist').dropna().description_input.to_list()
descriptions_innocent = df_roles.where(df_roles.main_role == 'Innocent').dropna().description_input.to_list()
descriptions_protagonist = df_roles.where(df_roles.main_role == 'Protagonist').dropna().description_input.to_list()


In [12]:
# prepare data for our second layer model after the first class is determined
antagonists_filter = df.class1 == 'Antagonist'
df_antagonists = df.where(antagonists_filter).dropna()

innocent_filter = df.class1 == 'Innocent'
df_innocent = df.where(innocent_filter).dropna()

protagonist_filter = df.class1 == 'Protagonist'
df_protagonist = df.where(protagonist_filter).dropna()

In [15]:
#before next step i need to split train and test data
def split_dataframe(dataf, test_size=0.2, random_state=None):
    df_shuffled = dataf.sample(frac=1, random_state=random_state).reset_index(drop=True)
    split_idx = int(len(dataf) * (1 - test_size))
    train_df = df_shuffled.iloc[:split_idx]
    test_df = df_shuffled.iloc[split_idx:]
    return train_df, test_df

train_df, test_df = split_dataframe(df_antagonists, test_size=0.2, random_state=42)

In [57]:
#prepare special inputs for our binary classification model

def expand_dataframe(df, descriptions):
    num_classes = len(descriptions)
    
    descriptions_df = pd.DataFrame({'description': descriptions, 'index': range(num_classes)})
    exploded_df = df.explode('label2').reset_index(drop=True)
    exploded_df['index'] = exploded_df.groupby(['art_name', 'entity', 'new_start_end']).cumcount()
    
    # merge descriptions with the exploded df_antagonist
    merged_df = exploded_df.merge(descriptions_df, on='index', how='left').drop('index', axis=1)
    return merged_df

train_df_antagonists = expand_dataframe(train_df, descriptions_antagonist)
test_df_antagonists =  expand_dataframe(test_df, descriptions_antagonist)
#now every row has its description role description and class 0 or 1 whether the description is true for the entity

def concatRoleDescription(row):
    return str(row['span_input']) + ' </s> ' + str(row['description'])

train_df_antagonists['text_role_input'] = train_df_antagonists.apply(concatRoleDescription, axis=1)
test_df_antagonists['text_role_input'] = test_df_antagonists.apply(concatRoleDescription, axis=1)
#now i can use this as a input for tokenizer (watchout max_langth of the input)


In [58]:
train_df_antagonists.loc[13643].text_role_input

'Путин се оплака: Русия може да разчита само на себе си Владимир Путин се оплака, че никой няма да помогне на Русия, докато води война срещу Украйна. Руският президент каза, че страната му може да разчита "само на себе си", докато се обръщаше към военния персонал, събрал се в Георгиевската зала на Големия Кремълски дворец. В забележки, заснети и публикувани в Telegram от държавната телевизия, той говори за необходимостта от стабилна икономика в Русия, която да подкрепя нейното министерство на отбраната. "Всеки има значение: и заплатата, и новото оборудване, и поръчките – това е много важно. Това е база", каза Путин. "В този смисъл можем да разчитаме само на себе си. Никой няма да дойде и да ни даде нещо с протегната ръка, всеки сам ще си го направи. Ние го можем", добави той. Коментарите на Путин идват, след като той подписа договор заедно със севернокорейския лидер Ким Чен Ун за обещание за взаимна военна помощ в случай на атака срещу Русия или Северна Корея. Споразумението гласи, че 

In [59]:
import torch
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizerFast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

#vidi jos dokumentaciju za token_type_ids_s
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2).to(device)
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base", sep_token = '</s>')

def preprocess_function(examples):
    return tokenizer(examples['text_role_input'], padding=True, truncation=True, max_length=8192, return_offsets_mapping=True)

extraTokens = {
    "additional_special_tokens": ["[SPAN_START]", "[SPAN_END]"]
}
num_added_toks = tokenizer.add_special_tokens(extraTokens)
model.resize_token_embeddings(len(tokenizer))

Using device: cpu


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(250004, 768, padding_idx=1)

In [60]:
train_data = train_df_antagonists.loc[ : , ['text_role_input', 'label2', 'new_start_end', 'entity', 'art_name']]
test_data = test_df_antagonists.loc[ : , ['text_role_input', 'label2', 'new_start_end', 'entity', 'art_name']]

In [61]:
train_data['tokenized']=train_data.apply(preprocess_function,axis=1)
test_data['tokenized']=test_data.apply(preprocess_function,axis=1)

In [83]:
# returns indices for spanned text
def indexes(row):
    off_mask = row['tokenized']['offset_mapping']
    start,end = row['new_start_end'][0],row['new_start_end'][1]
    inds = list()
    for p in range(len(off_mask)):
        if off_mask[p][0] >= start and off_mask[p][1] <= end:
            if p != len(off_mask)-1:
                inds.append(p)
    #if len(inds) > 1:
        #print("GREATER THAN 1")
    if len(inds) == 0:
        print(start,end)
    return inds
    
train_data['indexes'] = train_data.apply(indexes,axis=1)

#searching for sep index (one is between text input and role, and the other at the end of sequence)
def index_of_sep(row):
    sep_token_id = tokenizer.sep_token_id
    input_ids = row['input_ids']
    sep_token_indices = [i for i, token_id in enumerate(input_ids) if token_id == sep_token_id]
    
    if len(sep_token_indices) == 0:
        return -1
    else:
        return sep_token_indices[0] # make sure that </s> is removed when preprocessing input texts

train_data['sep_token_check'] = train_data['tokenized'].apply(index_of_sep)

In [None]:
train_data

In [142]:
#postupak samo stavljanja u tensor
train_data['list'] = train_data['tokenized'].apply(lambda x: x['input_ids'])
train_data['attention'] = train_data['tokenized'].apply(lambda x: x['attention_mask'])
ids = train_data['list']
att = train_data['attention']
indexes = train_data['indexes']
sep_index = train_data['sep_token_check']
tids = list()
tatt = list()
print(len(ids),len(att),len(indexes))
for i in range(len(ids)):
    tids.append(torch.tensor(ids[i]))
    tatt.append(torch.tensor(att[i]))

13644 13644 13644


In [None]:
# we need to be inside bert input length which is 512
def extract_for_model_input(ids, att, indexes, sep_index, max_tokens=510, context_window=440):
    tids = []
    tatt = []
    
    for i in range(len(ids)): 
        token_ids = ids[i]
        attention_mask = att[i]
        named_entity_indexes = indexes[i]
        separator_index = sep_index[i]
        
        context_start = max(0, min(named_entity_indexes) - context_window // 2)
        context_end = min(separator_index, max(named_entity_indexes) + context_window // 2)
        
        context_tokens = token_ids[context_start:context_end]
        context_attention = attention_mask[context_start:context_end]
        
        post_sep_tokens = token_ids[separator_index:]
        post_sep_attention = attention_mask[separator_index:]
        
        remaining_tokens = max_tokens - len(context_tokens)
        post_sep_tokens = post_sep_tokens[:remaining_tokens]
        post_sep_attention = post_sep_attention[:remaining_tokens]
        
        final_tokens = torch.cat([context_tokens, post_sep_tokens], dim=0)
        final_attention = torch.cat([context_attention, post_sep_attention], dim=0)
        
        tids.append(torch.tensor(final_tokens, dtype=torch.long))
        tatt.append(torch.tensor(final_attention, dtype=torch.long))
    
    return tids, tatt

tids, tatt = extract_for_model_input(tids, tatt, indexes, sep_index)


  tids.append(torch.tensor(final_tokens, dtype=torch.long))
  tatt.append(torch.tensor(final_attention, dtype=torch.long))


In [144]:
input_ids = list()
att_mask = list()
for ten,att in zip(tids,tatt):
    if len(ten) < 512:
        padding_length = 512 - len(ten)
        padding_tensor = torch.full((padding_length,), tokenizer.pad_token_id, dtype=ten.dtype)
        padding_tensor2 = torch.full((padding_length,), 0, dtype=att.dtype)
        ten = torch.cat((ten,padding_tensor),dim=0)
        att = torch.cat((att,padding_tensor2),dim=0)
    input_ids.append(ten)
    att_mask.append(att)
inputIds = torch.stack(input_ids)
attMask = torch.stack(att_mask)

In [158]:
import numpy as np
inputIds_np = inputIds.numpy()
attMask_np = attMask.numpy()
y_train = train_data['label2'].values
y_train_np = np.array(y_train.tolist(), dtype=np.int8)

In [159]:
X_train_ids = torch.tensor(inputIds_np, dtype=torch.long).to(device)
X_train_mask = torch.tensor(attMask_np, dtype=torch.long).to(device)
y_train = torch.tensor(y_train_np, dtype=torch.long).to(device)
y_train.shape

In [162]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(X_train_ids, X_train_mask, y_train)
#test_dataset = TensorDataset(X_test_ids, X_test_mask, y1_test, y2_test )

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True) #shuffle=True provides data shuffle for batches in different epochs
#test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [33]:
from torch.optim import AdamW
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 5


for epoch in range(num_epochs):

    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    train_progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{num_epochs}")
    
    for batch in train_progress_bar:
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(logits, dim=-1)
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)

        train_progress_bar.set_postfix({'loss': loss.item()})

    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_predictions

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Training loss: {avg_train_loss:.4f}, Training accuracy: {train_accuracy:.4f}")

    # model.eval()
    # test_loss = 0 
    # correct_test_predictions = 0
    # total_test_predictions = 0
    
    
    # test_progress_bar = tqdm(test_dataloader, desc=f"Test Epoch {epoch + 1}/{num_epochs}")
    
    # with torch.no_grad():
    #     for batch in test_progress_bar:
    #         input_ids = batch[0].to(device)
    #         attention_mask = batch[1].to(device)
    #         labels = batch[2].to(device)
            
    #         # Forward pass
    #         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    #         loss = outputs.loss
    #         logits = outputs.logits
            
    #         test_loss += loss.item()
            
            
    #         preds = torch.argmax(logits, dim=-1)
    #         correct_test_predictions += (preds == labels).sum().item()
    #         total_test_predictions += labels.size(0)
            
            
    #         test_progress_bar.set_postfix({'loss': loss.item()})
    
    # avg_test_loss = test_loss / len(test_dataloader)
    # test_accuracy = correct_test_predictions / total_test_predictions
    
    # print(f"Test loss: {avg_test_loss:.4f}, Test accuracy: {test_accuracy:.4f}")