# Solution 2

### Using xlm-roberta for vectorization and then a simple neural net for label classification

#### Let's load the data.

In [1]:
from pathlib import Path
wd = Path.cwd()
wd = wd.parent.parent
wd = wd / 'merged_data' 
sub1 = str(wd) + '/subtask1.parquet'
print(sub1)

/home/matijak/Documents/programiranje/projects/semeval/merged_data/subtask1.parquet


In [2]:
import pandas as pd
df = pd.read_parquet(sub1)
df.head()

Unnamed: 0,lang,art_name,entity,start,end,class1,classes2,text
0,BG,BG_670.txt,Запад,152,156,Antagonist,"[Conspirator, Instigator, Foreign Adversary]",Опитът на колективния Запад да „обезкърви Руси...
1,BG,BG_670.txt,САЩ,530,532,Antagonist,[Instigator],Опитът на колективния Запад да „обезкърви Руси...
2,BG,BG_670.txt,НАТО,535,538,Antagonist,[Instigator],Опитът на колективния Запад да „обезкърви Руси...
3,BG,BG_670.txt,Украйна,578,584,Antagonist,[Foreign Adversary],Опитът на колективния Запад да „обезкърви Руси...
4,BG,BG_670.txt,украински войници,633,649,Innocent,[Victim],Опитът на колективния Запад да „обезкърви Руси...


#### Now lets clean article text

In [3]:
import re
def labelNum(row):
    if row['class1'] == 'Antagonist':
        return int(0)
    if row['class1'] == 'Innocent':
        return int(1)
    if row['class1'] == 'Protagonist':
        return int(2)
def cleanText(row):
    text = str(row['text'])
    #text = re.sub(r'[^\w\s]', ' ', text)
    text = text.replace('\n',' ').replace('  ', ' ')
    return text
df['label'] = df.apply(labelNum,axis=1)
df['input'] = df.apply(cleanText,axis=1)
df.loc[448]

lang                                                       EN
art_name                                     EN_UA_103861.txt
entity                                                Chinese
start                                                     791
end                                                       797
class1                                             Antagonist
classes2                                                [Spy]
text        The World Needs Peacemaker Trump Again \n\n by...
label                                                       0
input       The World Needs Peacemaker Trump Again  by Jef...
Name: 448, dtype: object

In [4]:
def find_all_substring_start_end(text, substring):
    # Use re.finditer to find all occurrences of the substring in the text
    matches = re.finditer(re.escape(substring), text)
    
    # Collect the start and end indices of all matches
    positions = [(match.start(), match.end()) for match in matches]
    
    return positions
def adjust_start_end(row):
    org_text,cl_text,start,end,entity = str(row['text']),str(row['input']),int(row['start']),int(row['end']),str(row['entity'])
    ss1 = find_all_substring_start_end(org_text,entity)
    ss2 = find_all_substring_start_end(cl_text,entity)
    #print(ss1,ss2)
    #print(row['text'][start:end])
    a = 0
    for i in range(len(ss1)):
        if abs((ss1[i][0] - start) + (ss1[i][1] - end) ) <= 2:
            a = i
            break
    if org_text[ss1[a][0]:ss1[a][1]] != cl_text[ss2[a][0]:ss2[a][1]]:
        print("ERROR!")
    return ss2[a][0],ss2[a][1]
print(df.loc[0])
df['new_start_end'] = df.apply(adjust_start_end,axis=1)

lang                                                       BG
art_name                                           BG_670.txt
entity                                                  Запад
start                                                     152
end                                                       156
class1                                             Antagonist
classes2         [Conspirator, Instigator, Foreign Adversary]
text        Опитът на колективния Запад да „обезкърви Руси...
label                                                       0
input       Опитът на колективния Запад да „обезкърви Руси...
Name: 0, dtype: object


In [5]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizerFast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=3).to(device)
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

def preprocess_function(examples):
    return tokenizer(examples['input'], padding=True, truncation=True,max_length=8192,return_offsets_mapping=True)

data = df.loc[ : , ['input','label','new_start_end','entity']]
data['tokenized']=data.apply(preprocess_function,axis=1)

Using device: cuda


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
print(data.iloc[0]['tokenized'])

{'input_ids': [0, 1089, 22617, 1669, 29, 47829, 2097, 32275, 69, 137, 197, 35359, 53335, 2827, 40053, 155, 135, 128601, 29, 12747, 226, 49, 94511, 137, 2687, 591, 7533, 135, 10099, 54293, 35, 25977, 245, 131732, 155, 35, 18777, 183, 159814, 153, 1089, 22617, 1669, 29, 47829, 2097, 32275, 69, 137, 197, 35359, 53335, 2827, 40053, 155, 135, 128601, 29, 12747, 226, 49, 94511, 137, 2687, 591, 7533, 135, 10099, 54293, 35, 25977, 245, 131732, 155, 35, 18777, 183, 159814, 4629, 69, 62086, 16846, 33318, 4, 3756, 77, 63084, 15258, 1669, 29, 92173, 59, 6208, 29, 6047, 39540, 197, 14114, 16641, 44267, 5, 61216, 193342, 43219, 84535, 2262, 36690, 45961, 213358, 222, 31458, 2549, 29, 45775, 59, 29, 103285, 245, 34078, 29, 40108, 47239, 303, 3512, 105, 22192, 4, 12434, 47853, 19737, 245, 6, 163308, 183, 109560, 205, 29, 40108, 135694, 25223, 650, 447, 3873, 8458, 63522, 5, 44, 123209, 24724, 2374, 205, 29, 40108, 4, 20292, 35, 4907, 155386, 74300, 4301, 61, 51192, 205, 49, 159814, 19173, 40053, 218, 

In [7]:
#print(data.loc[0]['tokenized'])
def indexes(row):
    off_mask = row['tokenized']['offset_mapping']
    start,end = row['new_start_end'][0],row['new_start_end'][1]
    inds = list()
    for p in range(len(off_mask)):
        if off_mask[p][0] >= start and off_mask[p][1] <= end:
            if p != len(off_mask)-1:
                inds.append(p)
    #if len(inds) > 1:
        #print("GREATER THAN 1")
    if len(inds) == 0:
        print(start,end)
    return inds
data['indexes'] = data.apply(indexes,axis=1)

In [8]:
print(data.loc[448],data.loc[1000])

input            The World Needs Peacemaker Trump Again  by Jef...
label                                                            0
new_start_end                                           (785, 792)
entity                                                     Chinese
tokenized              [input_ids, attention_mask, offset_mapping]
indexes                                                      [180]
Name: 448, dtype: object input            जयपुर में जलवायु परिवर्तन को लेकर स्टेट लेवल ट...
label                                                            2
new_start_end                                           (623, 636)
entity                                               केन्द्र सरकार
tokenized              [input_ids, attention_mask, offset_mapping]
indexes                                                 [162, 163]
Name: 1000, dtype: object


In [9]:
data['list'] = data['tokenized'].apply(lambda x: x['input_ids'])
data['attention'] = data['tokenized'].apply(lambda x: x['attention_mask'])
ids = data['list']
att = data['attention']
indexes = data['indexes']
tids = list()
tatt = list()
print(len(ids),len(att),len(indexes))
for i in range(len(ids)):
    tids.append(torch.tensor(ids[i]).to(device))
    tatt.append(torch.tensor(att[i]).to(device))
print(tids[0],tatt[0])
print(indexes[448])

2535 2535 2535
tensor([     0,   1089,  22617,   1669,     29,  47829,   2097,  32275,     69,
           137,    197,  35359,  53335,   2827,  40053,    155,    135, 128601,
            29,  12747,    226,     49,  94511,    137,   2687,    591,   7533,
           135,  10099,  54293,     35,  25977,    245, 131732,    155,     35,
         18777,    183, 159814,    153,   1089,  22617,   1669,     29,  47829,
          2097,  32275,     69,    137,    197,  35359,  53335,   2827,  40053,
           155,    135, 128601,     29,  12747,    226,     49,  94511,    137,
          2687,    591,   7533,    135,  10099,  54293,     35,  25977,    245,
        131732,    155,     35,  18777,    183, 159814,   4629,     69,  62086,
         16846,  33318,      4,   3756,     77,  63084,  15258,   1669,     29,
         92173,     59,   6208,     29,   6047,  39540,    197,  14114,  16641,
         44267,      5,  61216, 193342,  43219,  84535,   2262,  36690,  45961,
        213358,    222,  

In [10]:
sliced_ids = list()
sliced_ntids = list()
sliced_att = list()
key_inds = list()
key_ids = list()

def slices(index,size,context_size):
    if (size<context_size):
        return 0,size
    lower_c = int(context_size/2-1)
    upper_c = int(context_size/2)
    #print(lower_c,upper_c)
    if index < lower_c:
        return 0,context_size
    elif index >= lower_c:
        if index + upper_c > size:
            return index-(context_size-(size-index)), size
        else:
            return index-lower_c,index+upper_c+1  


for i in range(len(tids)):
    slower,supper = slices(indexes[i][0],len(tids[i]),512)
    key_id = ids[i][indexes[i][0]]
    #key_tid = tids[i][indexes[i][0]]
    pid = ids[i][slower:supper]
    key_inds.append([])
    for j in indexes[i]:
        
        key_id = ids[i][j]
        if key_id not in pid:
            print(len(ids[i]),key_id,slower,supper,indexes[i])
        key_inds[i].append(pid.index(key_id))
    sliced_ids.append(tids[i][slower:supper])
    sliced_att.append(tatt[i][slower:supper])

Min = 10000
for i in range(len(indexes)):
    if len(indexes[i]) < Min:
        Min = len(indexes[i])
#print(Min)
#print(len(sliced_ids[448]),len(tids[448]),sliced_ids[0],sliced_att[0])
print(key_inds[448],tids[448][99])

[99] tensor(76438, device='cuda:0')


In [11]:
from torch.utils.data import DataLoader, TensorDataset

input_ids_batch = torch.nn.utils.rnn.pad_sequence(sliced_ids, batch_first=True, padding_value=0)
attention_mask_batch = torch.nn.utils.rnn.pad_sequence(sliced_att, batch_first=True, padding_value=0)

dataset = TensorDataset(input_ids_batch, attention_mask_batch)

dataloader = DataLoader(dataset, batch_size=100)

ind = 0

vectors = []

#print(input_ids_batch)
for batch in dataloader:
    #print(len(batch[0]),batch)
    input_for_model = {
        "input_ids": batch[0],
        "attention_mask" : batch[1]
    }

    with torch.no_grad():  # Disable gradients for inference
        outputs = model(**input_for_model,output_hidden_states=True)
    hidden_states = outputs.hidden_states
    last_hs = hidden_states[-1]
    #print(last_hs.shape)
    for i in range(len(last_hs)):
        vectors.append([])
        for j in range(len(key_inds[ind])):
            vectors[len(vectors)-1].append(last_hs[i][key_inds[ind][j]])
        ind+=1        

In [18]:
print(vectors[500])
#print(vectors[448],key_inds[448])

[tensor([ 1.3721e-01,  5.8365e-02, -1.8918e-03,  1.2958e-02,  9.0196e-02,
        -2.8171e-01, -7.7389e-02,  3.1194e-01,  7.1298e-02,  6.7493e-02,
        -1.2208e-01,  9.8245e-03,  5.2699e-01, -2.2419e-01,  2.9159e-03,
         9.0216e-02,  5.0927e-03, -1.8901e-02,  1.9131e-01, -7.9771e-02,
        -1.8396e-01, -3.9475e-03, -2.7776e-02, -4.2522e-03,  1.9864e-01,
        -6.4600e-02,  8.8462e-02, -2.9232e-02,  1.8151e-01, -9.8562e-02,
         5.9715e-02,  1.4491e-02,  9.2272e-03,  9.4080e-02,  3.4745e-02,
         1.6201e-01,  5.4662e-02,  1.8258e-02,  1.2945e-01,  4.1608e-03,
         4.8872e-02, -5.1477e-02,  4.3682e-02,  4.6984e-02,  6.0203e-02,
        -3.5895e-02,  1.5181e-02,  3.4139e-02,  6.0316e-02, -6.0984e-02,
         4.9948e-02, -4.2457e-02,  1.3681e-02,  1.2307e-01,  2.8496e-02,
        -1.0265e-01,  8.4510e-02, -5.2374e-02,  2.6985e-03,  1.7935e-01,
         5.5537e-02,  3.6382e-02,  3.8826e-02, -1.5632e-01,  1.5230e-01,
        -3.0959e-02, -1.3754e-01,  2.8239e-02, -7.

In [26]:
summed_vectors = list()
for i in range(len(vectors)):
    Sum = 0
    for j in range(len(vectors[i])):
        Sum += vectors[i][j]
    summed_vectors.append(Sum)

print(len(summed_vectors))
print(summed_vectors[500])

2535
tensor([ 2.1903e-01,  1.3612e-01,  3.0061e-03,  6.3634e-02,  3.0063e-01,
        -3.2476e-01, -1.2117e-01,  6.2033e-01,  1.6115e-01, -2.3365e-02,
        -3.4973e-01,  3.5852e-02,  1.1043e+00, -5.4091e-01, -2.0531e-02,
         1.3771e-01, -7.5579e-03, -2.4093e-02,  2.6050e-01, -9.7077e-02,
        -8.7336e-02,  7.0913e-03, -2.6048e-01,  4.1789e-03,  3.1120e-01,
        -1.6637e-01,  1.6581e-01, -1.3486e-01,  1.8154e-01, -1.8161e-01,
         9.9744e-02,  2.5785e-02, -1.5426e-02,  2.8224e-02, -2.4930e-02,
         4.1778e-01,  1.2799e-01,  1.0440e-02,  3.9570e-01, -5.4062e-02,
         8.4557e-02, -1.1577e-01, -2.0414e-03,  1.6460e-01,  6.2705e-02,
        -3.9548e-02, -1.2040e-02,  1.2555e-02,  1.3330e-01, -1.0093e-01,
         9.4497e-02, -3.3286e-01,  4.1473e-02,  1.0289e-01, -1.6245e-01,
        -1.8326e-01,  1.8391e-01, -1.2222e-01,  8.1356e-02,  4.7718e-01,
         1.6321e-01,  2.4989e-01,  4.0190e-02,  1.0146e-01, -1.0443e-04,
        -5.5759e-02, -2.1894e-01,  4.6052e-02,