# Multitask Hierarchical Neural Network for Persuasion Techniques Detection

This is the KB team's solution for SemEval-2023 Task 3, Subtask 3.

In [None]:
import random
import os
import numpy as np
import torch
def seed_everything(seed=73):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and configures cuDNN for repeatable kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this only reaches processes launched from here.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may otherwise select different kernels between runs.
    torch.backends.cudnn.benchmark = False


seed_everything(1234)


In [None]:
# Language code of the data to process. It is used further down to build the
# span-folder paths and to pick the spaCy model ("en" and "po" are handled).
lang="en" #set language

## READ input data and spans

In [None]:
import pandas as pd

In [None]:
from tqdm import tqdm

def make_dataframe_subtask3(input_folder, labels_fn=None, spans=None):
    """Build a per-line DataFrame of article text with its span annotations.

    Args:
        input_folder: Directory holding the article files; only files named
            ``art*.txt`` are read. The article id is taken from the file name
            after the first seven characters, up to the first dot.
        labels_fn: Optional path to a tab-separated file with the columns
            id, line and labels (no header). When given, the result is the
            labels table left-joined with the article lines.
        spans: Optional directory of ``*.txt`` span files whose
            whitespace-separated fields are id, label, start and end.
            When omitted, every line gets an empty span list.

    Returns:
        A DataFrame indexed by ``(id, line)``. Without ``labels_fn`` it has
        the columns ``text`` and ``spans``; with it, ``text``, ``spans`` and
        ``labels``. Each entry of ``spans`` is a list of dicts with the keys
        ``start``, ``end`` (offsets relative to the line) and ``label``.
    """
    # --- span annotations, grouped by article id -------------------------
    spans_by_article = {}
    if spans is not None:
        span_files = [f for f in os.listdir(spans) if f.endswith('.txt')]
        print("number of files: ", len(span_files))
        for name in span_files:
            with open(os.path.join(spans, name), encoding="utf-8") as file:
                for raw in file:
                    fields = raw.split()
                    if len(fields) < 4:
                        # blank or truncated line: nothing to record
                        continue
                    spans_by_article.setdefault(fields[0], []).append(
                        (int(fields[2]), int(fields[3]), fields[1])
                    )

    # --- article lines ----------------------------------------------------
    rows = []
    article_files = [
        f for f in os.listdir(input_folder)
        if f.endswith('.txt') and f.startswith("art")
    ]
    for fil in tqdm(article_files):
        iD = fil[7:].split('.')[0]
        with open(os.path.join(input_folder, fil), 'r',
                  encoding="utf-8", errors='ignore') as handle:
            article = handle.read()
        article_spans = spans_by_article.get(iD, [])

        # Walk the lines twice in lockstep: once without terminators (the text
        # we keep) and once with them, so the running offset counts the
        # newline characters that belong to the article as read.
        line_start = 0
        pairs = zip(article.splitlines(), article.splitlines(keepends=True))
        for line_no, (content, raw_line) in enumerate(pairs, 1):
            line_end = line_start + len(raw_line)
            line_spans = [
                {"start": start - line_start, "end": end - line_start, "label": label}
                for start, end, label in article_spans
                if start >= line_start and end <= line_end
            ]
            rows.append({"id": iD, "line": line_no, "text": content, "spans": line_spans})
            line_start = line_end

    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    df_text = pd.DataFrame(rows, columns=['id', 'line', 'text', "spans"])
    print(df_text)
    df_text.id = df_text.id.apply(int)
    df_text.line = df_text.line.apply(int)
    df_text = df_text[df_text.text.str.strip().str.len() > 0].copy()
    df_text = df_text.set_index(['id', 'line'])

    df = df_text

    if labels_fn:
        # label table: one row per annotated (id, line)
        labels = pd.read_csv(labels_fn, sep='\t', encoding='utf-8', header=None)
        labels = labels.rename(columns={0: 'id', 1: 'line', 2: 'labels'})
        labels = labels.set_index(['id', 'line'])
        labels = labels[labels.labels.notna()].copy()

        # keep only annotated lines, with their text and spans attached
        df = labels.join(df_text)[['text', 'spans', 'labels']]
    return df

In [None]:
# All inputs live under /data/<lang>/ so switching `lang` switches every path,
# not only the span folders.
data_dir = f"/data/{lang}"
labels_test_fn = f"{data_dir}/dev-labels-subtask-3.txt"
folder_dev = f"{data_dir}/dev-articles-subtask-3/"
labels_train_fn = f"{data_dir}/train-labels-subtask-3.txt"
folder_train = f"{data_dir}/train-articles-subtask-3/"
train_span = f"{data_dir}/train-labels-subtask-3-spans"
test_span = f"{data_dir}/dev-labels-subtask-3-spans"

print('Loading training...')
train = make_dataframe_subtask3(folder_train, labels_train_fn, train_span)
print('Loading dev...')
test = make_dataframe_subtask3(folder_dev, labels_test_fn, test_span)

In [None]:
# make_dataframe_subtask3 returns frames indexed by (id, line); move both back
# into ordinary columns, since MyDataset reads them as df["id"] and df["line"].
train.reset_index(inplace=True)
test.reset_index(inplace=True)

# notebook display of the training frame
train

## Convert spans to token-level BIO tags

In [None]:

import spacy


#add spacy model for chosen language
# add spacy model for chosen language
SPACY_MODELS = {"po": "pl_core_news_sm", "en": "en_core_web_sm"}
if lang not in SPACY_MODELS:
    raise ValueError(f"No spaCy model configured for language {lang!r}")
nlp = spacy.load(SPACY_MODELS[lang])

train["tokens"] = ""
train["pos"] = ""

# train
# One tag per spaCy token: "B-<label>" on the first token of a span,
# "I-<label>" on its following tokens and "O" elsewhere. Tokens are matched to
# spans by character offset, so the tag list always has one entry per token.
# Where spans overlap, the span that starts first wins.
train_tags = []
for sentence, annotations in zip(train["text"], train["spans"]):
    # rows without text/spans (e.g. after a join miss) get no tags
    if not isinstance(sentence, str):
        train_tags.append([])
        continue
    ordered = sorted(
        annotations if isinstance(annotations, list) else [],
        key=lambda ann: ann["start"],
    )

    tag = []
    previous = None  # span the preceding token was assigned to
    for token in nlp(sentence):
        token_start = token.idx
        token_end = token.idx + len(token.text)
        covering = next(
            (ann for ann in ordered
             if ann["start"] < token_end and token_start < ann["end"]),
            None,
        )
        if covering is None:
            tag.append("O")
        elif covering is previous:
            tag.append("I-" + covering["label"])
        else:
            tag.append("B-" + covering["label"])
        previous = covering
    train_tags.append(tag)

# assign the whole column at once instead of cell-by-cell chained indexing
train["mani_tags"] = train_tags

 

In [None]:
test["tokens"] = ""
test["pos"] = ""

# test
# One tag per spaCy token: "B-<label>" on the first token of a span,
# "I-<label>" on its following tokens and "O" elsewhere. Tokens are matched to
# spans by character offset, so the tag list always has one entry per token.
# Where spans overlap, the span that starts first wins.
test_tags = []
for sentence, annotations in zip(test["text"], test["spans"]):
    # rows without text/spans (e.g. after a join miss) get no tags
    if not isinstance(sentence, str):
        test_tags.append([])
        continue
    ordered = sorted(
        annotations if isinstance(annotations, list) else [],
        key=lambda ann: ann["start"],
    )

    tag = []
    previous = None  # span the preceding token was assigned to
    for token in nlp(sentence):
        token_start = token.idx
        token_end = token.idx + len(token.text)
        covering = next(
            (ann for ann in ordered
             if ann["start"] < token_end and token_start < ann["end"]),
            None,
        )
        if covering is None:
            tag.append("O")
        elif covering is previous:
            tag.append("I-" + covering["label"])
        else:
            tag.append("B-" + covering["label"])
        previous = covering
    test_tags.append(tag)

# assign the whole column at once instead of cell-by-cell chained indexing
test["mani_tags"] = test_tags


In [None]:

import pandas as pd 

# Every technique name occurring in the comma-separated training labels
# (duplicates kept; they are removed when the vocabulary is built).
frames_list = [f for frames in train["labels"] for f in frames.split(",")]

# Sort the vocabulary so each technique gets the same id on every run;
# plain set iteration order can change between interpreter sessions.
frame_names = sorted(set(frames_list))
frames_to_ids = {name: idx for idx, name in enumerate(frame_names)}
ids_to_frames = dict(enumerate(frame_names))
frames_to_ids

In [None]:
from collections import Counter
# Flatten the per-sentence tag lists into one list of tag occurrences.
tags = []
for tag in train["mani_tags"]:
    tags.extend(tag)  # in-place; `tags = tags + tag` copies the list each time

print("Number of tags: {}".format(len(set(tags))))
c = Counter(tags)

print( c.items())

# Sort the tag vocabulary so ids are identical on every run; plain set
# iteration order can change between interpreter sessions.
# NOTE(review): PropagandaClassifier.forward looks for tag id 1 specifically -
# confirm which tag is supposed to receive that id.
tag_names = sorted(set(tags))
tags_to_ids = {name: idx for idx, name in enumerate(tag_names)}
ids_to_tags = dict(enumerate(tag_names))
tags_to_ids

In [None]:
from tqdm.notebook import tqdm
n_labels = len(frames_to_ids)


def one_hot_encoder(df):
    """Encode the comma-separated ``labels`` column of *df* as 0/1 rows.

    Args:
        df: DataFrame with a ``labels`` column of comma-separated technique names.

    Returns:
        A DataFrame with one row per row of *df* and ``n_labels`` integer
        columns; column ``k`` is 1 when the technique with id ``k`` is present.
    """
    encoded_rows = []
    for position in tqdm(range(len(df))):
        indicator = [0] * n_labels
        names = df.iloc[position]["labels"].split(",")
        for name in names:
            indicator[frames_to_ids[name]] = 1
        encoded_rows.append(indicator)
    return pd.DataFrame(encoded_rows)

In [None]:
# Indicator matrices for both splits; columns are the ids from frames_to_ids.
train_ohe_labels = one_hot_encoder(train)
test_ohe_labels = one_hot_encoder(test)

# notebook display
train_ohe_labels

In [None]:
# Append the indicator columns (named 0..n_labels-1) to the training frame.
# concat with axis=1 aligns rows on the index.
train = pd.concat([train, train_ohe_labels], axis=1)
train

In [None]:
# Append the indicator columns (named 0..n_labels-1) to the dev frame.
# concat with axis=1 aligns rows on the index.
test = pd.concat([test, test_ohe_labels], axis=1)
test

In [None]:
def inspect_category_wise_data(label, n=5):
    """Print random training texts annotated with one technique.

    Args:
        label: Integer technique id, i.e. the name of a one-hot column of
            ``train`` and a key of ``ids_to_frames``.
        n: Maximum number of texts to print. Fewer are printed when the
            technique has fewer than ``n`` training rows.
    """
    matching = train[train[label] == 1]
    # sample() raises when asked for more rows than exist
    count = min(n, len(matching))
    samples = matching.sample(count)
    sentiment = ids_to_frames[label]

    print(f"{count} samples from {sentiment} sentiment: \n")
    for text in samples["text"]:
        print(text, end='\n\n')


inspect_category_wise_data(4)


In [None]:
import torch

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Name or path of the pretrained model. Left empty on purpose: it must be set
# before running. PropagandaClassifier loads it with BertModel and uses
# 768-wide linear heads, so it has to be a BERT checkpoint of that width.
BERT_MODEL ="" ## add your BERT MODEL

from transformers import AutoTokenizer
# local_files_only=True: the files must already be available locally.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL,local_files_only=True)

In [None]:
import transformers


## Model

In [None]:
from torch import nn

class PropagandaClassifier(nn.Module):
    """BERT encoder with a token-tagging head and a sentence-level head.

    The tagging head scores every token. The sentence head classifies the
    hidden state of the first token whose predicted tag id is 1, falling back
    to position 0 when no token is predicted as 1.

    NOTE(review): tag id 1 is hard-coded here - confirm it is the intended
    entry of ``tags_to_ids``.

    Args:
        n_classes: Number of outputs of the sentence-level head.
        num_labels: Number of outputs of the token-tagging head.
        do_prob: Dropout probability.
        bert_model: Name or local path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, n_classes, num_labels, do_prob, bert_model):
        super(PropagandaClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model, local_files_only=True)
        # take the width from the checkpoint instead of assuming 768
        hidden_size = self.bert.config.hidden_size

        self.dropout = nn.Dropout(do_prob)
        self.out = nn.Linear(hidden_size, n_classes)

        # dropout2 is kept for compatibility; forward() only uses self.dropout
        self.dropout2 = nn.Dropout(do_prob)
        self.tagger = nn.Linear(hidden_size, num_labels)
        self.m = nn.Softmax(dim=2)

    def forward(self, input_ids, attention_mask):
        """Return ``(sentence_logits, token_logits)`` for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            A tuple of the sentence-head output (one row per batch item) and
            the tagging-head output (one score vector per token).
        """
        hidden_states = self.bert(input_ids, attention_mask=attention_mask)[0]

        # tokens
        output_tag = self.tagger(self.dropout(hidden_states))
        predicted = torch.argmax(self.m(output_tag), dim=2)

        # Position of the first token predicted as tag 1 in each row. argmax
        # returns the first maximum, and 0 for rows without any hit, which is
        # the same fallback the explicit loop used (if there is no 1 we still
        # want an index).
        first_hit = (predicted == 1).long().argmax(dim=1)
        batch_index = torch.arange(hidden_states.size(0), device=hidden_states.device)

        pooled = hidden_states[batch_index, first_hit, :]
        output = self.out(self.dropout(pooled))
        return output, output_tag

In [None]:
from transformers import BertTokenizerFast, BertConfig, BertModel




## Prepare dataset

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import numpy as np


class MyDataset(Dataset):
    """Dataset yielding one tokenised sentence with its labels per item.

    Args:
        df: DataFrame with the columns ``text``, ``id`` and ``line``. When a
            ``labels`` column is present, the one-hot columns named
            ``0..len(frames_to_ids)-1`` are used as sentence labels. When a
            ``mani_tags`` column is present it supplies the token tags.
        tokenizer: Tokenizer called on each text; it must support
            ``return_offsets_mapping=True``.
        max_len: Length every encoding is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.article = df["text"]
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.id = df["id"]
        self.line = df["line"]

        if "labels" in df.columns:
            self.labels = df[range(len(frames_to_ids))].values.tolist()
        else:
            self.labels = []
        self.mani_tags = []
        if "mani_tags" in df.columns:
            self.mani_tags = df["mani_tags"]

    def __len__(self):
        return len(self.article)

    @staticmethod
    def _parse_tags(raw):
        """Return the tag names stored in one ``mani_tags`` cell.

        Accepts a list of tag names, a list holding comma-joined strings, or
        a single comma-joined string.
        """
        if isinstance(raw, str):
            raw = [raw]
        names = []
        for item in raw:
            names.extend(part for part in item.split(",") if part)
        return names

    def __getitem__(self, idx):
        labels = self.labels[idx] if len(self.labels) > 0 else 0

        # Rows are addressed by position (.iloc) so that they stay in step
        # with self.labels, which is a plain positional list.
        if len(self.mani_tags) > 0:
            tag_names = self._parse_tags(self.mani_tags.iloc[idx])
            token_labels = [tags_to_ids[name] for name in tag_names]
        else:
            token_labels = []
        idart = self.id.iloc[idx]
        line = self.line.iloc[idx]

        encoding = self.tokenizer(self.article.iloc[idx],
                                  is_split_into_words=False,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # -100 everywhere by default: positions that never receive a tag
        # (special tokens, padding) keep this value.
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # NOTE(review): the text is tokenised as one string, so the offsets
        # look like character positions in the sentence and `start == 0`
        # would then hold for the first token only - confirm that the tag
        # index advances per word as intended.
        word_idx = -1
        if token_labels:  # nothing to assign for rows without tags
            for position, (start, end) in enumerate(encoding["offset_mapping"]):
                if end == 0:
                    continue  # special or padding token
                if start == 0:  # only at the beginning of a word
                    word_idx += 1
                encoded_labels[position] = token_labels[word_idx]

        items = {key: torch.as_tensor(val) for key, val in encoding.items()}
        items["labels"] = torch.as_tensor(labels)
        items["id"] = idart
        items["line"] = line
        items['token_labels'] = torch.as_tensor(encoded_labels)

        return items

In [None]:
#setup

MAX_LEN = 256  # length every encoding is padded/truncated to (MyDataset max_len)
TRAIN_BATCH_SIZE = 8  # batch size of train_loader
VALID_BATCH_SIZE = 8  # batch size of test_loader
EPOCHS = 30  # number of passes of the training loop
LEARNING_RATE = 1e-05  # lr given to the optimizer
MAX_GRAD_NORM = 10  # max_norm given to clip_grad_norm_



## Build datasets and data loaders

In [None]:
print("TRAIN Dataset: {}".format(train.shape))
print("TEST Dataset: {}".format(test.shape))
# Wrap both frames; train comes from the train files, test from the dev files.
training_set = MyDataset(train, tokenizer, MAX_LEN)
test_set = MyDataset(test, tokenizer, MAX_LEN)

# # myDs=MyDataset(bias_lexical, tokenizer)
# Only the training batches are shuffled; dev batches keep the frame order.
train_loader=DataLoader(training_set,batch_size=TRAIN_BATCH_SIZE,shuffle=True)
test_loader=DataLoader(test_set,batch_size=VALID_BATCH_SIZE, shuffle=False)


## log metrics

In [None]:
from sklearn import metrics, model_selection, preprocessing
from sklearn.metrics import precision_recall_fscore_support as score


def log_metrics(preds, labels):
    """Print a per-technique report and return micro/macro F1.

    Args:
        preds: List of per-example prediction tensors.
        labels: List of per-example label tensors.

    Returns:
        Dict with the keys ``f1_micro`` and ``f1_macro``.
    """
    # stack the per-example tensors and threshold the predictions at 0.5
    y_pred = torch.stack(preds).cpu().detach().numpy() > 0.5
    y_true = torch.stack(labels).cpu().detach().numpy()

    print(classification_report(y_true, y_pred, target_names= frames_to_ids.keys()))

    _, _, micro_f1, _ = score(y_true, y_pred, average='micro')
    _, _, macro_f1, _ = score(y_true, y_pred, average='macro')

    return {"f1_micro": micro_f1, "f1_macro": macro_f1}

In [None]:
from transformers import BertTokenizerFast, BertConfig, BertModel
from torch.nn import CrossEntropyLoss
# from transformers.models.bert.modeling_bert import BertModel
from transformers.models.bert import BertPreTrainedModel
from sklearn.metrics import classification_report




def _evaluate(loader):
    """Score the model on *loader* without gradients.

    Returns the mean sentence-level loss per batch, the list of thresholded
    prediction tensors and the list of label tensors.
    """
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0
    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            output, _ = model(input_ids=ids, attention_mask=mask)
            total_loss += loss_fct(output, labels.float())
            all_preds.extend(torch.sigmoid(output) > 0.5)
            all_labels.extend(labels)
    return total_loss / len(loader), all_preds, all_labels


def training(epoch, test=True):
    """Train for one epoch, then evaluate on the training and dev loaders.

    Uses the module-level ``model``, ``optimizer``, ``loss_fct``,
    ``loss_fct2``, ``train_loader`` and ``test_loader``.

    Args:
        epoch: Index of the current epoch (not used inside the function).
        test: When true, the dev loader is evaluated as well.

    Returns:
        ``(loss_train, loss_test, f1_micro_train, f1_macro_train,
        f1_micro_test, f1_macro_test)``. The three dev values are 0 when
        ``test`` is false.
    """
    running_loss = 0
    steps = 0
    # put model in training mode
    model.train()
    print("Start")
    for idx, batch in enumerate(train_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)
        tag_labels = batch['token_labels'].to(device, dtype=torch.long)

        optimizer.zero_grad()

        output, output_tokens = model(input_ids=ids, attention_mask=mask)

        loss_sequence = loss_fct(output, labels.float())
        loss_tokens = loss_fct2(output_tokens.view(-1, len(tags_to_ids)), tag_labels.view(-1))

        # the token-tagging loss enters with half the weight
        loss = loss_sequence + loss_tokens * 0.5
        running_loss += loss.item()
        steps += 1

        if idx % 100 == 0:
            loss_step = running_loss / steps
            print(f"Training loss per 100 training steps: {loss_step}")

        loss.backward()
        # Clip after backward(): before it, the gradients of this step do not
        # exist yet and clipping has nothing to act on.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    epoch_loss = running_loss / steps
    print(f"Training loss epoch: {epoch_loss}")

    print("Training eval")
    loss_train, train_preds, train_labels = _evaluate(train_loader)
    print(loss_train)
    res = log_metrics(train_preds, train_labels)

    print("Test eval")
    model.eval()
    if test:
        loss_test, test_preds, test_labels = _evaluate(test_loader)
        res_test = log_metrics(test_preds, test_labels)
        print(loss_test)
        return loss_train, loss_test, res["f1_micro"], res["f1_macro"], res_test["f1_micro"], res_test["f1_macro"]
    else:
        return loss_train, 0, res["f1_micro"], res["f1_macro"], 0, 0



    

In [None]:
# Per-technique weight: number of training rows without the technique divided
# by the number of rows with it.
class_weights = [
    (len(train) - sum(train[label_id])) / sum(train[label_id])
    for label_id in range(len(frames_to_ids))
]
class_weights

## Training

In [None]:
# Per-epoch curves, filled by the loop below and plotted afterwards.
train_loss_history = []
test_loss_history = []
train_f1micro_history = []
test_f1micro_history = []

model = PropagandaClassifier(len(frames_to_ids), len(tags_to_ids), 0.1, BERT_MODEL)
model = model.to(device)

# NOTE(review): the original trailing comment says "AdamW", but this is Adam
# with weight_decay - confirm which optimizer is intended.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

class_weights = torch.as_tensor(class_weights).to(device, dtype=torch.float)

# sentence-level multi-label loss, positives re-weighted per technique
loss_fct = nn.BCEWithLogitsLoss(pos_weight=class_weights)

# token-level tagging loss
loss_fct2 = nn.CrossEntropyLoss()

for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    # training() returns six values; the macro-F1 scores are not tracked here
    loss_tr, loss_te, f1_tr, f1_macro_tr, f1_te, f1_macro_te = training(epoch)
    print(loss_tr, loss_te, f1_tr, f1_te)
    train_loss_history.append(loss_tr)
    test_loss_history.append(loss_te)
    train_f1micro_history.append(f1_tr)
    test_f1micro_history.append(f1_te)

In [None]:
# training() returns the losses as tensors (sums of loss_fct outputs);
# unwrap them to Python numbers before plotting.
train_loss_history=[t.item() for t in train_loss_history]
test_loss_history=[t.item() for t in test_loss_history]

In [None]:
import matplotlib.pyplot as plt

# Loss per epoch: training dashed, dev solid.
plt.plot(train_loss_history, "--")
plt.plot(test_loss_history)

In [None]:
import matplotlib.pyplot as plt

# Micro-F1 per epoch: training as dots, dev as a line.
plt.plot(train_f1micro_history, "o")
plt.plot(test_f1micro_history)