## Technical level 10 classes no weight
Model: google-bert/bert-base-multilingual-uncased
max_seq_length: 128
train_batch_size: 8
eval_batch_size: 8
num_train_epochs: 5


In [48]:
import pandas as pd
import os
import glob
import codecs
import numpy as np
import csv
import math
from tqdm import tqdm, tqdm_notebook, trange

In [49]:
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from transformers import (BertConfig, AutoTokenizer, BertTokenizer, BertForMaskedLM, AutoModelForMaskedLM, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup)

## Config

In [50]:
# Paths to the data, all relative to the notebook's working directory.
# Folder that is globbed for "*.txt" training articles.
train_articles = "datasets/train"
# Folder that is globbed for "*.txt" dev articles at evaluation time.
dev_articles = "datasets/dev"
# Folder that is globbed for "*.labels.tsv" files; they are paired with the
# training articles by sorted position.
train_TC_labels = "datasets/train_TC_labels"
# Tab-separated template of the dev spans to classify: article id in column 0,
# span start/end offsets in columns 2 and 3.
dev_TC_template = "datasets/TC_labels_for_eval.txt"  
# NOTE(review): presumably the gold dev labels used for scoring; it is not read
# by any cell shown in this notebook - confirm.
true_file = "datasets/TC.labels_true.txt"

In [52]:
# Text file listing one propaganda technique name per line; the line order
# defines the integer class ids used by the classifier.
techniques = "datasets/propaganda-techniques-names.txt"
PROP_TECH_TO_LABEL = {}  # technique name -> integer class id
LABEL_TO_PROP_TECH = {}  # integer class id -> technique name
with open(techniques, "r", encoding="utf8") as f:
  # Blank lines are skipped: otherwise an empty line would be registered as an
  # extra "" class and inflate len(PROP_TECH_TO_LABEL), i.e. the number of labels.
  technique_names = [line.strip() for line in f if line.strip()]
for label, technique in enumerate(technique_names):
  PROP_TECH_TO_LABEL[technique] = label
  LABEL_TO_PROP_TECH[label] = technique

In [53]:
# Prefer the GPU when one is available, but fall back to the CPU so that the
# notebook still runs on machines without CUDA instead of failing as soon as
# tensors or the model are moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [54]:
# Maps a model-type key to its (config class, model class, tokenizer class) triple.
MODEL_CLASSES = {"bert": (BertConfig, BertForSequenceClassification, BertTokenizer)}  
# Run configuration shared by the cells below.
# "data_dir" is not read by any cell shown in this notebook.
args = {"data_dir": "bert/",  
        # Key used to pick the class triple out of MODEL_CLASSES.
        "model_type": "bert", 
        # Checkpoint name passed to from_pretrained() for the config and the tokenizer.
        "model_name": "google-bert/bert-base-multilingual-uncased",
        # Where train() writes "checkpoint-<step>" folders and where the final
        # model, tokenizer and training_args.bin are saved.
        "output_dir": "output_dir/output_TC_10_no_weight",  
        # Fixed length of every encoded sequence, including [CLS] and [SEP].
        "max_seq_length": 128,  
        # DataLoader batch sizes used by train() and classify_techniques().
        "train_batch_size": 8,
        "eval_batch_size": 8,
        # Number of passes over the training set.
        "num_train_epochs": 5, 
        # Weight decay for every parameter except biases and LayerNorm weights
        # (those are always trained with 0.0).
        "weight_decay": 0,
        # Optimizer learning rate and epsilon.
        "learning_rate": 4e-5,
        "adam_epsilon": 1e-8,
        # Fraction of the total optimizer steps used for warmup; only applied
        # when "warmup_steps" is 0.
        "warmup_ratio": 0.06,
        # 0 means "derive from warmup_ratio"; train() overwrites this entry with
        # the value it actually used.
        "warmup_steps": 0,
        # Maximum gradient norm used for clipping.
        "max_grad_norm": 1.0,
        # Number of batches whose gradients are accumulated per optimizer step.
        "gradient_accumulation_steps": 1,
        # A checkpoint is saved every this many optimizer steps (disabled when <= 0).
        "save_steps": 2000,
        # Not read by any cell shown in this notebook.
        "overwrite_output_dir": False}

## Preprocess

In [55]:
def convert_dataframe_to_features(dataframe, max_seq_length, tokenizer):
  """
  Converts dataframe into features dataframe, where each feature will
  take form of [CLS] + A + [SEP], padded to max_seq_length.

  Args:
    dataframe: DataFrame with a "text" column (the string to tokenize) and a
      "label" column (a value convertible with int()). Rows are processed in
      positional order, whatever the dataframe's index is.
    max_seq_length: Length of every produced sequence, including the [CLS]
      and [SEP] tokens.
    tokenizer: Object exposing tokenize(), convert_tokens_to_ids() and the
      cls_token / sep_token / pad_token attributes.

  Returns:
    DataFrame with a 0..n-1 index and the columns "input_ids", "input_mask",
    "segment_ids" (each cell a list of max_seq_length ints) and "label_ids"
    (int), one row per input row.

  Raises:
    AssertionError: if an encoded sequence does not end up with exactly
      max_seq_length entries.
  """
  # The pad id is identical for every row, so look it up once
  pad_token = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
  # Room left for real tokens once [CLS] and [SEP] are added
  max_tokens = max_seq_length - 2

  all_input_ids = []
  all_input_mask = []
  all_segment_ids = []
  all_label_ids = []

  # Iterate over the column values directly: this is positional, so it does
  # not depend on the dataframe having a default 0..n-1 index
  for text, label_id in zip(dataframe["text"], dataframe["label"]):
    tokens = tokenizer.tokenize(text)

    # If length of the sequence is greater than max sequence length, truncate it
    if len(tokens) > max_tokens:
      tokens = tokens[:max_tokens]

    # Concatenate the tokens
    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]

    # Compute the ids; single-sentence input, so every segment id is 0
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(tokens)

    # Zero-pad up to the sequence length (mask 0 marks padding positions)
    padding_length = max_seq_length - len(input_ids)
    input_ids = input_ids + [pad_token] * padding_length
    input_mask = input_mask + [0] * padding_length
    segment_ids = segment_ids + [0] * padding_length

    # Assert to make sure we have same length
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    all_input_ids.append(input_ids)
    all_input_mask.append(input_mask)
    all_segment_ids.append(segment_ids)
    all_label_ids.append(int(label_id))

  # Build the frame once from complete columns instead of writing cell by cell
  # through chained indexing (features[col][i] = ...), which pandas does not
  # guarantee to write through to the frame
  features = pd.DataFrame(
    {
      "input_ids": all_input_ids,
      "input_mask": all_input_mask,
      "segment_ids": all_segment_ids,
      "label_ids": all_label_ids,
    },
    columns=["input_ids", "input_mask", "segment_ids", "label_ids"],
  )

  return features

In [56]:
def articles_to_dataframe(article_folder, label_folder):
  """
  Extracts every labelled span of the articles into a single dataframe.

  Article files ("*.txt") and label files ("*.labels.tsv") are each sorted by
  path and paired by position. Every non-empty row of a label file is read as
  tab-separated values with the technique name in column 1 and the span's
  start/end character offsets in columns 2 and 3.

  Args:
    article_folder: Folder containing the article "*.txt" files.
    label_folder: Folder containing the matching "*.labels.tsv" files.

  Returns:
    DataFrame with a 0..n-1 index and the columns "label" (integer id looked
    up in PROP_TECH_TO_LABEL) and "text" (the span cut out of the article),
    one row per labelled span. It is empty, but still has both columns, when
    no span is found.

  Raises:
    AssertionError: if the number of article files and label files differ.
    KeyError: if a label file names a technique missing from PROP_TECH_TO_LABEL.
  """
  # First sort the filenames and make sure we have label file for each articles
  article_filenames = sorted(glob.glob(os.path.join(article_folder, "*.txt")))
  label_filenames = sorted(glob.glob(os.path.join(label_folder, "*.labels.tsv")))
  assert len(article_filenames) == len(label_filenames)

  labels_list = []
  span_texts = []

  for article_filename, label_filename in zip(article_filenames, label_filenames):
    # codecs.open is kept as in the original so that the character offsets of
    # the label file keep indexing the same decoded text
    with codecs.open(article_filename, "r", encoding="utf8") as f:
      article = f.read()

    # newline="" is what the csv module expects from the file object
    with open(label_filename, "r", encoding="utf8", newline="") as f:
      for row in csv.reader(f, delimiter="\t"):
        # A blank line is parsed as an empty row; skip it instead of crashing
        if not row:
          continue
        span_texts.append(article[int(row[2]):int(row[3])])
        labels_list.append(PROP_TECH_TO_LABEL[row[1]])

  # Build the frame once; unlike concatenating per-article frames this also
  # works when there is nothing to concatenate
  dataframe = pd.DataFrame({"label": labels_list, "text": span_texts}, columns=["label", "text"])

  return dataframe

In [57]:
def generate_training_dataset_from_articles(articles_folders, labels_folders, tokenizer):
  """
  Builds the training TensorDataset for BERT from article and label folders.

  articles_folders and labels_folders are parallel lists: the folder at
  position k of one is processed together with position k of the other. All
  resulting spans are merged, encoded to args['max_seq_length'] with the
  given tokenizer, and packed as
  (input_ids, input_mask, segment_ids, label_ids) long tensors.
  """
  # One spans dataframe per (articles folder, labels folder) pair
  per_folder_frames = [
    articles_to_dataframe(articles_folder, labels_folders[position])
    for position, articles_folder in enumerate(articles_folders)
  ]

  # Merge them and encode every span
  all_spans = pd.concat(per_folder_frames, ignore_index=True)
  features = convert_dataframe_to_features(all_spans, args['max_seq_length'], tokenizer)

  # One long tensor per feature column, in the order the model inputs expect
  feature_columns = ("input_ids", "input_mask", "segment_ids", "label_ids")
  tensors = [torch.tensor(features[column], dtype=torch.long) for column in feature_columns]

  return TensorDataset(*tensors)

## Train 

In [58]:
# No weights
def train(train_dataset, model):
  """
  Trains the model with training dataset

  Runs args['num_train_epochs'] epochs over randomly ordered batches using
  AdamW and a linear schedule with warmup.

  Args:
    train_dataset: Dataset whose items are
      (input_ids, input_mask, segment_ids, label_ids) tensors.
    model: Model called with input_ids / attention_mask / token_type_ids /
      labels keyword arguments; the loss is read from outputs[0].

  Returns:
    Tuple (global_step, average_loss): the number of optimizer steps taken and
    the accumulated loss divided by that number (0.0 if no step was taken).

  Side effects:
    Writes the warmup step count it uses back into args['warmup_steps'], and
    saves a checkpoint under args['output_dir'] every args['save_steps']
    optimizer steps.
  """
  # Initialize various necessary objects
  train_sampler = RandomSampler(train_dataset)
  train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])

  # Compute the total number of optimizer steps
  t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']

  # Set the grouped parameters for optimizer: biases and LayerNorm weights are
  # always excluded from weight decay
  no_decay = ['bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

  # Compute warmup steps; an explicit non-zero args['warmup_steps'] wins over the ratio
  warmup_steps = math.ceil(t_total * args['warmup_ratio'])
  args['warmup_steps'] = warmup_steps if args['warmup_steps'] == 0 else args['warmup_steps']

  # Initialize optimizer as Adam with constant weight decay and a linear scheduler with warmup
  optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'], num_training_steps=t_total)

  # Initialize variables for training
  global_step = 0
  tr_loss = 0.0
  model.zero_grad()
  train_iterator = trange(int(args['num_train_epochs']), desc="Epoch")

  # Start training
  for _ in train_iterator:
    epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
      model.train()
      batch = tuple(t.to(device) for t in batch)
      inputs = {'input_ids':      batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels':         batch[3]}
      outputs = model(**inputs)
      loss = outputs[0]

      # Scale so that the accumulated gradient is an average over the micro-batches
      if args['gradient_accumulation_steps'] > 1:
        loss = loss / args['gradient_accumulation_steps']

      loss.backward()

      tr_loss += loss.item()
      if (step + 1) % args['gradient_accumulation_steps'] == 0:
        # Clip once, on the fully accumulated gradient, right before the update.
        # Clipping after every backward() would rescale partial gradients when
        # gradient_accumulation_steps > 1 (for a value of 1 nothing changes).
        torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        global_step += 1

        if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
          output_dir = os.path.join(args['output_dir'], 'checkpoint-{}'.format(global_step))
          if not os.path.exists(output_dir):
            os.makedirs(output_dir)
          # Unwrap a wrapper exposing the real model as .module before saving
          model_to_save = model.module if hasattr(model, 'module') else model
          model_to_save.save_pretrained(output_dir)

  # No optimizer step happens for an empty dataset (or when there are fewer
  # batches than accumulation steps); avoid dividing by zero in that case
  average_loss = tr_loss / global_step if global_step > 0 else 0.0
  return global_step, average_loss

In [59]:
config_class, model_class, tokenizer_class = MODEL_CLASSES[args["model_type"]]
# One output label per known propaganda technique
config = config_class.from_pretrained(args["model_name"], num_labels=len(PROP_TECH_TO_LABEL))
tokenizer = tokenizer_class.from_pretrained(args["model_name"])
# Load the pretrained checkpoint weights. model_class(config) would only build
# the architecture with randomly initialised weights, so the run would train
# BERT from scratch instead of fine-tuning args["model_name"].
model = model_class.from_pretrained(args["model_name"], config=config)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [60]:
# Encode every labelled span of the training articles into a TensorDataset
train_dataset = generate_training_dataset_from_articles(
  articles_folders=[train_articles],
  labels_folders=[train_TC_labels],
  tokenizer=tokenizer,
)

In [62]:
# Fine-tune the model; returns the number of optimizer steps and the average loss
global_step, tr_loss = train(train_dataset=train_dataset, model=model)

In [43]:
# Create the output directory up front so that none of the writes below
# depends on another call having created it as a side effect
os.makedirs(args['output_dir'], exist_ok=True)
# Unwrap a wrapper exposing the real model as .module before saving
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(args['output_dir'])
tokenizer.save_pretrained(args['output_dir'])
# Keep the hyperparameters of this run next to the weights
torch.save(args, os.path.join(args['output_dir'], 'training_args.bin'))

## Evaluate

In [44]:
def generate_TC_eval_dataset_from_article(article_folder, indices_file, tokenizer):
  """
  Builds the evaluation TensorDataset for the spans listed in indices_file.

  Every "*.txt" file of article_folder is read and keyed by its file stem
  minus the first 7 characters (presumably an "article" prefix - confirm).
  indices_file is read as tab-separated rows with the article id in column 0
  and the span start/end offsets in columns 2 and 3.

  Returns a (dataset, dataframe) pair: the encoded spans as
  (input_ids, input_mask, segment_ids, label_ids) long tensors, and a
  dataframe with the columns "id", "seq_starts", "seq_ends", "label" (always
  0, a placeholder) and "text", one row per span in file order.
  """
  # Article id -> full article text
  articles = {}
  for article_path in sorted(glob.glob(os.path.join(article_folder, "*.txt"))):
    article_id = os.path.basename(article_path).split(".")[0][7:]
    with codecs.open(article_path, "r", encoding="utf8") as article_file:
      articles[article_id] = article_file.read()

  # One (id, start, end, text) record per row of the indices file
  spans = []
  with open(indices_file, "r") as template_file:
    for row in csv.reader(template_file, delimiter="\t"):
      spans.append((row[0], row[2], row[3], articles[row[0]][int(row[2]):int(row[3])]))

  dataframe = pd.DataFrame(
    {
      "id": [span[0] for span in spans],
      "seq_starts": [span[1] for span in spans],
      "seq_ends": [span[2] for span in spans],
      # Dummy label: the true technique is unknown at prediction time
      "label": [0] * len(spans),
      "text": [span[3] for span in spans],
    },
    columns=["id", "seq_starts", "seq_ends", "label", "text"],
  )

  # Encode the spans and pack one long tensor per feature column
  features = convert_dataframe_to_features(dataframe, args['max_seq_length'], tokenizer)
  feature_columns = ("input_ids", "input_mask", "segment_ids", "label_ids")
  tensors = [torch.tensor(features[column], dtype=torch.long) for column in feature_columns]

  dataset = TensorDataset(*tensors)
  return dataset, dataframe

In [45]:
def classify_techniques(eval_dataset, model):
  """
  Predicts a class index for every example of an evaluation dataset.

  Args:
    eval_dataset: Dataset whose items are
      (input_ids, input_mask, segment_ids, label_ids) tensors.
    model: Model called with input_ids / attention_mask / token_type_ids /
      labels keyword arguments; the logits are read from outputs[1].

  Returns:
    1-D numpy array with the index of the highest logit for each example, in
    dataset order. It is empty when the dataset is empty.
  """
  # Load the eval data and initialize sampler (sequential keeps dataset order)
  eval_sampler = SequentialSampler(eval_dataset)
  eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])

  # Switch to evaluation mode once; it does not change between batches
  model.eval()

  batch_logits = []

  # For each batch, evaluate
  for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"):
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
      inputs = {'input_ids':      batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels':         batch[3]}
      outputs = model(**inputs)
      # Labels are passed, so position 0 is the loss and position 1 the logits
      logits = outputs[1]

    batch_logits.append(logits.detach().cpu().numpy())

  # Nothing to classify: argmax over no data would fail
  if not batch_logits:
    return np.empty((0,), dtype=np.int64)

  # Get the most probable prediction (the file imports numpy as np)
  preds = np.argmax(np.concatenate(batch_logits, axis=0), axis=1)

  return preds

In [1]:
# Sorted list of the dev article files (not used further down in this cell)
article_filenames = sorted(glob.glob(os.path.join(dev_articles, "*.txt")))

output_file = 'datasets/TC.labels_pred_no_weights.txt'

# Run inference first, so that a failure here does not leave an empty or
# half-written predictions file behind
eval_dataset, eval_dataframe = generate_TC_eval_dataset_from_article(dev_articles, dev_TC_template, tokenizer)
predictions = classify_techniques(eval_dataset, model)

# The with-block closes the file even if writing a row raises
with open(output_file, 'w', newline='') as f:
  writer = csv.writer(f, delimiter='\t')
  # One tab-separated row per span: article id, predicted technique, start, end
  for article_id, prediction, seq_start, seq_end in zip(
      eval_dataframe["id"], predictions, eval_dataframe["seq_starts"], eval_dataframe["seq_ends"]):
    writer.writerow([article_id, LABEL_TO_PROP_TECH[int(prediction)], seq_start, seq_end])

#### References:
The structure of the data preparation is a modification of Henry Kim's [implementation](https://medium.com/@jihwangk/fine-grained-propaganda-detection-and-classification-with-bert-dfad4acaa321).

The structure of the train() function is a modification of Thilina Rajapakse’s [implementation](https://github.com/ThilinaRajapakse/pytorch-transformers-classification).