In [0]:
import torch

# Choose the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

# Report what was selected.
if gpu_available:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [0]:
!pip install transformers

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import string

# Locations of the task data. Check that the path to the datasets folder is
# correct; if not, adjust these variables accordingly.
root_path = "/content/drive/My Drive/"
datasets_dir = root_path + "datasets/"
train_folder = datasets_dir + "train-articles"
dev_folder = datasets_dir + "dev-articles"
train_labels_file = datasets_dir + "train-task2-TC.labels"
dev_template_labels_file = datasets_dir + "dev-task-TC-template.out"

from sklearn.linear_model import LogisticRegression
import glob
import os.path
import numpy as np
import codecs
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords

def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """
    Read every file in <folder_name> whose name matches <file_pattern>.

    The article id is derived from the file name: the part before the first
    "." with its first 7 characters dropped unconditionally
    (e.g. "article123.txt" -> "123").
    NOTE(review): this assumes names shaped like article<ID>.txt -- verify
    against the dataset.

    Carriage returns are removed from the text, so "\r\n" line endings
    collapse to "\n".

    Returns a dictionary mapping article id -> full text of the article.
    """
    articles = {}
    for filename in sorted(glob.glob(os.path.join(folder_name, file_pattern))):
        article_id = os.path.basename(filename).split(".")[0][7:]
        with codecs.open(filename, "r", encoding="utf8") as f:
            articles[article_id] = f.read().replace("\r", "")
    return articles

def read_predictions_from_file(filename):
    """
    Reader for the gold file and the template output file.

    Each non-empty line must hold four tab-separated fields:
    article id, label (or ? in the case of a template file),
    begin of a fragment, end of a fragment.
    Blank lines (e.g. a trailing empty line) are skipped.

    Returns four parallel lists of strings, in this order:
    article ids, span starts, span ends, labels.

    Raises ValueError if a non-empty line does not split into four fields.
    """
    articles_id, span_starts, span_ends, gold_labels = ([], [], [], [])
    with open(filename, "r", encoding="utf8") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for row in f:
            row = row.rstrip()
            if not row:
                # An empty line would otherwise break the 4-way unpack below.
                continue
            article_id, gold_label, span_start, span_end = row.split("\t")
            articles_id.append(article_id)
            gold_labels.append(gold_label)
            span_starts.append(span_start)
            span_ends.append(span_end)
    return articles_id, span_starts, span_ends, gold_labels

def get_fragments(articles, article_ids, span_starts, span_ends, window_chars=300, max_window_tokens=40):
    """
    Cut each annotated span out of its article together with its context.

    For span i, the fragment is articles[article_ids[i]][start:end], where
    start/end are int(span_starts[i]) / int(span_ends[i]).
    The previous window is the text of up to <window_chars> characters
    ending at `start`; the next window is the text of up to <window_chars>
    characters beginning at `end`. Both are clipped to the article bounds.
    Each window is then split on single spaces and limited to
    <max_window_tokens> tokens, keeping the tokens closest to the fragment
    (the last ones of the previous window, the first ones of the next).

    Returns three parallel lists: (fragments, next_windows, prev_windows).
    Note the order: next windows come before previous windows.
    """
    fragments = []
    prev_windows = []
    next_windows = []
    for art_id, raw_start, raw_end in zip(article_ids, span_starts, span_ends):
        text = articles[art_id]
        start = int(raw_start)
        end = int(raw_end)
        fragments.append(text[start:end])

        # Clip the character windows to the article boundaries.
        window_from = max(0, start - window_chars)
        window_to = min(len(text), end + window_chars)

        prev_tokens = text[window_from:start].split(" ")
        next_tokens = text[end:window_to].split(" ")
        if len(prev_tokens) > max_window_tokens:
            # Explicit start index so that max_window_tokens == 0 yields no tokens
            # (a plain [-0:] slice would keep everything).
            prev_tokens = prev_tokens[len(prev_tokens) - max_window_tokens:]
        if len(next_tokens) > max_window_tokens:
            next_tokens = next_tokens[:max_window_tokens]

        prev_windows.append(' '.join(prev_tokens))
        next_windows.append(' '.join(next_tokens))

    return fragments, next_windows, prev_windows

def get_sentences_containing_fragments(articles, article_ids, span_starts, span_ends, labels):
    """
    Label every line of every annotated article with the label of the
    fragment that overlaps it most, or 'No_Propaganda' if none does.

    Articles are processed in sorted article-id order and split on "\n".
    Lines of length 0 or 1 are skipped (they produce no output entry).
    A fragment is considered for a line when its start or its end falls
    inside the line's offset range; among those, the one with the largest
    positive overlap wins.

    NOTE(review): the range used for a line is [start, start + len - 1), so
    its last character is excluded, and a fragment that starts before the
    line and ends after it is not matched. Both behaviours are kept exactly
    as they were -- confirm whether they are intended.

    Returns two parallel lists: (sentences, sentence_labels).
    """
    # Group the annotated fragments by article.
    fragments_by_article = {}
    for art_id, raw_start, raw_end, label in zip(article_ids, span_starts, span_ends, labels):
        fragment = {'start': int(raw_start), 'end': int(raw_end), 'label': label}
        fragments_by_article.setdefault(art_id, []).append(fragment)

    train_sentences = []
    train_sent_labels = []
    for art_id in sorted(set(article_ids)):
        fragments_in_article = fragments_by_article[art_id]
        sent_end = 0
        for sent in articles[art_id].split('\n'):
            sent_start = sent_end
            if len(sent) <= 1:
                # Skipped line: advance past its characters and the newline.
                sent_end = sent_start + len(sent) + 1
                continue
            sent_end = sent_start + len(sent) - 1

            best_fragment = None
            best_overlap = 0
            for frag in fragments_in_article:
                starts_inside = sent_start <= frag['start'] < sent_end
                ends_inside = sent_start < frag['end'] <= sent_end
                if starts_inside or ends_inside:
                    overlap = min(sent_end, frag['end']) - max(sent_start, frag['start'])
                    if overlap > best_overlap:
                        best_fragment = frag
                        best_overlap = overlap

            train_sentences.append(sent)
            if best_fragment is not None:
                train_sent_labels.append(best_fragment['label'])
            else:
                train_sent_labels.append('No_Propaganda')

            # Move to the start of the next line (excluded last char + newline).
            sent_end = sent_end + 2
    return train_sentences, train_sent_labels

### MAIN ###

# Load the text of every *.txt article in the train folder, keyed by article id.
articles = read_articles_from_file_list(train_folder)

# Load the gold annotations from the train labels file: article ids, span start/end offsets and labels.
ref_articles_id, ref_span_starts, ref_span_ends, train_gold_labels = read_predictions_from_file(train_labels_file)
print("Loaded %d annotations from %d articles" % (len(ref_span_starts), len(set(ref_articles_id))))

# Cut every annotated span out of its article, together with the text windows that follow and precede it.
train_fragments, train_next_windows, train_prev_windows = get_fragments(articles, ref_articles_id, ref_span_starts, ref_span_ends)
train_fragment_labels = train_gold_labels

#sentences, sent_labels = get_sentences_containing_fragments(articles, ref_articles_id, ref_span_starts, ref_span_ends, train_gold_labels)

In [0]:
#Uncomment to write fragments to csv for analysis
# def data_for_analysis(fragments, next_windows, prev_windows, labels, article_ids):
#     rows = []
#     for i in range(len(fragments)):
#         l = [labels[i],  prev_windows[i], fragments[i], next_windows[i], article_ids[i]]
#         rows.append(l)
#     df = pd.DataFrame(rows, columns=['Propaganda Type','Prev window','Fragment','Next window', 'Article ID'])
#     df.to_csv(root_path + 'analysis_data.csv')
        
# data_for_analysis(train_fragments, train_next_windows, train_prev_windows, train_fragment_labels, ref_articles_id)

In [0]:
# Build the two texts of each BERT sentence pair:
#   first  = previous window + fragment
#   second = fragment + next window
# The pieces are concatenated directly, without a separator.
train_prev_fragment_list = [
    prev_window + fragment
    for prev_window, fragment in zip(train_prev_windows, train_fragments)
]
train_fragment_next_list = [
    fragment + next_window
    for fragment, next_window in zip(train_fragments, train_next_windows)
]

In [0]:
# Aliases for the two pair texts and the string labels (no copies are made).
train_inputs1 = train_prev_fragment_list
train_inputs2 = train_fragment_next_list
train_labels = train_fragment_labels

In [0]:
train_sentences1 = np.array(train_inputs1)
train_sentences2 = np.array(train_inputs2)

# Encode the string labels as integer codes; `uniques` holds the sorted label
# names, so code k corresponds to uniques[k].
# NOTE: this rebinds `train_labels` from strings to integer codes, so the cell
# should not be re-run without re-running the cell that sets train_labels first.
train_labels, uniques = pd.factorize(train_labels, sort=True)
print(uniques)

# Map integer code -> label name. The size follows the labels actually found
# instead of assuming a fixed number of classes.
keys = list(range(len(uniques)))
values = uniques
dict_types = dict(zip(keys, values))
print(dict_types)

labels = np.array(train_labels)

In [0]:
# Load the WordPiece tokenizer of the 'bert-base-uncased' checkpoint, with lower-casing enabled.
from transformers import BertTokenizer
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [0]:
# Encode every (first text, second text) pair as a single list of token ids,
# with the tokenizer adding its special tokens.
train_input_ids = [
    tokenizer.encode(first_text, text_pair=second_text, add_special_tokens=True)
    for first_text, second_text in zip(train_sentences1, train_sentences2)
]

# Show the first example before and after encoding.
print('Original: ', train_sentences1[0])
print('Original 2: ', train_sentences2[0])
print('Token IDs:', train_input_ids[0])

In [0]:
# Longest encoded pair, in tokens (special tokens included).
longest_sequence = max(map(len, train_input_ids))
print('Max sentence length: ', longest_sequence)

In [0]:
from keras.preprocessing.sequence import pad_sequences
# bert-base-uncased has 512 position embeddings, so sequences longer than 512
# tokens cannot be fed to the model. Longer pairs are cut at the end
# (truncating="post"); shorter ones are right-padded with id 0.
MAX_LEN = 512
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")

In [0]:
# Attention mask per sequence: 1 where the token id is positive, 0 where it is
# the padding id 0.
train_attention_masks = [
    [int(token_id > 0) for token_id in sent]
    for sent in train_input_ids
]

In [0]:
# Segment (token-type) ids per sequence: 0 up to and including the first [SEP],
# 1 for every position after it (padding positions included). If a sequence
# contains no [SEP], it is all zeros.
# The [SEP] id is read from the tokenizer instead of hard-coding 102.
sep_token_id = tokenizer.sep_token_id
train_token_type_ids = []
for sent in train_input_ids:
    first_segment_len = len(sent)
    for position, token_id in enumerate(sent):
        if token_id == sep_token_id:
            first_segment_len = position + 1
            break
    segment_ids = [0] * first_segment_len + [1] * (len(sent) - first_segment_len)
    train_token_type_ids.append(segment_ids)

In [0]:
# Aliases used by the tensor-conversion cell (no copies are made).
train_inputs = train_input_ids
train_masks = train_attention_masks

In [0]:
# Convert the padded ids, label codes, attention masks and segment ids to tensors.
# Each name is rebound in place, so after this cell these variables hold tensors.
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
train_token_type_ids = torch.tensor(train_token_type_ids)

In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Wrap the training tensors in a dataset; each item is the tuple
# (input ids, attention mask, token type ids, label) -- the training loop
# indexes batches in this order.
train_batch_size = 8
train_data = TensorDataset(train_inputs, train_masks, train_token_type_ids, train_labels)
# Batches of `train_batch_size` items, drawn through a RandomSampler.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

In [0]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
# Load the pretrained 'bert-base-uncased' checkpoint with a sequence
# classification head that has 14 output labels.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 14,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
# Move the model to the device selected in the first cell, so the notebook
# also runs on the CPU fallback instead of failing on an unconditional .cuda().
model.to(device)

In [0]:
# Load the dev articles and the template file, then cut out every span with
# its following and preceding text windows.
dev_articles = read_articles_from_file_list(dev_folder)
dev_article_ids, dev_span_starts, dev_span_ends, dev_labels = read_predictions_from_file(dev_template_labels_file)
dev_fragments, dev_next_windows, dev_prev_windows = get_fragments(dev_articles, dev_article_ids, dev_span_starts, dev_span_ends)
#dev_sentences, dev_labels = get_sentences_containing_fragments(dev_articles, dev_article_ids, dev_span_starts, dev_span_ends, dev_labels)

# Pair texts: (previous window + fragment) and (fragment + next window),
# concatenated directly without a separator.
dev_prev_fragment_list = [
    prev_window + fragment
    for prev_window, fragment in zip(dev_prev_windows, dev_fragments)
]
dev_fragment_next_list = [
    fragment + next_window
    for fragment, next_window in zip(dev_fragments, dev_next_windows)
]

dev_inputs1 = dev_prev_fragment_list
dev_inputs2 = dev_fragment_next_list

dev_sentences1 = np.array(dev_inputs1)
dev_sentences2 = np.array(dev_inputs2)

In [0]:
# Encode every dev (first text, second text) pair as one list of token ids.
dev_input_ids = [
    tokenizer.encode(first_text, text_pair=second_text, add_special_tokens=True)
    for first_text, second_text in zip(dev_sentences1, dev_sentences2)
]

# Right-pad with id 0 / cut at the end to MAX_LEN tokens.
dev_input_ids = pad_sequences(dev_input_ids, maxlen=MAX_LEN,
                          dtype="long", value=0, truncating="post", padding="post")

# Attention mask: 1 for real tokens, 0 for padding. Integer-valued, matching
# the masks built for the training data.
dev_attention_masks = [
    [int(token_id > 0) for token_id in seq]
    for seq in dev_input_ids
]

# Segment (token-type) ids: 0 up to and including the first [SEP], 1 for every
# position after it (padding included); all zeros if there is no [SEP].
# The [SEP] id is read from the tokenizer instead of hard-coding 102.
sep_token_id = tokenizer.sep_token_id
dev_token_type_ids = []
for sent in dev_input_ids:
    first_segment_len = len(sent)
    for position, token_id in enumerate(sent):
        if token_id == sep_token_id:
            first_segment_len = position + 1
            break
    segment_ids = [0] * first_segment_len + [1] * (len(sent) - first_segment_len)
    dev_token_type_ids.append(segment_ids)

prediction_inputs = torch.tensor(dev_input_ids)
prediction_masks = torch.tensor(dev_attention_masks)
prediction_token_type_ids = torch.tensor(dev_token_type_ids)

# Sequential batches, so predictions come out in the same order as the
# rows of the template file.
dev_batch_size = 8
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_token_type_ids)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=dev_batch_size)

In [0]:
# AdamW over all model parameters, with learning rate 2e-5 and epsilon 1e-8.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8 
                )

In [0]:
from transformers import get_linear_schedule_with_warmup
# Number of passes over the training data.
epochs = 6
# One scheduler step is taken per batch, so the schedule spans batches-per-epoch * epochs steps.
total_steps = len(train_dataloader) * epochs
# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [0]:
import numpy as np
def flat_accuracy(preds, labels):
    """
    Fraction of rows whose highest-scoring class equals the given label.

    preds:  2-D array of per-class scores, one row per example.
    labels: array of class indices; it is flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    n_total = len(true_classes)
    return n_correct / n_total

In [0]:
import time
import datetime

def format_time(elapsed):
    """
    Render a duration given in seconds as a string, via datetime.timedelta
    (e.g. 3661.4 -> "1:01:01"). The value is rounded to whole seconds first.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [0]:
import random
# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with the same value.
# NOTE(review): the model was instantiated in an earlier cell, i.e. before these
# seeds are set, so the seeds do not cover its initialisation.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Average training loss of each epoch, in order.
loss_values = []
for epoch_i in range(0, epochs):
    print("")
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
    print('Training')
    # Start time of this epoch, used for the elapsed-time reports.
    t0 = time.time()
    # Running sum of the batch losses of this epoch.
    total_loss = 0
    # Put the model in training mode.
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress report every 40 batches (not for the very first batch).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        # Unpack the batch in the order the TensorDataset was built:
        # input ids, attention mask, token type ids, labels; move each to the device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_token_type_ids = batch[2].to(device)
        b_labels = batch[3].to(device)
        # Clear gradients left over from the previous step before the new backward pass.
        model.zero_grad()
        # Forward pass with labels supplied; the first element of the outputs is used as the loss.
        outputs = model(b_input_ids, 
                    token_type_ids=b_token_type_ids, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Advance the learning-rate schedule once per batch.
        scheduler.step()
    # Mean loss over the batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)            
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

print("Training complete!")

In [0]:
# Put the model in evaluation mode and collect one row of logits per dev span.
model.eval()
predictions = []

for batch in prediction_dataloader:
    # Batch order follows the prediction TensorDataset:
    # input ids, attention mask, token type ids.
    b_input_ids, b_input_mask, b_token_type_ids = (t.to(device) for t in batch)

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=b_token_type_ids,
                  attention_mask=b_input_mask)

    # Without labels, the first element of the outputs is used as the logits.
    logits = outputs[0].detach().cpu().numpy()
    # extend() appends the rows in place; rebuilding the list with `+` on every
    # batch would copy all earlier predictions each time.
    predictions.extend(logits)

# Highest-scoring class per span, mapped back to its label name.
predicted_labels = [dict_types[int(np.argmax(row))] for row in predictions]

# writing predictions to file to be uploaded to semeval task 11 submissions page.
# NOTE: `epoch_i` is the loop variable left over from the training cell, so
# that cell must have been run before this one.
task_TC_output_file = root_path + "output-TC-bert-fragprevnext-epoch-" + str(epoch_i) + ".txt"

with open(task_TC_output_file, "w") as fout:
    for article_id, prediction, span_start, span_end in zip(dev_article_ids, predicted_labels, dev_span_starts, dev_span_ends):
        fout.write("%s\t%s\t%s\t%s\n" % (article_id, prediction, span_start, span_end))
print("Predictions written to file " + task_TC_output_file)
print("Done predicting for epoch" + str(epoch_i))