In [None]:
# Install the third-party packages this notebook imports.
# The PyPI name `torchcrf` resolves to the separate TorchCRF project (see the
# "Downloading TorchCRF-1.1.0" line in the captured output); the `torchcrf`
# module imported below is provided by `pytorch-crf`, so only that one is
# installed, and only once.
# `%pip` installs into the environment of the running kernel.
%pip install tokenizers
%pip install huggingface_hub
%pip install transformers
%pip install sentencepiece
%pip install sacremoses
%pip install pytorch-crf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchcrf
  Downloading TorchCRF-1.1.0-py3-none-any.whl (5.2 kB)
Installing collected packages: torchcrf
Successfully installed torchcrf-1.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 16.8 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting huggingface_hub
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 12.4 MB/s 
Installing collected packages: huggingface-hub
Successfully installed huggingface-hub-0.10.1
Looking in indexes: https://pypi.or

In [None]:
# Mount Google Drive at /content/gdrive; later cells read and write files under
# /content/gdrive/MyDrive/IT458_project.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Import modules

In [None]:
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Whitespace, Punctuation

import torch
import pandas as pd
import torch.nn as nn
import json
import math
import logging
import tqdm

In [None]:
import sys    
path_to_module = '/content/gdrive/MyDrive/IT458_project'
sys.path.append(path_to_module)

from utils import TRANSFORMER_PATH, LABEL_MAPPING, convert_examples_to_features, get_data


# Dataset preprocessing

In [None]:
# Project root (with trailing slash). Later cells build `dataset/<split>.tsv`
# and `<target>/<split>.csv` paths by plain string concatenation onto it.
path = "/content/gdrive/MyDrive/IT458_project/"

In [None]:
class GunViolenceDataset(Dataset):
    """Map-style dataset that pairs each text with its label string.

    Both collections are stored as given; item `k` is the tuple
    `(texts[k], labels[k])`.
    """

    def __init__(self, texts, labels):
        """Keep references to the parallel `texts` and `labels` collections."""
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        sample_count = len(self.labels)
        return sample_count

    def __getitem__(self, index):
        """Return the `(text, label)` pair stored at `index`."""
        text = self.texts[index]
        label = self.labels[index]
        return text, label

In [None]:
def _gen_label(words, target):
    target_len = len(target)
    tags = ['O'] * len(words)
    for i in range(0, len(words)):
        try:
            if ' '.join(words[i:i+target_len]) == ' '.join(target):
                tags[i] = 'B'
                for j in range(i+1, i+target_len):
                    tags[j] = 'I'
        except IndexError as e:
            print(e)
            exit()

    return ' '.join(tags)

def preprocess(input_file, output_file, target_type, max_tokens=512):
    """Convert a raw TSV split into a CSV of tokenized texts and BIO labels.

    The TSV must have a 'Full text' column and a 'Json' column. For each row
    the target name is read from
    `json[target_type + '-section'][0]['name']['value']`.

    Args:
        input_file: path of the tab-separated input file.
        output_file: path of the CSV to write (columns 'texts' and 'labels').
        target_type: section prefix, e.g. 'victim' or 'shooter'.
        max_tokens: rows that have a target are kept only when their token
            count is strictly below this value.

    Rows whose section list is empty, or whose target is empty, are written
    with an all-'O' label sequence.
    """
    df = pd.read_csv(input_file, sep='\t')

    # Built once: the same pre-tokenizer is applied to every text and target.
    pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Punctuation()])

    def _tokenize(raw):
        # pre_tokenize_str yields (token, offsets) pairs; only the token is needed.
        return [piece[0] for piece in pre_tokenizer.pre_tokenize_str(raw)]

    new_texts = []
    labels = []

    for text, raw_json in zip(df['Full text'].tolist(), df['Json'].tolist()):
        annotation = json.loads(raw_json)
        # Zero-width spaces are removed before tokenizing.
        tokenized_text = _tokenize(text.replace('\u200b', ''))

        # An empty section list means the article has no annotated target.
        try:
            target = annotation[target_type + '-section'][0]['name']['value']
        except IndexError:
            target = ''

        # Check for a missing target before tokenizing it; a target that
        # tokenizes to nothing (e.g. only whitespace) is also "no target".
        tokenized_target = _tokenize(target) if target else []

        if not tokenized_target:
            # NOTE(review): rows without a target are kept regardless of their
            # length, unlike rows with a target below — confirm this is intended.
            new_texts.append(' '.join(tokenized_text))
            labels.append(' '.join(['O'] * len(tokenized_text)))
            continue

        label = _gen_label(tokenized_text, tokenized_target)

        # Keep only non-empty texts shorter than `max_tokens` tokens.
        if label and len(tokenized_text) < max_tokens:
            new_texts.append(' '.join(tokenized_text))
            labels.append(label)

    # Naming the columns at construction also works when no row was kept.
    out_df = pd.DataFrame({'texts': new_texts, 'labels': labels}, columns=['texts', 'labels'])
    out_df.to_csv(output_file, index=False)

In [None]:
target = "victim"

# Build the victim-labelled CSV for each split under <path>/victim/.
for split in ('train', 'dev', 'test'):
    preprocess(path + 'dataset/' + split + '.tsv', path + target + '/' + split + '.csv', target)

In [None]:
target = "shooter"

# Build the shooter-labelled CSV for each split under <path>/shooter/.
for split in ('train', 'dev', 'test'):
    preprocess(path + 'dataset/' + split + '.tsv', path + target + '/' + split + '.csv', target)

# Models

### BERT_CRF_Linear

In [None]:
class BERT_CRF_Linear(nn.Module):
    """BERT encoder, a per-token linear layer, and a CRF used for decoding.

    Args:
        num_labels: number of tag classes produced per token.
    """

    def __init__(self, num_labels):
        super(BERT_CRF_Linear, self).__init__()
        # The pretrained model is loaded as published; a separately loaded
        # config object was never passed to it, so it is not loaded any more.
        self.bert = torch.hub.load(TRANSFORMER_PATH, 'model', 'bert-base-cased')
        self.classifier = nn.Linear(768, num_labels)
        self.CRF_model = CRF(num_labels, batch_first=True)

    def forward(self, tokens_tensor, segments_tensors, labels=None):
        """Run the model on a batch.

        Args:
            tokens_tensor: token ids — assumed (batch, seq); TODO confirm.
            segments_tensors: token type ids, same layout as `tokens_tensor`.
            labels: optional gold tag ids per token. When omitted, no loss is
                computed.

        Returns:
            `(predictions, logits, loss)`: the CRF-decoded tag ids as a tensor
            on the same device as `logits`, the per-token logits, and the
            cross-entropy loss (`None` when `labels` is `None`).
        """
        bert_output = self.bert(tokens_tensor, token_type_ids=segments_tensors)
        logits = self.classifier(bert_output.last_hidden_state)

        # Decode the tag sequence with the CRF; keep the result on whatever
        # device the model runs on instead of assuming a GPU.
        crf_predict = self.CRF_model.decode(logits)
        predictions = torch.tensor(crf_predict, device=logits.device)

        # NOTE(review): the returned loss is token-level cross-entropy only. No
        # CRF likelihood term is part of it, so the CRF layer's own parameters
        # receive no gradient from this loss — confirm this is intended.
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # CrossEntropyLoss expects the class dimension second.
            loss = loss_fct(logits.permute(0, 2, 1), labels)
        return predictions, logits, loss

### BERT_CRF_LSTM

In [None]:
class BERT_CRF_LSTM(nn.Module):
    """BERT encoder, a unidirectional LSTM, a linear layer, and a CRF decoder.

    Args:
        num_labels: number of tag classes produced per token.
    """

    def __init__(self, num_labels):
        super(BERT_CRF_LSTM, self).__init__()
        # The pretrained model is loaded as published; a separately loaded
        # config object was never passed to it, so it is not loaded any more.
        self.bert = torch.hub.load(TRANSFORMER_PATH, 'model', 'bert-base-cased')
        # The hidden states are consumed batch-first everywhere else (the CRF
        # below is built with batch_first=True), so the LSTM must be too;
        # otherwise it would run its recurrence along the batch axis.
        self.lstm = nn.LSTM(768, 768, batch_first=True)
        self.classifier = nn.Linear(768, num_labels)
        self.CRF_model = CRF(num_labels, batch_first=True)

    def forward(self, tokens_tensor, segments_tensors, labels=None):
        """Run the model on a batch.

        Args:
            tokens_tensor: token ids — assumed (batch, seq); TODO confirm.
            segments_tensors: token type ids, same layout as `tokens_tensor`.
            labels: optional gold tag ids per token. When omitted, no loss is
                computed.

        Returns:
            `(predictions, logits, loss)`: the CRF-decoded tag ids as a tensor
            on the same device as `logits`, the per-token logits, and the
            cross-entropy loss (`None` when `labels` is `None`).
        """
        bert_output = self.bert(tokens_tensor, token_type_ids=segments_tensors)

        lstm_out, _ = self.lstm(bert_output.last_hidden_state)
        logits = self.classifier(lstm_out)

        # Decode the tag sequence with the CRF; keep the result on whatever
        # device the model runs on instead of assuming a GPU.
        crf_predict = self.CRF_model.decode(logits)
        predictions = torch.tensor(crf_predict, device=logits.device)

        # NOTE(review): the returned loss is token-level cross-entropy only. No
        # CRF likelihood term is part of it, so the CRF layer's own parameters
        # receive no gradient from this loss — confirm this is intended.
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # CrossEntropyLoss expects the class dimension second.
            loss = loss_fct(logits.permute(0, 2, 1), labels)
        return predictions, logits, loss

### BERT_CRF_BiLSTM

In [None]:
class BERT_CRF_BiLSTM(nn.Module):
    """BERT encoder, a bidirectional LSTM, a linear layer, and a CRF decoder.

    The forward and backward LSTM outputs are summed, so the linear layer
    still receives 768 features per token.

    Args:
        num_labels: number of tag classes produced per token.
    """

    def __init__(self, num_labels):
        super(BERT_CRF_BiLSTM, self).__init__()
        # The pretrained model is loaded as published; a separately loaded
        # config object was never passed to it, so it is not loaded any more.
        self.bert = torch.hub.load(TRANSFORMER_PATH, 'model', 'bert-base-cased')
        # The hidden states are consumed batch-first everywhere else (the CRF
        # below is built with batch_first=True), so the LSTM must be too;
        # otherwise it would run its recurrence along the batch axis.
        self.lstm = nn.LSTM(768, 768, bidirectional=True, batch_first=True)
        self.classifier = nn.Linear(768, num_labels)
        self.CRF_model = CRF(num_labels, batch_first=True)

    def forward(self, tokens_tensor, segments_tensors, labels=None):
        """Run the model on a batch.

        Args:
            tokens_tensor: token ids — assumed (batch, seq); TODO confirm.
            segments_tensors: token type ids, same layout as `tokens_tensor`.
            labels: optional gold tag ids per token. When omitted, no loss is
                computed.

        Returns:
            `(predictions, logits, loss)`: the CRF-decoded tag ids as a tensor
            on the same device as `logits`, the per-token logits, and the
            cross-entropy loss (`None` when `labels` is `None`).
        """
        bert_output = self.bert(tokens_tensor, token_type_ids=segments_tensors)

        lstm_out, _ = self.lstm(bert_output.last_hidden_state)
        # The last dimension holds the forward features followed by the
        # backward features; add the two halves element-wise.
        lstm_out = lstm_out[:, :, :768] + lstm_out[:, :, 768:]

        logits = self.classifier(lstm_out)

        # Decode the tag sequence with the CRF; keep the result on whatever
        # device the model runs on instead of assuming a GPU.
        crf_predict = self.CRF_model.decode(logits)
        predictions = torch.tensor(crf_predict, device=logits.device)

        # NOTE(review): the returned loss is token-level cross-entropy only. No
        # CRF likelihood term is part of it, so the CRF layer's own parameters
        # receive no gradient from this loss — confirm this is intended.
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # CrossEntropyLoss expects the class dimension second.
            loss = loss_fct(logits.permute(0, 2, 1), labels)
        return predictions, logits, loss

# Train and Evaluate model

In [None]:
def evaluate(model, evaluate_X, evaluate_Y, tokenizer, cuda_available, batch_size, max_seq_length, model_type, lr, epochs, path, get_accuracy=False):
    """Score `model` on span extraction, one sample at a time, and print metrics.

    For every sample the gold span is the first 'B' word of the label string
    plus the 'I' words that directly follow it. The predicted span is chosen
    among the spans found in the decoded tags (tag id 0 opens a span, tag id 1
    extends the current one): the span whose opening token has the highest
    score wins. The prediction is compared to the gold span as a string.

    Counting rules:
        - a prediction equal to the gold span is a true positive;
        - any other prediction is a false positive;
        - no prediction while a gold span exists is a false negative;
        - no prediction and no gold span is a true negative.

    Accuracy is reported as true positives divided by the number of samples.

    Args:
        model: module returning `(predictions, logits, loss)`.
        evaluate_X: texts, tokens separated by spaces.
        evaluate_Y: label strings, one tag per token, separated by spaces.
        tokenizer: tokenizer used to build the model input and to join the
            predicted word pieces back into a string.
        cuda_available: run on 'cuda' when true, otherwise on 'cpu'.
        max_seq_length: forwarded to `convert_examples_to_features`.
        get_accuracy: when true print only the accuracy, otherwise the full
            report. Defaults to the full report.
        batch_size, model_type, lr, epochs, path: accepted for call
            compatibility; not used here (samples are always scored one by one).

    Returns:
        None. Results are printed.
    """
    device = 'cuda' if cuda_available else 'cpu'

    model.eval()
    num_samples = len(evaluate_X)
    # The counts do not depend on sample order, so no shuffling is needed.
    evaluate_generator = DataLoader(
        GunViolenceDataset(evaluate_X, evaluate_Y),
        batch_size=1,
        shuffle=False,
    )
    num_of_tp = num_of_fn = num_of_fp = num_of_tn = 0

    for evaluate_x, evaluate_y in evaluate_generator:
        tokens, labels = convert_examples_to_features(evaluate_x, evaluate_y, tokenizer, max_seq_length)

        indexed_tokens = [tokenizer.convert_tokens_to_ids(token) for token in tokens]
        segments_ids = [[0] * len(indexed_token) for indexed_token in indexed_tokens]

        segments_tensors = torch.tensor(segments_ids).to(device)
        tokens_tensor = torch.tensor(indexed_tokens).to(device)
        labels = torch.tensor(labels).to(device)

        with torch.no_grad():
            y_pred, logits, _ = model(tokens_tensor, segments_tensors, labels)

        # NOTE(review): dim=1 normalizes over the second axis of `logits`. If
        # logits are (batch, seq, labels) that is the sequence axis, not the
        # label axis — confirm whether dim=-1 was intended.
        normalized_probs = nn.functional.softmax(logits, dim=1)[0]
        predicted_tags = y_pred[0].tolist()
        # Column 0 is the score of the span-opening tag for each token.
        opening_scores = normalized_probs[:, 0].tolist()

        # Gold span: the first 'B' word and the 'I' words right after it.
        words = evaluate_x[0].split()
        gold_tags = evaluate_y[0].split()
        gold_words = []
        for pos, (word, gold_tag) in enumerate(zip(words, gold_tags)):
            if gold_tag[0] == 'B':
                gold_words.append(word)
                nxt = pos + 1
                while nxt < len(words) and nxt < len(gold_tags) and gold_tags[nxt][0] == 'I':
                    gold_words.append(words[nxt])
                    nxt += 1
                break
        original = ' '.join(gold_words)

        # Collect predicted spans as [opening score, tokens] so that a span and
        # its score can never get out of step with each other.
        candidates = []
        current = None
        for token, tag, score in zip(tokens[0], predicted_tags, opening_scores):
            if tag == 0:
                # tag == 'B': always starts a new span
                current = [score, [token]]
                candidates.append(current)
            elif tag == 1:
                # tag == 'I'
                if current is None:
                    # A span that begins with 'I' has no opening token; give it
                    # the lowest score so any properly opened span is preferred.
                    current = [-math.inf, [token]]
                    candidates.append(current)
                else:
                    current[1].append(token)

        # One sentence might yield several spans, e.g. several shooters or
        # victims; keep the one with the highest opening score (first on ties).
        if candidates:
            best_tokens = max(candidates, key=lambda candidate: candidate[0])[1]
            result = tokenizer.convert_tokens_to_string(best_tokens)
            if result == original:
                num_of_tp += 1
            else:
                num_of_fp += 1
        elif original != '':
            num_of_fn += 1
        else:
            num_of_tn += 1

    accuracy = num_of_tp / num_samples if num_samples != 0 else 0
    precision = num_of_tp / (num_of_tp + num_of_fp) if num_of_tp + num_of_fp != 0 else 0
    recall = num_of_tp / (num_of_tp + num_of_fn) if num_of_tp + num_of_fn != 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0

    if get_accuracy:
        print('Accuracy : {}'.format(accuracy))
    else:
        print("\nEvaluation :\n")
        print('True positive : {}'.format(num_of_tp))
        print('False positive : {}'.format(num_of_fp))
        print('True negative : {}'.format(num_of_tn))
        print('False negative : {}'.format(num_of_fn))
        print('Accuracy : {}'.format(accuracy))
        print('Precision : {}'.format(precision))
        print('Recall : {}'.format(recall))
        print('F1_score : {}'.format(f1_score))

In [None]:
def train(train_X, train_Y, learning_rate, cuda_available, epochs, model_type, is_balance, batch_size, max_seq_length, patience, min_delta, baseline, path, evaluate_X, evaluate_Y, lr):
    """Train one of the BERT tagging models and save it after every epoch.

    Args:
        train_X: texts, tokens separated by spaces.
        train_Y: label strings, one tag per token, separated by spaces.
        learning_rate: learning rate for Adam.
        cuda_available: train on 'cuda' when true, otherwise on 'cpu'.
        epochs: maximum number of epochs.
        model_type: 'LSTM' or 'BiLSTM'; anything else selects the linear head.
        batch_size: training batch size.
        max_seq_length: forwarded to `convert_examples_to_features`.
        patience: training stops once more than this many counted epochs
            passed without improvement.
        min_delta: an epoch improves only if its mean loss is lower than the
            best mean loss by more than this amount.
        baseline: a non-improving epoch is counted only when its mean loss is
            below this value.
        path: prefix of the saved model file (`path + model_type + "_model"`).
        is_balance, evaluate_X, evaluate_Y, lr: accepted for call
            compatibility; not used here.

    Returns:
        `(model, tokenizer, stopping_epoch)`. `stopping_epoch` is the epoch at
        which early stopping fired, or 0 when training ran all epochs.
    """
    device = 'cuda' if cuda_available else 'cpu'

    training_generator = DataLoader(
        GunViolenceDataset(train_X, train_Y),
        batch_size=batch_size,
        shuffle=True,
    )

    tokenizer = torch.hub.load(TRANSFORMER_PATH, 'tokenizer', 'bert-base-cased') # cased!
    if model_type == 'LSTM':
        model = BERT_CRF_LSTM(3)
    elif model_type == 'BiLSTM':
        model = BERT_CRF_BiLSTM(3)
    else:
        model = BERT_CRF_Linear(3)  # 3 different labels: B, I, O

    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    num_no_improve = 0
    best_loss = None
    stopping_epoch = 0

    for epoch in range(1, epochs + 1):
        # Set training mode every epoch so a call that switched the model to
        # eval mode in between cannot leak into the next epoch.
        model.train()
        batch_losses = []

        with tqdm.tqdm(training_generator, unit="batch") as tepoch:
            tepoch.set_description(f"Epoch {epoch}")
            for train_x, train_y in tepoch:
                # prepare model input
                tokens, labels = convert_examples_to_features(train_x, train_y, tokenizer, max_seq_length)
                indexed_tokens = [tokenizer.convert_tokens_to_ids(token) for token in tokens]
                segments_ids = [[0] * len(indexed_token) for indexed_token in indexed_tokens]

                segments_tensors = torch.tensor(segments_ids).to(device)
                tokens_tensor = torch.tensor(indexed_tokens).to(device)
                labels = torch.tensor(labels).to(device)

                # forward pass
                _, _, loss = model(tokens_tensor, segments_tensors, labels)
                batch_losses.append(loss.item())

                # display loss
                tepoch.set_postfix(loss="{:.4f}".format(batch_losses[-1]))

                # zero out gradients, backward pass, update parameters
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Early stopping is judged on the mean loss of the epoch, kept as a
        # plain float, rather than on the loss tensor of the last batch.
        if batch_losses:
            epoch_loss = sum(batch_losses) / len(batch_losses)
            if best_loss is None or epoch_loss < best_loss - min_delta:
                # An improvement resets the counter.
                best_loss = epoch_loss
                num_no_improve = 0
            elif epoch_loss < baseline:
                num_no_improve += 1
            if num_no_improve > patience:
                stopping_epoch = epoch
                logging.info('Early Stop on epoch {} with the best loss {}'.format(stopping_epoch, best_loss))
                break

        # Overwrites the file of the previous epoch; not reached for the epoch
        # in which early stopping fires.
        torch.save(model, path + model_type + "_model")

    return model, tokenizer, stopping_epoch

In [None]:
# Settings shared by every training and evaluation run below.
lr = 1e-4  # learning rate handed to train(), which passes it to Adam
epochs = 7  # maximum number of training epochs
batch_size = 40  # training batch size; evaluate() always builds batches of 1
max_seq_length = 256  # forwarded to convert_examples_to_features
is_balance = True  # forwarded to get_data for the train and dev splits
# patience is larger than `epochs`: train() counts at most one epoch per epoch
# and stops only when the count exceeds patience, so with these values the
# early-stopping branch cannot be reached.
patience = 10
min_delta = 0  # early-stopping tolerance used by train()
baseline = 0.0001  # loss threshold used by train()'s early-stopping check
cuda_available = torch.cuda.is_available()  # selects GPU or CPU tensors in train()/evaluate()

### For victim

In [None]:
# Load the victim splits written by preprocess(). The dev split is appended to
# the training data, so the test split is the only held-out set.
input_dir = path + "victim"
output_dir = path + "victim/"  # train() prefixes the saved model file name with this
train_X, train_Y = get_data(input_dir + '/train.csv', is_balance)
dev_X, dev_Y = get_data(input_dir + '/dev.csv', is_balance)
test_X, test_Y = get_data(input_dir + '/test.csv')  # second argument left at get_data's default
train_X += dev_X  # assumes get_data returns lists, so += concatenates — TODO confirm
train_Y += dev_Y

In [None]:
# Victim task, linear head: train, then print the full report on the test split.
model, tokenizer, stopping_epoch = train(
    train_X, train_Y, lr, cuda_available, epochs, "Linear", is_balance,
    batch_size, max_seq_length, patience, min_delta, baseline, output_dir,
    test_X, test_Y, lr,
)
evaluate(
    model, test_X, test_Y, tokenizer, cuda_available, batch_size,
    max_seq_length, "Linear", lr, stopping_epoch, output_dir, False,
)

In [None]:
# Victim task, LSTM head: train, then print the full report on the test split.
model, tokenizer, stopping_epoch = train(
    train_X, train_Y, lr, cuda_available, epochs, "LSTM", is_balance,
    batch_size, max_seq_length, patience, min_delta, baseline, output_dir,
    test_X, test_Y, lr,
)
evaluate(
    model, test_X, test_Y, tokenizer, cuda_available, batch_size,
    max_seq_length, "LSTM", lr, stopping_epoch, output_dir, False,
)

In [None]:
# Victim task, BiLSTM head: train, then print the full report on the test split.
model, tokenizer, stopping_epoch = train(
    train_X, train_Y, lr, cuda_available, epochs, "BiLSTM", is_balance,
    batch_size, max_seq_length, patience, min_delta, baseline, output_dir,
    test_X, test_Y, lr,
)
evaluate(
    model, test_X, test_Y, tokenizer, cuda_available, batch_size,
    max_seq_length, "BiLSTM", lr, stopping_epoch, output_dir, False,
)

### For shooter

In [None]:
# Load the shooter splits written by preprocess(). The dev split is appended to
# the training data, so the test split is the only held-out set.
input_dir = path + "shooter"
output_dir = path + "shooter/"  # train() prefixes the saved model file name with this
train_X, train_Y = get_data(input_dir + '/train.csv', is_balance)
dev_X, dev_Y = get_data(input_dir + '/dev.csv', is_balance)
test_X, test_Y = get_data(input_dir + '/test.csv')  # second argument left at get_data's default
train_X += dev_X  # assumes get_data returns lists, so += concatenates — TODO confirm
train_Y += dev_Y

In [None]:
# Shooter task, linear head: train, then print the full report on the test split.
model, tokenizer, stopping_epoch = train(train_X, train_Y, lr, cuda_available, epochs, "Linear", is_balance, batch_size, max_seq_length, patience, min_delta, baseline, output_dir, test_X, test_Y, lr )
# The final argument (get_accuracy) was missing here; False selects the full
# report, matching the victim runs.
evaluate(model, test_X, test_Y, tokenizer, cuda_available, batch_size, max_seq_length, "Linear", lr, stopping_epoch, output_dir, False)

In [None]:
# Shooter task, LSTM head: train, then print the full report on the test split.
model, tokenizer, stopping_epoch = train(train_X, train_Y, lr, cuda_available, epochs, "LSTM", is_balance, batch_size, max_seq_length, patience, min_delta, baseline, output_dir, test_X, test_Y, lr )
# The final argument (get_accuracy) was missing here; False selects the full
# report, matching the victim runs.
evaluate(model, test_X, test_Y, tokenizer, cuda_available, batch_size, max_seq_length, "LSTM", lr, stopping_epoch, output_dir, False)

In [None]:
# Shooter task, BiLSTM head: train, then print the full report on the test split.
model, tokenizer, stopping_epoch = train(train_X, train_Y, lr, cuda_available, epochs, "BiLSTM", is_balance, batch_size, max_seq_length, patience, min_delta, baseline, output_dir, test_X, test_Y, lr )
# The final argument (get_accuracy) was missing here; False selects the full
# report, matching the victim runs.
evaluate(model, test_X, test_Y, tokenizer, cuda_available, batch_size, max_seq_length, "BiLSTM", lr, stopping_epoch, output_dir, False)