# Previous version: CrowdTruth-MRE and CrowdTruth_MRE_2
In this notebook I only use the **CAUSE** table, but every step can be applied to the other table in the same way.

## Part One : Imports
This part imports the necessary libraries and loads the data files.

### Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
%config IPCompleter.greedy=True

In [None]:
from matplotlib import pyplot as plt
%pylab inline
import tensorflow as tf

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.metrics import accuracy_score
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import train_test_split as split
from torch.utils.data import DataLoader

In [None]:
from sklearn.metrics import classification_report
import torch.nn.functional as F

### Import data files
(mostly same as in _CrowdTruth_MRE_2_)

In [None]:
# Open the CAUSE ground-truth workbook once, then read its three worksheets
# ('train', 'dev', 'test') into one DataFrame each.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs elsewhere.
cause_raw = pd.ExcelFile(r'E:\Egyetem\tmp\Medical-Relation-Extraction\train_dev_test\ground_truth_cause.xlsx')
cause_train, cause_dev, cause_test = (
    pd.read_excel(cause_raw, split_name)
    for split_name in ('train', 'dev', 'test')
)

## Part Two : Preparing the data
This part prepares the data for the NLP model.

(mostly same as in _CrowdTruth_MRE_2_)

In [None]:
def extract_labels(df):
    """Derive the binary label for a single row of the ground-truth table.

    A row is positive (1) when its expert annotation equals 1, or when the
    expert annotation is missing and the crowd score is greater than 0.
    Every other row is negative (0).

    Args:
        df: one row (despite the name, not a whole frame) exposing the
            ``expert`` and ``crowd`` attributes.

    Returns:
        int: 1 for a positive example, 0 otherwise.
    """
    expert_vote, crowd_score = df.expert, df.crowd

    if expert_vote == 1:
        return 1
    # No expert judgement available: fall back to the sign of the crowd score.
    if pd.isnull(expert_vote) and crowd_score > 0:
        return 1
    return 0

In [None]:
def replace_entities(df):
    """Mask the two entity mentions of a row's sentence with placeholders.

    Every occurrence of ``term1`` becomes ``"ENTITY1"`` and every occurrence
    of ``term2`` becomes ``"ENTITY2"``. Both terms are substituted in a single
    pass with the longer term tried first, so that:

    * a term contained in the other one (e.g. "cancer" / "lung cancer") does
      not destroy the longer mention before it can be replaced, and
    * text inside an already inserted placeholder is never rewritten.

    Empty terms are ignored (replacing ``""`` would insert the placeholder
    between every character). If both terms are identical, ``"ENTITY1"`` wins.

    Args:
        df: one row (despite the name, not a whole frame) exposing the
            ``sentence``, ``term1`` and ``term2`` string attributes.

    Returns:
        str: the sentence with the entity mentions masked.
    """
    import re  # local import keeps this cell runnable on its own

    sentence = df.sentence

    # Map each distinct, non-empty term to its placeholder; term1 is inserted
    # first so it takes precedence when both terms are the same string.
    placeholders = {}
    for term, placeholder in ((df.term1, "ENTITY1"), (df.term2, "ENTITY2")):
        if term and term not in placeholders:
            placeholders[term] = placeholder

    if not placeholders:
        return sentence

    # Longest alternative first: the regex engine picks the first alternative
    # that matches at a position, so the longer mention must be listed first.
    ordered_terms = sorted(placeholders, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(term) for term in ordered_terms))

    return pattern.sub(lambda match: placeholders[match.group(0)], sentence)

In [None]:
def create_data_list(df, func):
    """Apply ``func`` to every row of ``df`` and collect the results.

    Args:
        df: DataFrame whose rows are visited in order via ``iterrows()``.
        func: callable taking a single row and returning one value.

    Returns:
        list: one entry per row, in row order.
    """
    return [func(row) for _, row in df.iterrows()]

In [None]:
# For each split, build the entity-masked sentences, the binary labels and the
# character length of every masked sentence.
train_sentence = create_data_list(cause_train, replace_entities)
train_label = create_data_list(cause_train, extract_labels)
train_length = [len(text) for text in train_sentence]

dev_sentence = create_data_list(cause_dev, replace_entities)
dev_label = create_data_list(cause_dev, extract_labels)
dev_length = [len(text) for text in dev_sentence]

test_sentence = create_data_list(cause_test, replace_entities)
test_label = create_data_list(cause_test, extract_labels)
test_length = [len(text) for text in test_sentence]

In [None]:
import nltk

# Fetch the NLTK resources "punkt" and "wordnet" — presumably the data behind
# word_tokenize and WordNetLemmatizer used below; TODO confirm for the
# installed NLTK version.
nltk.download("punkt")
nltk.download("wordnet")

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

class LemmaTokenizer:
    """Callable tokenizer that word-tokenizes a text and lemmatizes each token.

    Instances are passed as the ``tokenizer`` argument of the vectorizer.
    """

    def __init__(self):
        # A single lemmatizer instance is reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmas for the tokens found in ``articles``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(articles)))

In [None]:
# Bag-of-words vectorizer: lemmatizing tokenizer, English stop-word list,
# vocabulary capped at 3000 features.
vectorizer = CountVectorizer(max_features=3000, tokenizer=LemmaTokenizer(), stop_words="english")

In [None]:
# Learn the vocabulary from the training sentences only. The fitted object's
# vocabulary_ is used later as the token -> integer id lookup.
word_to_ix = vectorizer.fit(train_sentence)
# Number of distinct tokens that received an id.
VOCAB_SIZE = len(word_to_ix.vocabulary_)

In [None]:
# Callable that turns a raw sentence into its list of tokens; used below to
# tokenize sentences before they are mapped to ids.
an = word_to_ix.build_analyzer()

In [None]:
# Sanity check: display the tokens the analyzer produces for a sample sentence.
an("Look to my coming on the first light of the fifth day at dawn look to the east")

In [None]:
def create_input(dataset, analyzer, vocabulary, unknown_id=None, target_device=None):
    """Convert sentences into tensors of vocabulary ids.

    Each sentence is tokenized with ``analyzer``; every token is replaced by
    its id from ``vocabulary``, or by ``unknown_id`` when it is out of
    vocabulary. A sentence that yields no tokens is represented by a single
    ``unknown_id`` so that no sequence is empty.

    Args:
        dataset: iterable of raw sentences (strings).
        analyzer: callable mapping a sentence to a list of tokens.
        vocabulary: mapping token -> integer id.
        unknown_id: id used for out-of-vocabulary tokens. Defaults to
            ``len(vocabulary)``, i.e. the first id after the vocabulary
            (3000 when the vectorizer's 3000-feature cap is reached). This
            assumes the vocabulary ids are the contiguous range
            ``0 .. len(vocabulary) - 1``.
        target_device: torch device the tensors are moved to. Defaults to the
            notebook-level ``device`` variable, which must then be defined
            before this function is called.

    Returns:
        list[torch.Tensor]: one 1-D ``LongTensor`` of ids per sentence.
    """
    if unknown_id is None:
        unknown_id = len(vocabulary)
    if target_device is None:
        target_device = device  # notebook-level global

    indices = []
    for sentence in dataset:
        token_ids = [
            vocabulary[token] if token in vocabulary else unknown_id
            for token in analyzer(sentence)
        ]
        # Keep at least one id per sentence so sequence lengths are never 0.
        if not token_ids:
            token_ids = [unknown_id]
        indices.append(torch.LongTensor(token_ids).to(target_device))

    return indices

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training sentences as a list of id tensors (one tensor per sentence).
dataset_as_ids = create_input(train_sentence, an, word_to_ix.vocabulary_)

In [None]:
# Inspect the id tensor of the second training sentence.
dataset_as_ids[1]

In [None]:
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Stack the variable-length id tensors into one batch-first tensor, filling the
# shorter sequences with the padding id 3001.
# NOTE(review): `padded` is not referenced by the later cells visible here.
padded = pad_sequence(dataset_as_ids, batch_first=True, padding_value=3001)

In [None]:
def prepare_dataloader_with_padding(tr_data, tr_label,
                                    val_data, val_label,
                                    word_to_ix, padding_value=None):
    """Turn raw sentences and labels into lists of padded training samples.

    Every sentence is converted to vocabulary ids with ``create_input`` (using
    the notebook-level analyzer ``an``), and all sequences of a split are
    padded to the length of the longest one.

    Args:
        tr_data: training sentences.
        tr_label: training labels (one integer per sentence).
        val_data: validation sentences.
        val_label: validation labels (one integer per sentence).
        word_to_ix: fitted vectorizer; its ``vocabulary_`` is the id lookup.
        padding_value: id used to fill padded positions. Defaults to
            ``len(word_to_ix.vocabulary_) + 1``, i.e. the id right after the
            out-of-vocabulary id (3001 for a 3000-token vocabulary).

    Returns:
        tuple[list, list]: ``(tr_data_loader, val_data_loader)``; each is a
        list of ``(padded_ids, label, length, sentence)`` tuples.
    """
    vocabulary = word_to_ix.vocabulary_
    if padding_value is None:
        padding_value = len(vocabulary) + 1

    def _build_samples(sentences, labels):
        # Vectorize once and reuse the result for both padding and lengths.
        id_sequences = create_input(sentences, an, vocabulary)
        data_vecs = pad_sequence(
            id_sequences,
            batch_first=True,
            padding_value=padding_value,
        )
        label_vec = torch.LongTensor(labels).to(device)
        # Lengths stay on the CPU: pack_padded_sequence requires CPU lengths.
        lengths = torch.LongTensor([len(seq) for seq in id_sequences])
        return list(zip(data_vecs, label_vec, lengths, sentences))

    tr_data_loader = _build_samples(tr_data, tr_label)
    val_data_loader = _build_samples(val_data, val_label)

    return tr_data_loader, val_data_loader
    

In [None]:
# Train on the train split; validate on the dev and test splits concatenated.
tr_data_loader, val_data_loader = prepare_dataloader_with_padding(
    train_sentence, train_label, dev_sentence + test_sentence, dev_label + test_label, word_to_ix
)

In [None]:
def create_dataloader_iterators_with_padding(tr_data_loader, val_data_loader, BATCH_SIZE):
    """Wrap the prepared sample lists in batching ``DataLoader`` objects.

    Args:
        tr_data_loader: list of training samples.
        val_data_loader: list of validation samples.
        BATCH_SIZE: number of samples per batch for both loaders.

    Returns:
        tuple: ``(train_iterator, valid_iterator)``. The training loader
        reshuffles its samples on every pass; the validation loader keeps
        the original order.
    """
    shuffle_by_split = ((tr_data_loader, True), (val_data_loader, False))
    train_iterator, valid_iterator = (
        DataLoader(samples, batch_size=BATCH_SIZE, shuffle=reshuffle)
        for samples, reshuffle in shuffle_by_split
    )
    return train_iterator, valid_iterator

In [None]:
# Number of samples per batch for both the training and validation loaders.
BATCH_SIZE = 128
train_iterator, valid_iterator = create_dataloader_iterators_with_padding(
    tr_data_loader, val_data_loader, BATCH_SIZE
)

In [None]:
class LSTMClassifier(nn.Module):
    """Single-layer, unidirectional LSTM sentence classifier.

    Token ids are embedded, the padded batch is packed so the LSTM ignores
    padded positions, and the final hidden state is projected to per-class
    log-probabilities.
    """

    def __init__(self, num_labels, vocab_size, embedding_dim, hidden_dim, padding_idx=None):
        """
        Args:
            num_labels: number of output classes.
            vocab_size: number of rows of the embedding table (all token ids,
                including any unknown/padding ids, must be below this value).
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            padding_idx: id whose embedding is kept at zero and not trained.
                Defaults to ``vocab_size - 1`` (the last id), which is 3001
                for the 3002-row table used in this notebook.
        """
        super(LSTMClassifier, self).__init__()
        if padding_idx is None:
            # Derive the padding id from the table size instead of hard-coding
            # it, so the model can be built for any vocabulary size.
            padding_idx = vocab_size - 1
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.embedding.weight.requires_grad = True

        # Documentation: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            num_layers=1,
            bidirectional=False,
        )
        self.linear = nn.Linear(hidden_dim, num_labels)
        # NOTE(review): this dropout layer is created but never applied in
        # forward(); kept as-is so training behaviour does not change.
        self.dropout = nn.Dropout(0.25)

    def forward(self, text, sequence_lens):
        """Return log-probabilities of shape (batch, num_labels).

        Args:
            text: batch-first tensor of padded token ids.
            sequence_lens: true (unpadded) length of every sequence in the
                batch, in batch order.
        """
        embedded = self.embedding(text)

        # pack_padded_sequence requires the lengths to live on the CPU.
        if torch.is_tensor(sequence_lens):
            sequence_lens = sequence_lens.cpu()

        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, sequence_lens, enforce_sorted=False, batch_first=True
        )
        # Only the final hidden state is needed, so the packed per-step
        # outputs are not unpacked.
        _, (h, _) = self.lstm(packed)

        # h[-1] is the last layer's final hidden state for each sequence.
        y = self.linear(h[-1])
        log_probs = F.log_softmax(y, dim=1)
        return log_probs

In [None]:
# Embedding table size: the fitted vocabulary plus two extra ids
# (one for out-of-vocabulary tokens, one for padding).
INPUT_DIM = VOCAB_SIZE + 2
OUTPUT_DIM = 2  # two classes: labels are 0 / 1
EMBEDDING_DIM = 100
HIDDEN_DIM = 20
# NLLLoss expects log-probabilities, which the model's log_softmax output provides.
criterion = nn.NLLLoss()

In [None]:
# Build the classifier from the hyper-parameters defined above.
model = LSTMClassifier(OUTPUT_DIM, INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM)

# Move the model and the loss to the selected device, then create the optimizer
# over the model's parameters.
model = model.to(device)
criterion = criterion.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
from sklearn.metrics import precision_recall_fscore_support


def calculate_performance(preds, y):
    """
    Returns precision, recall, fscore of the positive class (label 1) per batch

    Args:
        preds: tensor of per-class scores, one row per sample; the predicted
            class is the arg-max over dimension 1.
        y: tensor of gold labels (0 or 1), one per sample.

    Returns:
        tuple: ``(precision, recall, fscore)`` for class 1.
    """
    rounded_preds = preds.argmax(1)

    # scikit-learn's signature is (y_true, y_pred): the gold labels go first,
    # otherwise precision and recall are reported swapped.
    # labels=[0, 1] forces one entry per class even when a batch contains only
    # one of them, so indexing [1] below cannot raise IndexError.
    precision, recall, fscore, _ = precision_recall_fscore_support(
        y.cpu(), rounded_preds.cpu(), labels=[0, 1]
    )

    return precision[1], recall[1], fscore[1]

In [None]:
import torch.nn.functional as F


def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the batch-averaged metrics.

    Args:
        model: network called as ``model(text_vecs, sen_lens)``.
        iterator: iterable of batches; ``batch[0]`` holds the inputs,
            ``batch[1]`` the labels and, when a batch has more than two
            entries, ``batch[2]`` the sequence lengths and ``batch[3]`` the
            raw sentences.
        optimizer: optimizer stepped once per batch.
        criterion: loss function applied to ``(predictions, labels)``.

    Returns:
        tuple: ``(loss, precision, recall, fscore)``, each the mean of the
        per-batch values.
    """
    loss_sum = 0
    prec_sum = 0
    recall_sum = 0
    fscore_sum = 0

    model.train()

    for batch in iterator:
        text_vecs, labels = batch[0], batch[1]

        has_extras = len(batch) > 2
        sen_lens = batch[2] if has_extras else []
        texts = batch[3] if has_extras else []  # read but not used further

        optimizer.zero_grad()

        predictions = model(text_vecs, sen_lens)
        loss = criterion(predictions, labels)
        prec, recall, fscore = calculate_performance(predictions, labels)

        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        prec_sum += prec.item()
        recall_sum += recall.item()
        fscore_sum += fscore.item()

    n_batches = len(iterator)
    return (
        loss_sum / n_batches,
        prec_sum / n_batches,
        recall_sum / n_batches,
        fscore_sum / n_batches,
    )

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate the model on every batch and return the batch-averaged metrics.

    The model is switched to evaluation mode and gradients are disabled.

    Args:
        model: network called as ``model(text_vecs, sen_lens)``.
        iterator: iterable of batches; ``batch[0]`` holds the inputs,
            ``batch[1]`` the labels and, when a batch has more than two
            entries, ``batch[2]`` the sequence lengths and ``batch[3]`` the
            raw sentences.
        criterion: loss function applied to ``(predictions, labels)``.

    Returns:
        tuple: ``(loss, precision, recall, fscore)``, each the mean of the
        per-batch values.
    """
    loss_sum = 0
    prec_sum = 0
    recall_sum = 0
    fscore_sum = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text_vecs, labels = batch[0], batch[1]

            has_extras = len(batch) > 2
            sen_lens = batch[2] if has_extras else []
            texts = batch[3] if has_extras else []  # read but not used further

            predictions = model(text_vecs, sen_lens)
            loss = criterion(predictions, labels)
            prec, recall, fscore = calculate_performance(predictions, labels)

            loss_sum += loss.item()
            prec_sum += prec.item()
            recall_sum += recall.item()
            fscore_sum += fscore.item()

    n_batches = len(iterator)
    return (
        loss_sum / n_batches,
        prec_sum / n_batches,
        recall_sum / n_batches,
        fscore_sum / n_batches,
    )

In [None]:
import time

# This is just for measuring training time!
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        tuple[int, int]: ``(minutes, seconds)``, both truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
def training_loop(epoch_number=15):
    """Train and validate the notebook-level model for several epochs.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``train_iterator`` and ``valid_iterator``. After every epoch the elapsed
    time and the train/validation loss, precision, recall and F-score are
    printed.

    Args:
        epoch_number: number of epochs to run.
    """
    for epoch in range(epoch_number):
        start_time = time.time()

        train_loss, train_prec, train_rec, train_fscore = train(
            model, train_iterator, optimizer, criterion
        )
        valid_loss, valid_prec, valid_rec, valid_fscore = evaluate(
            model, valid_iterator, criterion
        )

        # The measured time covers both the training and the validation pass.
        epoch_mins, epoch_secs = epoch_time(start_time, time.time())

        print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
        print(
            f"\tTrain Loss: {train_loss:.3f} | Train Prec: {train_prec*100:.2f}% | Train Rec: {train_rec*100:.2f}% | Train Fscore: {train_fscore*100:.2f}%"
        )
        print(
            f"\t Val. Loss: {valid_loss:.3f} |  Val Prec: {valid_prec*100:.2f}% | Val Rec: {valid_rec*100:.2f}% | Val Fscore: {valid_fscore*100:.2f}%"
        )

In [None]:
# Train for 15 epochs, printing train/validation metrics after each one.
training_loop(epoch_number=15)