# Previous version: CrowdTruth-MRE
This notebook was created because this way the project stays cleaner and more understandable.

This time I try to organize the notebook more and label everything.
The content of this notebook overlaps with the previous one, at least at the beginning.

## Part One : Imports
In this part there are the imports of the necessary libraries and data files.

### Import libraries

In [None]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
%config IPCompleter.greedy=True

In [None]:
from matplotlib import pyplot as plt
%pylab inline
import tensorflow as tf

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.metrics import accuracy_score
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import train_test_split as split
from torch.utils.data import DataLoader

In [None]:
from sklearn.metrics import classification_report
import torch.nn.functional as F

### Import data files

In [None]:
# Directory holding the ground-truth workbooks. Set the MRE_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original location.
DATA_DIR = Path(os.environ.get("MRE_DATA_DIR", r'E:\Egyetem\tmp\Medical-Relation-Extraction\train_dev_test'))

# Open each workbook once; the individual sheets are read from these handles below.
cause_raw = pd.ExcelFile(DATA_DIR / 'ground_truth_cause.xlsx')
treat_raw = pd.ExcelFile(DATA_DIR / 'ground_truth_treat.xlsx')

#### These are the train datasets

In [None]:
# Earlier CSV-based loading, kept for reference only (disabled):
##cause_train = pd.read_csv(r'E:\Egyetem\tmp\Medical-Relation-Extraction\ground_truth_cause.csv')
##treat_train = pd.read_csv(r'E:\Egyetem\tmp\Medical-Relation-Extraction\ground_truth_treat.csv')

# Read the 'train' sheet of each already-opened workbook into a DataFrame.
cause_train = pd.read_excel(cause_raw, 'train')
treat_train = pd.read_excel(treat_raw, 'train')

#### These are the test datasets

In [None]:
# Earlier loading without an explicit sheet name, kept for reference only (disabled):
##cause_test = pd.read_excel(r'E:\Egyetem\tmp\Medical-Relation-Extraction\train_dev_test\ground_truth_cause.xlsx')
##treat_test = pd.read_excel(r'E:\Egyetem\tmp\Medical-Relation-Extraction\train_dev_test\ground_truth_treat.xlsx')

# Read the 'test' sheet of each already-opened workbook into a DataFrame.
cause_test = pd.read_excel(cause_raw, 'test')
treat_test = pd.read_excel(treat_raw, 'test')

## Part Two : Preparing the data
Below here happens the preparation of the data for the NLP model.

### Label extraction function:
This function creates a label for every row in the datasets:
- this label is **1**:
    - if the experts' score is 1
    - there is no expert score but the crowd score is greater than 0
- every other case the label is **0**

In [None]:
def extract_labels(df):
    """Return the binary relation label (0 or 1) for one dataset row.

    Parameters
    ----------
    df : row-like object exposing ``expert`` and ``crowd`` attributes
        (for example a row produced by ``DataFrame.iterrows()``).

    Returns
    -------
    int
        1 when the expert score equals 1, or when the expert score is missing
        and the crowd score is strictly positive; 0 in every other case.
    """
    expert_score, crowd_score = df.expert, df.crowd

    # An explicit expert judgement always decides the label.
    if expert_score == 1:
        return 1

    # Fall back to the crowd score only when no expert score exists.
    if pd.isnull(expert_score) and crowd_score > 0:
        return 1

    return 0

### Sentence modification function:
This function modifies the medical sentence:
- replaces *term1* with the word **ENTITY1** and *term2* with the word **ENTITY2** in the sentence

In [None]:
def mod_sentence(df):
    """Replace the two relation terms of a row's sentence with placeholders.

    Every occurrence of ``df.term1`` becomes ``"ENTITY1"`` and every occurrence
    of ``df.term2`` becomes ``"ENTITY2"``. Matching is case-sensitive.

    Both terms are substituted in one left-to-right pass with the longer term
    tried first. Replacing them one after the other would break whenever one
    term contains the other (e.g. "cancer" / "lung cancer") or whenever term2
    happens to match inside the freshly inserted "ENTITY1" text.

    Parameters
    ----------
    df : row-like object exposing ``sentence``, ``term1`` and ``term2`` string
        attributes (for example a row produced by ``DataFrame.iterrows()``).

    Returns
    -------
    str
        The sentence with the terms replaced. Empty terms are ignored.
    """
    sentence = df.sentence

    # term2 is registered first so that term1 wins if both terms are identical,
    # which is what replacing term1 first would have produced.
    placeholders = {}
    for term, tag in ((df.term2, "ENTITY2"), (df.term1, "ENTITY1")):
        if term:
            placeholders[term] = tag

    if not placeholders:
        return sentence

    # Longest alternative first: regex alternation takes the first branch that matches.
    ordered_terms = sorted(placeholders, key=len, reverse=True)
    pattern = "|".join(re.escape(term) for term in ordered_terms)

    return re.sub(pattern, lambda match: placeholders[match.group(0)], sentence)

### Creating the learning data to the model
Now it is time to use the functions on the datasets

#### First let's use the *cause* table to create the *sentence-label* pairs for the model training

In [None]:
# Placeholder-substituted sentences of the *cause* training split, one per row.
cause_train_sentence = [mod_sentence(row) for _, row in cause_train.iterrows()]

# Binary labels of the same rows, in the same order as the sentences above.
cause_train_label = [extract_labels(row) for _, row in cause_train.iterrows()]

#### Then create the same pairs for the testing

In [None]:
# Placeholder-substituted sentences of the *cause* test split, one per row.
cause_test_sentence = [mod_sentence(row) for _, row in cause_test.iterrows()]

# Binary labels of the same rows, in the same order as the sentences above.
cause_test_label = [extract_labels(row) for _, row in cause_test.iterrows()]

#### Now let's do the same to the *treat* table

In [None]:
# *treat* training split: modified sentences and their labels, row-aligned.
treat_train_sentence = [mod_sentence(row) for _, row in treat_train.iterrows()]
treat_train_label = [extract_labels(row) for _, row in treat_train.iterrows()]

# *treat* test split: modified sentences and their labels, row-aligned.
treat_test_sentence = [mod_sentence(row) for _, row in treat_test.iterrows()]
treat_test_label = [extract_labels(row) for _, row in treat_test.iterrows()]

In [None]:
# Sanity check: number of prepared *cause* training sentences.
len(cause_train_sentence)

### Now the data is prepared:
- **cause_train_sentence** / **treat_train_sentence** : these contain the sentences used to train the model
- **cause_train_label** / **treat_train_label** : these contain the labels for the training data
- **cause_test_sentence** / **treat_test_sentence** : these contain the sentences used for testing
- **cause_test_label** / **treat_test_label** : these contain the labels for the testing data

## Part Three : Building a model
Now the data is prepared, it's time for building a model

For the first try I will create a **Logistic Regression**:

In [None]:
# Classifier for the *cause* relation.
lr_c  = LogisticRegression(n_jobs=-1)

# Bag-of-words features: the vocabulary is learned from the training sentences only,
# then the same fitted vectorizer transforms both splits.
cause_vectorizer = CountVectorizer()
# NOTE: fit() returns the vectorizer itself, so X_c is the fitted vectorizer, not a feature matrix.
X_c = cause_vectorizer.fit(cause_train_sentence)
tr_vectors_c = X_c.transform(cause_train_sentence)
tst_vectors_c = X_c.transform(cause_test_sentence)

In [None]:
# Train the *cause* classifier on the bag-of-words training vectors.
lr_c.fit(tr_vectors_c,cause_train_label)

In [None]:
# Predict on the test split and report plain accuracy against the test labels.
lr_pred_c = lr_c.predict(tst_vectors_c)
print("Logistic Regression Test accuracy : {}".format(accuracy_score(cause_test_label, lr_pred_c)))

In [None]:
# Same pipeline as for *cause*, applied to the *treat* relation.
lr_t = LogisticRegression(n_jobs=-1)

# Vocabulary is learned from the *treat* training sentences only.
treat_vectorizer = CountVectorizer()
# NOTE: fit() returns the vectorizer itself, so X_t is the fitted vectorizer, not a feature matrix.
X_t = treat_vectorizer.fit(treat_train_sentence)
tr_vectors_t = X_t.transform(treat_train_sentence)
tst_vectors_t = X_t.transform(treat_test_sentence)

lr_t.fit(tr_vectors_t,treat_train_label)

# Evaluate on the test split.
lr_pred_t = lr_t.predict(tst_vectors_t)
print("Logistic Regression Test accuracy : {}".format(accuracy_score(treat_test_label, lr_pred_t)))

### Logistic Regression results:
- **CAUSE**:
    - Accuracy: 95.5%
- **TREAT**:
    - Accuracy: 76%

Although this simple solution gives pretty fair results, I am going to try to improve that with a **neural network**: 

In [None]:
# Fixed seed for PyTorch's random number generator.
SEED = 1234

torch.manual_seed(SEED)
# Request deterministic cuDNN algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Bag-of-words vectorizer for the neural network, fitted on the *treat* training sentences.
vectorizer = CountVectorizer()

# NOTE: fit() returns the vectorizer itself; word_to_ix.vocabulary_ maps each word to its column index.
word_to_ix = vectorizer.fit(treat_train_sentence)

In [None]:
# Number of distinct words learned by the vectorizer; used as the network's input width.
VOCAB_SIZE = len(word_to_ix.vocabulary_)
# Number of output classes (labels are 0 or 1).
OUT_DIM = 2

BATCH_SIZE = 256
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# NOTE: the "validation" data used from here on is the *treat* test split.
tr_data, val_data = treat_train_sentence, treat_test_sentence
tr_labels, val_labels = treat_train_label, treat_test_label

# Turn the sentences into dense float bag-of-words tensors
# (transform() yields a sparse matrix, toarray() densifies it).
tr_data_vecs = torch.FloatTensor(word_to_ix.transform(tr_data).toarray())
val_data_vecs = torch.FloatTensor(word_to_ix.transform(val_data).toarray())

In [None]:
# Pair every bag-of-words vector with its label; these lists serve as the datasets.
tr_data_loader = list(zip(tr_data_vecs, tr_labels))
val_data_loader = list(zip(val_data_vecs, val_labels))

# Training batches are reshuffled every epoch; validation batches keep their order.
train_iterator = DataLoader(tr_data_loader, batch_size=BATCH_SIZE, shuffle=True)
valid_iterator = DataLoader(val_data_loader, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class MRE(nn.Module):
    """Feed-forward classifier over bag-of-words vectors.

    Architecture: Linear(vocab_size, 250) -> ReLU -> Linear(250, 100) -> ReLU
    -> Linear(100, num_labels) -> LogSoftmax.

    The output is log-probabilities, which is what ``nn.NLLLoss`` expects.
    The raw scores of the last linear layer are fed to the log-softmax
    directly. Squashing them through a sigmoid first would confine them to
    (0, 1), so with two classes no probability could exceed e / (1 + e), about
    0.731, and the loss could never drop below about 0.313.

    Parameters
    ----------
    num_labels : int
        Number of output classes.
    vocab_size : int
        Width of the input bag-of-words vectors.
    """

    def __init__(self, num_labels, vocab_size):
        super(MRE, self).__init__()

        self.hidden1 = nn.Linear(vocab_size, 250)
        self.act1 = nn.ReLU()

        self.hidden2 = nn.Linear(250, 100)
        self.act2 = nn.ReLU()

        self.hidden3 = nn.Linear(100, num_labels)
        # Output activation: log-softmax over the class dimension.
        self.act3 = nn.LogSoftmax(dim=1)

    def forward(self, bow_vec):
        """Map a ``(batch, vocab_size)`` float tensor to ``(batch, num_labels)`` log-probabilities."""
        hidden = self.act1(self.hidden1(bow_vec))
        hidden = self.act2(self.hidden2(hidden))
        return self.act3(self.hidden3(hidden))

In [None]:
# Instantiate the feed-forward classifier: OUT_DIM output classes, VOCAB_SIZE input features.
model = MRE(OUT_DIM, VOCAB_SIZE)

In [None]:
# Adam optimizer over all model parameters; negative log-likelihood loss,
# which matches the log-probabilities returned by the model's forward pass.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.NLLLoss()

# Move the model and the loss to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def class_accuracy(preds, y):
    """Compute the accuracy of one batch.

    Parameters
    ----------
    preds : tensor of per-class scores, one row per sample.
    y : tensor of true class indices, one entry per sample.

    Returns
    -------
    Scalar tensor: the fraction of rows whose highest-scoring class equals ``y``.
    """
    hits = torch.eq(preds.argmax(1), y).float()
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss and accuracy.

    Parameters
    ----------
    model : module called as ``model(inputs)``.
    iterator : sized iterable yielding ``(inputs, labels)`` batches.
    optimizer : optimizer whose ``zero_grad``/``step`` are called once per batch.
    criterion : loss callable taking ``(outputs, labels)``.

    Returns
    -------
    (float, float)
        Loss and accuracy, each averaged over the number of batches
        (every batch counts equally, whatever its size).
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch_inputs, batch_targets in iterator:
        # Batches are moved to the module-level `device` before the forward pass.
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()

        outputs = model(batch_inputs)
        batch_loss = criterion(outputs, batch_targets)
        batch_acc = class_accuracy(outputs, batch_targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating it.

    The model is switched to evaluation mode and every batch is processed with
    gradient tracking disabled.

    Parameters
    ----------
    model : module called as ``model(inputs)``.
    iterator : sized iterable yielding ``(inputs, labels)`` batches.
    criterion : loss callable taking ``(outputs, labels)``.

    Returns
    -------
    (float, float)
        Loss and accuracy, each averaged over the number of batches.
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for batch_inputs, batch_targets in iterator:
            # Batches are moved to the module-level `device` before the forward pass.
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            outputs = model(batch_inputs)

            loss_total += criterion(outputs, batch_targets).item()
            acc_total += class_accuracy(outputs, batch_targets).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns a ``(minutes, seconds)`` tuple of ints; fractions of a second are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
# Number of full passes over the training data.
N_EPOCHS = 15

# NOTE(review): initialised but never updated or read in this loop,
# so no "best" checkpoint is actually tracked here.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One optimisation pass over the training batches ...
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # ... followed by a gradient-free pass over the validation batches.
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Per-epoch report: wall-clock time, then loss/accuracy for both splits.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

### Results:
- **CAUSE**:
    - Not yet calculated
- **TREAT**:
    - *Train*:
        - Accuracy: 99.55%
        - Loss: 0.318
    - *Validation*:
        - Accuracy: 79.38%
        - Loss: 0.515

## Now the next step is an LSTM network

In [None]:
import nltk

nltk.download("punkt")
nltk.download("wordnet")

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

class LemmaTokenizer(object):
    """Callable tokenizer that splits text with NLTK's ``word_tokenize`` and
    lemmatizes every resulting token with a ``WordNetLemmatizer``.

    Instances can be passed as the ``tokenizer`` argument of ``CountVectorizer``.
    """

    def __init__(self):
        # One lemmatizer instance is reused for all calls.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens of the string ``articles``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(articles)))

### This CountVectorizer will:
- have at most 5000 features
- use the words' lexical (lemmatized) form
- ignore English stop words ('the', 'and', etc.)

In [None]:
# Vectorizer for the LSTM input: vocabulary capped at 5000 entries, tokens produced by
# LemmaTokenizer, scikit-learn's built-in English stop-word list removed.
vectorizer_embed = CountVectorizer(max_features=5000, tokenizer=LemmaTokenizer(), stop_words="english")

# NOTE: fit() returns the vectorizer itself; the vocabulary is learned from the *cause* training sentences.
word_to_ix_embed = vectorizer_embed.fit(cause_train_sentence)

In [None]:
# Callable that applies the vectorizer's full text-to-tokens pipeline to a string.
analyzer = word_to_ix_embed.build_analyzer()

In [None]:
# Quick demonstration of the analyzer on an example sentence.
analyzer("i have had the best time playing tennis")

In [None]:
# Size of the learned vocabulary (at most 5000 because of max_features).
len(word_to_ix_embed.vocabulary_)

### Text to tokens function:
This function will convert the sentences to arrays of numbers:
- if the word is in the vocabulary (defined above), then the word gets the value set in the vocabulary
- if the word is not in the vocabulary then it gets _unknown_ value (this case the value 5000)

In [None]:
def text_to_token(sentences, analyzer, vocab, unknown_id=5000):
    """Convert sentences into tensors of vocabulary indices.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to encode.
    analyzer : callable
        Maps a sentence to its list of tokens.
    vocab : dict
        Maps a token to its integer index.
    unknown_id : int, optional
        Index given to tokens missing from ``vocab``. Defaults to 5000, the
        first index after a 5000-word vocabulary (indices 0-4999). It must not
        coincide with a real vocabulary index, otherwise unknown words become
        indistinguishable from that word.

    Returns
    -------
    list of torch.LongTensor
        One 1-D tensor per sentence, placed on the module-level ``device``.
        A sentence that yields no tokens is encoded as a single ``unknown_id``
        so that no tensor is empty.
    """
    sentences_as_tokens = []

    for sentence in sentences:
        token_ids = [vocab.get(token, unknown_id) for token in analyzer(sentence)]

        # Keep at least one element so that every sequence has a non-zero length.
        if not token_ids:
            token_ids = [unknown_id]

        sentences_as_tokens.append(torch.LongTensor(token_ids).to(device))

    return sentences_as_tokens

In [None]:
# Encode the *cause* training sentences as index tensors using the lemmatized vocabulary.
vocab = word_to_ix_embed.vocabulary_
cause_as_ids = text_to_token(cause_train_sentence, analyzer, vocab)

In [None]:
# Inspection only: container type of the training sentences.
type(cause_train_sentence)

In [None]:
# Inspection only: the encoded form of one example sentence.
cause_as_ids[3]

In [None]:
from torch.nn.utils.rnn import pad_sequence

### Same length sentences
Because the sentences must all have the same length, I have to pad the shorter ones.
The _padding_value_ will be 5001, because 0-4999 are the values of the different words, 5000 is the value for undefined words, and the next free value is 5001.
This value marks positions after the end of the sentence; it is just a placeholder.

In [None]:
# Stack the variable-length index tensors into one (n_sentences, max_len) tensor,
# filling the tail of shorter sentences with the padding id 5001.
cause_padded = pad_sequence(cause_as_ids, batch_first=True, padding_value=5001)

In [None]:
# Inspection only: container type of the training labels.
type(cause_train_label)

In [None]:
def dataloaders_wtih_padding(train_data, train_label, val_data, val_label, word_to_ix, padding_value=5001):
    """Build padded, index-encoded datasets for the training and validation splits.

    Each split is tokenized once with ``text_to_token`` (using the module-level
    ``analyzer`` and the vocabulary of ``word_to_ix``), padded to the length of
    its own longest sentence, and zipped with labels, true lengths and the
    original sentences.

    Parameters
    ----------
    train_data, val_data : list of str
        Sentences of each split.
    train_label, val_label : list of int
        Labels aligned with the sentences.
    word_to_ix : fitted vectorizer exposing ``vocabulary_``.
        NOTE(review): tokens come from the module-level ``analyzer``, so this
        vectorizer should be the one that analyzer was built from.
    padding_value : int, optional
        Index used to fill shorter sentences; defaults to 5001.

    Returns
    -------
    (list, list)
        Training and validation datasets. Every element is a tuple
        ``(padded_ids, label, length, sentence)``.
    """
    vocabulary = word_to_ix.vocabulary_

    def build_split(sentences, labels):
        # Tokenize once; the same token tensors provide both the padded matrix and the lengths.
        token_tensors = text_to_token(sentences, analyzer, vocabulary)
        padded = pad_sequence(token_tensors, batch_first=True, padding_value=padding_value)

        label_tensor = torch.LongTensor(labels).to(device)
        # Lengths are deliberately not moved to `device`: they are created as a plain CPU tensor.
        lengths = torch.LongTensor([len(ids) for ids in token_tensors])

        return list(zip(padded, label_tensor, lengths, sentences))

    tr_data_loader = build_split(train_data, train_label)
    val_data_loader = build_split(val_data, val_label)

    return tr_data_loader, val_data_loader
    

In [None]:
# Encode the *cause* splits with the lemmatizing 5000-feature vectorizer. The `analyzer`
# and the embedding size are both derived from `word_to_ix_embed`, so its vocabulary must
# be used here. The plain `word_to_ix` vectorizer was fitted on the *treat* sentences and
# its indices do not match.
tr_data_loader, val_data_loader = dataloaders_wtih_padding(
    cause_train_sentence, cause_train_label,
    cause_test_sentence, cause_test_label,
    word_to_ix_embed
)

In [None]:
class LSTMClassifier(nn.Module):
    """Single-layer, unidirectional LSTM classifier over sequences of word indices.

    Pipeline: embedding -> packed LSTM -> dropout on the final hidden state
    -> linear layer -> log-softmax. The output is log-probabilities, which is
    what ``nn.NLLLoss`` expects.

    Parameters
    ----------
    num_labels : int
        Number of output classes.
    vocab_size : int
        Number of rows of the embedding table; it must be larger than every
        index fed to the model, including ``padding_idx``.
    embedding_dim : int
        Size of each word embedding.
    hidden_dim : int
        Size of the LSTM hidden state.
    padding_idx : int, optional
        Index whose embedding stays zero and receives no gradient.
        Defaults to 5001, the padding id used elsewhere in this notebook.
    """

    def __init__(self, num_labels, vocab_size, embedding_dim, hidden_dim, padding_idx=5001):
        super(LSTMClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        # The embeddings are trained together with the rest of the network.
        self.embedding.weight.requires_grad = True

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=1, bidirectional=False)
        self.linear = nn.Linear(hidden_dim, num_labels)
        self.dropout = nn.Dropout(0.25)

    def forward(self, text, sequence_lens):
        """Classify a batch of padded index sequences.

        Parameters
        ----------
        text : LongTensor of shape ``(batch, max_len)``
            Padded word indices.
        sequence_lens : 1-D tensor or list of int
            True (unpadded) length of every sequence in the batch.

        Returns
        -------
        Tensor of shape ``(batch, num_labels)`` holding log-probabilities.
        """
        embedded = self.embedding(text)

        # pack_padded_sequence requires the lengths on the CPU, wherever the data lives.
        lengths = torch.as_tensor(sequence_lens).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False, batch_first=True)
        _, (h, _) = self.lstm(packed)

        # h[-1] is the last layer's hidden state after each sequence's final real token.
        # Dropout is active in training mode only.
        y = self.linear(self.dropout(h[-1]))
        log_probs = F.log_softmax(y, dim=1)
        return log_probs
        

In [None]:
# Hyper-parameters of the LSTM model. NOTE: VOCAB_SIZE, BATCH_SIZE, criterion and model
# rebind names that the feed-forward section above already defined.
VOCAB_SIZE = len(word_to_ix_embed.vocabulary_)
# Two embedding rows beyond the vocabulary - presumably one for the unknown-word id
# and one for the padding id described in the markdown above.
INPUT_DIM = VOCAB_SIZE + 2
OUTPUT_DIM = 2
EMBEDDING_DIM = 100
HIDDEN_DIM = 20
criterion = nn.NLLLoss()
BATCH_SIZE = 256

model = LSTMClassifier(OUTPUT_DIM, INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM)
# Last expression of the cell: displays the module structure.
model

In [None]:
# Batch the padded LSTM datasets; training batches are reshuffled every epoch.
train_iterator = DataLoader(
        tr_data_loader,
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

# Validation batches keep their original order.
valid_iterator = DataLoader(
        val_data_loader,
        batch_size=BATCH_SIZE,
        shuffle=False,
    )

In [None]:
# Move the LSTM model and the loss to the selected device first, then build the
# optimizer once over the model's parameters.
model = model.to(device)
criterion = criterion.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
def train_embed(model, iterator, optimizer, criterion):
    """Run one training epoch of a length-aware model.

    Parameters
    ----------
    model : module called as ``model(padded_ids, lengths)``.
    iterator : sized iterable of batches indexed as
        ``(padded_ids, labels, lengths, sentences)``.
    optimizer : optimizer whose ``zero_grad``/``step`` are called once per batch.
    criterion : loss callable taking ``(predictions, labels)``.

    Returns
    -------
    (float, float, float, float)
        Loss, precision, recall and F-score, each averaged over the number of
        batches. The three metrics come from ``calculate_performance``, which
        is defined outside this cell.
    """
    model.train()

    loss_sum = 0
    prec_sum = 0
    recall_sum = 0
    fscore_sum = 0

    for batch in iterator:
        text_vecs, labels, sen_lens, texts = batch[0], batch[1], batch[2], batch[3]

        optimizer.zero_grad()

        predictions = model(text_vecs, sen_lens)
        loss = criterion(predictions, labels)
        prec, recall, fscore = calculate_performance(predictions, labels)

        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        prec_sum += prec.item()
        recall_sum += recall.item()
        fscore_sum += fscore.item()

    n_batches = len(iterator)
    return (
        loss_sum / n_batches,
        prec_sum / n_batches,
        recall_sum / n_batches,
        fscore_sum / n_batches,
    )

In [None]:
def evaluate_embed(model, iterator, criterion):
    """Score a length-aware model on ``iterator`` without updating it.

    The model is put in evaluation mode and every batch is scored with gradient
    tracking disabled. No backward pass and no optimizer step take place:
    evaluation must leave the model's weights untouched.

    Parameters
    ----------
    model : module called as ``model(padded_ids, lengths)``.
    iterator : sized iterable of batches indexed as
        ``(padded_ids, labels, lengths, sentences)``.
    criterion : loss callable taking ``(predictions, labels)``.

    Returns
    -------
    (float, float, float, float)
        Loss, precision, recall and F-score, each averaged over the number of
        batches. The three metrics come from ``calculate_performance``, which
        is defined outside this cell.
    """
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0

    model.eval()

    with torch.no_grad():

        for batch in iterator:
            text_vecs = batch[0]
            labels = batch[1]
            sen_lens = batch[2]

            # The forward pass and the metrics are computed for every batch, inside the loop.
            predictions = model(text_vecs, sen_lens)

            loss = criterion(predictions, labels)

            prec, recall, fscore = calculate_performance(predictions, labels)

            epoch_loss += loss.item()
            epoch_prec += prec.item()
            epoch_recall += recall.item()
            epoch_fscore += fscore.item()

    return(epoch_loss / len(iterator),
           epoch_prec / len(iterator),
           epoch_recall / len(iterator),
           epoch_fscore / len(iterator)
    )

In [None]:
## N_EPOCHS = 15

## best_valid_loss = float("inf")

## for epoch in range(N_EPOCHS):
    ## start_time = time.time()
    
    ## train_loss, train_prec, train_rec, train_fscore = train_embed(model, train_iterator, optimizer, criterion)
    
    ## valid_loss, valid_prec, valid_rec, valid_fscore = evaluate_embed(model, valid_iterator, criterion)
    
    ## end_time = time.time()

    ## epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    ## print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    ## print(f"\tTrain Loss: {train_loss:.3f} | Train Prec: {train_prec*100:.2f}% | Train Rec: {train_rec*100:.2f}% | Train Fscore: {train_fscore*100:.2f}%")
    ## print(f"\t Val. Loss: {valid_loss:.3f} |  Val Prec: {valid_prec*100:.2f}% | Val Rec: {valid_rec*100:.2f}% | Val Fscore: {valid_fscore*100:.2f}%")