In [None]:
# Colab-only: mount Google Drive at /content/drive so the notebook folder
# used by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Make the notebook folder on the mounted Drive the working directory, so the
# relative file names used later (e.g. 'dev.csv') resolve against it, then
# list its contents with a shell command.
os.chdir("/content/drive/My Drive/Colab Notebooks")
!ls

In [None]:
import pandas as pd

import csv
import os
# Folder handed to Data(path=...) further down; it is expected to contain
# train.tsv and dev.tsv.
path = '/content/drive/My Drive/Colab Notebooks/'

### Input text format <br>
#### Three kinds of rows appear in the input:<br>
sentence_idx  &emsp;        word       &emsp;     tag<br>
1           &emsp;           EU         &emsp;     O<br>
2         &emsp;            word      &emsp;       O<br>
<br>
1            &emsp;       Blackburn   &emsp;   I-PER<br>
2           &emsp;         1996         &emsp;     O<br>
3          &emsp;           .          &emsp;      O<br>
<br>
1         &emsp;       -DOCSTART-     &emsp;     O<br>
2        &emsp;                       &emsp;       O<br>


#### Conclusion: <br>
To feed the input file to torchtext's `datasets.SequenceTaggingDataset.splits`, I only need the word and tag columns, and I also want to remove the `-DOCSTART-` rows. <br>
Note that sentences are separated by a blank line, both in the original file and in the input format that torchtext expects.

In [None]:
def readfile(filename, csv_outfile, tsv_outfile):
    """Convert a space-separated tagging file into a CSV and a torchtext TSV.

    Every non-blank line of ``filename`` is split on single spaces and is
    expected to look like ``<sentence_idx> <word> <tag>``, where
    ``sentence_idx`` restarts at 1 on the first token of each sentence.

    Args:
        filename: Path of the raw input file.
        csv_outfile: Destination CSV. Receives a
            ``sentence_idx,word,NER_tag`` header followed by every non-blank
            input row, unchanged.
        tsv_outfile: Destination TSV. Receives one ``word<TAB>tag`` line per
            token, a blank line between sentences, no header, and no
            ``-DOCSTART-`` rows.

    Returns:
        True once both files have been written.
    """
    # Read as text. Binary mode yields bytes, on which the str-based
    # strip/split below would raise TypeError.
    with open(filename, 'r', encoding='utf-8') as infile:
        stripped = (line.rstrip('\r\n') for line in infile)
        rows = [line.split(' ') for line in stripped if line]

    # newline='' lets the csv module control line endings itself.
    with open(csv_outfile, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(('sentence_idx', 'word', 'NER_tag'))
        writer.writerows(rows)

    # Write the TSV straight from the parsed rows. Going through
    # pandas.read_csv would turn literal tokens such as "NA" or "null" into
    # missing values.
    with open(tsv_outfile, 'w', encoding='utf-8') as outfile:
        wrote_token = False
        for row in rows:
            # Pad short rows (e.g. a file without a tag column) so that the
            # unpacking below never fails.
            sentence_idx, word, tag = (row + ['', ''])[:3]
            if word == '-DOCSTART-':
                continue
            # A sentence index of 1 starts a new sentence: separate it from
            # the previous one with a blank line, but never emit a leading
            # blank line at the top of the file.
            if sentence_idx == '1' and wrote_token:
                outfile.write('\n')
            outfile.write(f'{word}\t{tag}\n')
            wrote_token = True

    return True

## These preprocessing steps were already run in another file; this notebook only uses the cleaned data for training.

In [None]:
# Install torchtext pinned to 0.6.0; the torchtext.data / torchtext.datasets
# imports in the next cell rely on it.
! pip install torchtext==0.6.0

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.data import Field, BucketIterator
from torchtext.datasets import SequenceTaggingDataset

import spacy
import numpy as np

import time
import random

In [None]:
import warnings
# Silence FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import torch.nn.functional as F

In [None]:
# One seed shared by every random number generator this notebook touches.
SEED = 1000

# Request deterministic cuDNN kernels, then seed torch, NumPy and the stdlib.
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [None]:
class Data(object):
  """Loads train.tsv / dev.tsv from a folder and prepares them for training.

  After construction the instance exposes the two torchtext fields, the two
  datasets, the two batch iterators and the padding indices of both fields.
  """

  def __init__(self, path, batch_size):
    """Build fields, datasets, vocabularies and iterators.

    Args:
      path: Folder containing ``train.tsv`` and ``dev.tsv``.
      batch_size: Batch size used for both iterators.
    """
    # Words keep their original casing; the tag field has no unknown token.
    self.word_field = Field(lower=False)
    self.tag_field = Field(unk_token=None)
    columns = (("word", self.word_field), ("tag", self.tag_field))

    # Parse both splits with torchtext's built-in sequence-tagging reader.
    # (The test split is not loaded here.)
    parsed_splits = SequenceTaggingDataset.splits(
        path=path,
        train="train.tsv",
        validation="dev.tsv",
        fields=columns,
    )
    self.train_dataset, self.val_dataset = parsed_splits

    # Vocabularies come from the training split only. min_freq=3 leaves words
    # seen fewer than 3 times out of the word vocabulary.
    self.word_field.build_vocab(self.train_dataset.word, min_freq=3)
    self.tag_field.build_vocab(self.train_dataset.tag)

    # Batches are created on the GPU whenever one is available.
    if torch.cuda.is_available():
      target_device = torch.device("cuda")
    else:
      target_device = torch.device("cpu")
    iterators = BucketIterator.splits(
        datasets=(self.train_dataset, self.val_dataset),
        batch_size=batch_size,
        device=target_device,
    )
    self.train_iter, self.val_iter = iterators

    # Padding indices, needed so padding can be ignored in loss and accuracy.
    word_vocab = self.word_field.vocab
    tag_vocab = self.tag_field.vocab
    self.word_pad_idx = word_vocab.stoi[self.word_field.pad_token]
    self.tag_pad_idx = tag_vocab.stoi[self.tag_field.pad_token]

In [None]:
# Load the corpus from `path` and batch it 16 sentences at a time.
data = Data(path=path, batch_size=16)

In [None]:
# Report how many sentences (dataset examples) each split contains.
print('This train dataset have %d sentences.'%len(data.train_dataset))
print('This validation dataset have %d sentences.'%len(data.val_dataset))

In [None]:
# Show the padding token of the tag field (its index is data.tag_pad_idx).
print(data.tag_field.pad_token)

In [None]:
# Dump the full word -> index mapping.
# NOTE(review): this prints the entire vocabulary; consider showing only a sample.
print(data.word_field.vocab.stoi)

In [None]:
# Show the tag -> index mapping built from the training split.
print(data.tag_field.vocab.stoi)

In [None]:
class BiLSTM(nn.Module):
    """Word-level tagger: embedding -> dropout -> BiLSTM -> dropout -> linear -> ELU.

    Args:
        embedding_dim: Size of each word embedding.
        hidden_dim: Hidden size of the LSTM per direction.
        input_dim: Number of rows in the embedding table (vocabulary size).
        emb_dropout: Dropout probability applied to the embeddings.
        lstm_dropout: Value forwarded to ``nn.LSTM(dropout=...)``.
        fc_dropout: Dropout probability applied to the LSTM output.
        output_dim: Size of the per-token score vector.
        word_pad_idx: Index of the padding token in the word vocabulary.
    """

    def __init__(self, embedding_dim, hidden_dim, input_dim, emb_dropout,
                 lstm_dropout, fc_dropout, output_dim, word_pad_idx):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.word_pad_idx = word_pad_idx

        self.word_embeddings = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim, padding_idx=word_pad_idx)
        # dropout before bilstm layer
        self.emb_dropout = nn.Dropout(emb_dropout)

        # nn.LSTM applies `dropout` only between stacked layers, so with
        # num_layers=1 the value is accepted but has no effect.
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim,
                            num_layers=1, bidirectional=True, dropout=lstm_dropout)
        # dropout after bilstm layer
        self.fc_dropout = nn.Dropout(fc_dropout)
        # The bidirectional LSTM concatenates both directions: hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim*2, output_dim)

    def forward(self, sentence):
        """Score every token of a batch of sentences.

        Args:
            sentence: LongTensor of word indices; the training loop passes it
                as [sent len, batch size].

        Returns:
            Tensor with one ``output_dim``-sized score vector per token.
        """
        # Apply the embedding dropout created in __init__; previously the
        # layer existed but was never used in the forward pass.
        embedding_out = self.emb_dropout(self.word_embeddings(sentence))
        lstm_out, _ = self.lstm(embedding_out)
        tag_space = self.fc(self.fc_dropout(lstm_out))
        tag_score = F.elu(tag_space)
        return tag_score

    def init_weight(self):
        """Re-initialise every parameter from N(0, 0.1).

        This covers all named parameters, so the padding row of the
        embedding table is overwritten as well.
        """
        for name, param in self.named_parameters():
            nn.init.normal_(param.data, mean=0, std=0.1)

    def count_parameters(self):
        """Return the number of trainable scalar parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [None]:
# Instantiate the tagger; vocabulary size and padding index come from `data`.
bilstm = BiLSTM(
    embedding_dim=100,
    hidden_dim=256,
    input_dim=len(data.word_field.vocab),
    emb_dropout=0.3,
    lstm_dropout=0.33,
    fc_dropout=0.25,
    output_dim=128,
    word_pad_idx=data.word_pad_idx,
)
print(f"The model has {bilstm.count_parameters():,} trainable parameters.")
print(bilstm)

In [None]:
# Data creates its batches on CUDA whenever it is available, so the model has
# to live on the same device. Move it before building the optimizer so the
# optimizer is created from the parameters that will actually be trained.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = bilstm.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
n_epochs = 10

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
class NER(object):
  """Bundles a tagging model with its data, optimizer and loss for training,
  evaluation and inference.

  Args:
    model: The tagging network; called as ``model(word_indices)``.
    data: Object exposing ``train_iter``, ``val_iter``, ``word_field``,
      ``tag_field`` and ``tag_pad_idx`` (see ``Data``).
    optimizer: Optimizer already bound to the model's parameters.
    loss_fn: Loss *class* (not instance); it is instantiated here with
      ``ignore_index`` set to the tag padding index.
  """

  def __init__(self, model, data, optimizer, loss_fn):
    self.model = model
    self.data = data
    self.optimizer = optimizer
    self.loss_fn = loss_fn(ignore_index=self.data.tag_pad_idx)

  def _model_device(self):
    """Return the device of the model's first parameter (CPU if it has none)."""
    first_param = next(self.model.parameters(), None)
    if first_param is None:
      return torch.device("cpu")
    return first_param.device

  @staticmethod
  def epoch_time(start_time, end_time):
    """Split an elapsed time in seconds into whole (minutes, seconds)."""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

  def accuracy(self, preds, y):
    """Token accuracy over the non-padding positions.

    Args:
      preds: Flattened scores, one row per token.
      y: Flattened gold tag indices, one per token.

    Returns:
      One-element float tensor holding the fraction of non-padding tokens
      whose arg-max prediction equals the gold tag; 0.0 when every position
      is padding.
    """
    predicted = preds.argmax(dim=1)  # index of the highest score per token
    non_pad = y != self.data.tag_pad_idx  # boolean mask of real tokens
    total = non_pad.sum()
    if total.item() == 0:
      # Nothing to score: avoid 0/0, which would turn the epoch average into NaN.
      return torch.zeros(1, device=y.device)
    correct = predicted[non_pad].eq(y[non_pad]).sum()
    # Both operands live on y's device; reshape keeps the one-element shape.
    return (correct.float() / total.float()).reshape(1)

  def epoch(self):
    """Run one training pass; return (mean batch loss, mean batch accuracy)."""
    epoch_loss = 0
    epoch_acc = 0
    device = self._model_device()
    self.model.train()
    for batch in self.data.train_iter:
      # Keep the inputs on the model's device.
      # text: [sent len, batch size]
      text = batch.word.to(device)
      # tags: [sent len, batch size]
      true_tags = batch.tag.to(device)
      self.optimizer.zero_grad()
      pred_tags = self.model(text)
      # To compute loss and accuracy, flatten predictions and gold tags:
      # pred_tags -> [sent len * batch size, output dim]
      pred_tags = pred_tags.view(-1, pred_tags.shape[-1])
      # true_tags -> [sent len * batch size]
      true_tags = true_tags.view(-1)
      batch_loss = self.loss_fn(pred_tags, true_tags)
      batch_acc = self.accuracy(pred_tags, true_tags)
      batch_loss.backward()
      self.optimizer.step()
      epoch_loss += batch_loss.item()
      epoch_acc += batch_acc.item()
    return epoch_loss / len(self.data.train_iter), epoch_acc / len(self.data.train_iter)

  def evaluate(self, iterator):
    """Score the model on ``iterator`` without updating it.

    Returns:
      (mean batch loss, mean batch accuracy).
    """
    epoch_loss = 0
    epoch_acc = 0
    device = self._model_device()
    self.model.eval()
    with torch.no_grad():
      for batch in iterator:
        text = batch.word.to(device)
        true_tags = batch.tag.to(device)
        pred_tags = self.model(text)
        pred_tags = pred_tags.view(-1, pred_tags.shape[-1])
        true_tags = true_tags.view(-1)
        batch_loss = self.loss_fn(pred_tags, true_tags)
        batch_acc = self.accuracy(pred_tags, true_tags)
        epoch_loss += batch_loss.item()
        epoch_acc += batch_acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

  # main training sequence
  def train(self, n_epochs):
    """Train for ``n_epochs`` epochs, printing train and validation metrics."""
    val_loss = val_acc = None
    for epoch in range(n_epochs):
      start_time = time.time()
      train_loss, train_acc = self.epoch()
      end_time = time.time()
      epoch_mins, epoch_secs = NER.epoch_time(start_time, end_time)
      print(f"Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
      print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%")
      val_loss, val_acc = self.evaluate(self.data.val_iter)
      print(f"\tValidation Loss: {val_loss:.3f} | Validation Acc: {val_acc * 100:.2f}%")
    # Final summary line; skipped when no epoch ran, since there is then no
    # validation result to report.
    if val_loss is not None:
      print(f"\tValidation Loss: {val_loss:.3f} | Validation Acc: {val_acc * 100:.2f}%")

  def infer(self, sentence, true_tags=None, sentence_idxs=None):
    """Predict one tag per token of ``sentence``.

    Args:
      sentence: Iterable of tokens; the whole iterable is fed to the model
        as a single sequence.
      true_tags: Accepted for backward compatibility; not used.
      sentence_idxs: Accepted for backward compatibility; not used.

    Returns:
      List of predicted tag strings, one per token.
    """
    self.model.eval()
    tokens = list(sentence)
    if not tokens:
      return []
    # transform to indices based on corpus vocab
    word_to_idx = self.data.word_field.vocab.stoi
    numericalized_tokens = [word_to_idx[t] for t in tokens]
    # Shape [sent len, 1]: a batch holding one sequence, on the model's device.
    token_tensor = torch.LongTensor(numericalized_tokens).unsqueeze(-1)
    token_tensor = token_tensor.to(self._model_device())
    with torch.no_grad():
      predictions = self.model(token_tensor)
    # The model's output size is chosen independently of the tag vocabulary,
    # so only consider score columns that correspond to an actual tag.
    idx_to_tag = self.data.tag_field.vocab.itos
    top_predictions = predictions[..., :len(idx_to_tag)].argmax(-1)
    predicted_tags = [idx_to_tag[i] for i in top_predictions.reshape(-1).tolist()]
    return predicted_tags

In [None]:
# Wire model, data and optimizer together. The loss is passed as a class;
# NER instantiates it with the tag padding index ignored.
ner = NER(model=bilstm, data=data, optimizer=optimizer, loss_fn=nn.CrossEntropyLoss)

In [None]:
# Train for n_epochs epochs; train and validation metrics print after each.
ner.train(n_epochs)

In [None]:
# Save the trained model to Drive. The whole model object is passed to
# torch.save here, not just its state_dict.
torch.save(model, '/content/drive/My Drive/Colab Notebooks/blstm1.pt')

In [None]:
# Load the dev split. keep_default_na=False stops pandas from converting
# literal tokens such as "NA", "null" or "nan" into missing values, which
# would otherwise show up as blank words in the prediction files.
dev_df = pd.read_csv('dev.csv', keep_default_na=False)
dev_df.head()

In [None]:
# Pull the dev columns out as plain Python lists for inference.
dev_sentence = dev_df['word'].tolist()
dev_sentence_idx = dev_df['sentence_idx'].tolist()
# Call tolist(); without the parentheses this would hold the bound method
# instead of the list of gold tags.
dev_true_tags = dev_df['NER_tag'].tolist()

In [None]:
# Tag every dev token. infer() feeds the whole word list to the model as one
# sequence; true_tags and sentence_idxs are accepted but not used by it.
dev_pred_tags = ner.infer(sentence=dev_sentence, true_tags=dev_true_tags, sentence_idxs=dev_sentence_idx)

In [None]:
# Put the predictions in a one-column frame and attach it to the dev rows;
# concat(axis=1) lines the two frames up by index.
dev_pred_tags_df = pd.DataFrame({'pred_tag': dev_pred_tags})
new_dev_df = pd.concat([dev_df,dev_pred_tags_df], axis=1)
new_dev_df

In [None]:
# Write the dev rows with gold and predicted tags, space-separated, without
# header or index column.
new_dev_df.to_csv('/content/drive/My Drive/Colab Notebooks/dev1_eval.txt', sep = ' ', index=False, header=False)

In [None]:
# Same frame without the gold-tag column.
out_dev_df = new_dev_df.drop('NER_tag', axis=1)
out_dev_df.head()

In [None]:
# Write the dev predictions (gold tags removed), space-separated, without
# header or index column.
out_dev_df.to_csv('/content/drive/My Drive/Colab Notebooks/dev1.out', sep = ' ', index=False, header=False)

In [None]:
# Load the test split for prediction. keep_default_na=False stops pandas from
# converting literal tokens such as "NA", "null" or "nan" into missing
# values, which would otherwise show up as blank words in the output file.
test_df = pd.read_csv('test.csv', keep_default_na=False)
test_df.head()


In [None]:
# Pull the test columns out as plain Python lists for inference.
test_sentence = test_df['word'].tolist()
test_sentence_idx = test_df['sentence_idx'].tolist()

In [None]:
# Tag every test token. infer() feeds the whole word list to the model as one
# sequence; sentence_idxs is accepted but not used by it.
test_pred_tags = ner.infer(sentence=test_sentence, sentence_idxs=test_sentence_idx)

In [None]:
# Put the predictions in a one-column frame and attach it to the test rows;
# concat(axis=1) lines the two frames up by index.
test_pred_tags_df = pd.DataFrame({'pred_tag': test_pred_tags})
new_test_df = pd.concat([test_df,test_pred_tags_df], axis=1)
new_test_df.head()

In [None]:
# Write the test rows with their predicted tags, space-separated, without
# header or index column.
new_test_df.to_csv('/content/drive/My Drive/Colab Notebooks/test1.out', sep = ' ', index=False, header=False)

In [None]:
# NOTE(review): pip installs Python packages, so this does not provide the
# Perl interpreter; presumably it was meant for a Perl evaluation script.
# Confirm it is needed.
!pip install perl