In [1]:
try:
  from torchcrf import CRF
except:
  !pip install pytorch-crf
  from torchcrf import CRF
try: 
  import pytorch_lightning.metrics as metrics
except:
  !pip install pytorch_lightning 
  import pytorch_lightning.metrics as metrics

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [11]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import plotly as plt
import os

import numpy as np
import collections 
torch.manual_seed(1)

<torch._C.Generator at 0x7f4944251810>

# Working with the data
Some points to note:
- we use preprocessed files
- `DataManager` is the main tool for preprocessing everything
- set your own data directory (`my_dir_path`) before calling `get_sentences`

In [4]:
def get_sentences(file_paths: list):
    """
    Read token-per-line files and group the token rows into sentences.

    A line is split on the three-space separator. A line that yields more than one
    field is a token row; any other line (for example a blank one) ends the current
    sentence. A sentence still open when the file ends is kept as well, so a file
    without a trailing separator line no longer loses its last sentence.

    :param file_paths: paths of the files to read
    :return: list of non-empty sentences, each a list of token rows (lists of str)
    """
    sentences = []
    for file_name in file_paths:
        with open(file_name) as f:
            sentence = []
            for line in f:
                line_parts = line.strip().split("   ")
                if len(line_parts) != 1:
                    sentence.append(line_parts)
                    continue
                # Separator line: close the current sentence (empty ones are dropped).
                if sentence:
                    sentences.append(sentence)
                sentence = []
            # Flush the sentence that was still open at end of file.
            if sentence:
                sentences.append(sentence)
    return sentences

# Directory with the preprocessed corpus. os.listdir returns bare file names (no
# directory part), so `files` can only be opened with this directory as the cwd.
my_dir_path = "../input/data-mistakes"
files = [f for f in os.listdir(my_dir_path) if f.endswith(".txt")]

In [5]:
# Switch into the data directory so the bare names in `files` resolve, then parse
# every file into a list of sentences.
os.chdir("../input/data-mistakes")
data = get_sentences(files)

print(f"now we will train on {len(data)} sentences")

now we will train on 484297 sentences


In [6]:
#customized data manager
class DataManager:
    """
    Builds vocabularies from CoNLL-style sentences and converts sentences to padded id arrays.

    A sentence is a list of token rows ``[word, mistake, part_of_speech, tree]``.
    Id 0 is reserved for the padding token in every vocabulary.
    """

    def __init__(self):
        self.PAD = "<pad>"

        self.word_to_id = dict()
        self.mistakes_to_id = dict()
        self.speech_to_id = dict()
        self.tree_to_id = dict()

        self.singletons = []

        self.train = []
        self.val = []
        self.test = []

        # Lengths of the sentences kept by decode_sentences (grows on every call).
        self.lengths = []

    def _build_label_vocab(self, counter):
        """Map labels to consecutive ids (most frequent first), with PAD fixed at id 0."""
        vocab = collections.OrderedDict([(self.PAD, 0)])
        for label, _ in counter.most_common():
            if label not in vocab:
                vocab[label] = len(vocab)
        return vocab

    def _vocab_for_mode(self, mode):
        """Return the vocabulary selected by ``mode``; unknown modes fall back to part of speech."""
        if mode == "tree":
            return self.tree_to_id
        if mode == "mistakes":
            return self.mistakes_to_id
        if mode == "words":
            return self.word_to_id
        return self.speech_to_id

    def build_vocabs(self, data_train: list, embedding_path=None):
        """
        Create the word, mistake, part-of-speech and tree vocabularies.

        :param data_train: sentences as lists of ``[word, mistake, part_of_speech, tree]`` rows
        :param embedding_path: accepted for compatibility; not used by this method
        """
        word_counter = collections.Counter()
        mistakes_counter = collections.Counter()
        parts_speech_counter = collections.Counter()
        parts_tree_counter = collections.Counter()

        for sent in data_train:
            for line in sent:
                word_counter[line[0]] += 1
                mistakes_counter[line[1]] += 1
                parts_speech_counter[line[2]] += 1
                parts_tree_counter[line[3]] += 1

        # words to ids
        # NOTE(review): the "+ 1" makes word ids start at 2 and leaves id 1 unassigned,
        # so the largest id equals len(word_to_id). Kept as in the original numbering;
        # confirm whether id 1 is meant to be reserved (e.g. for unknown words).
        self.word_to_id = collections.OrderedDict([(self.PAD, 0)])
        for word, _ in word_counter.most_common():
            if word not in self.word_to_id:
                self.word_to_id[word] = len(self.word_to_id) + 1

        # words that occur exactly once
        self.singletons = [word for word in word_counter if word_counter[word] == 1]

        self.mistakes_to_id = self._build_label_vocab(mistakes_counter)
        self.speech_to_id = self._build_label_vocab(parts_speech_counter)
        # Fixed: membership was checked against speech_to_id, so a tree label that is
        # also a part-of-speech tag never received an id.
        self.tree_to_id = self._build_label_vocab(parts_tree_counter)

    def decode_sentences(self, sentences: list, max_sent_length=20):
        """
        Convert sentences to fixed-width id arrays.

        Sentences with ``len(sent) >= max_sent_length`` are skipped. For each kept
        sentence a float array of shape ``(3, max_sent_length)`` is produced
        (row 0: word ids, row 1: part-of-speech ids, row 2: tree ids) together with a
        label array of mistake ids; unused positions hold the PAD id.

        :param sentences: raw data from get_sentences
        :param max_sent_length: width of the output arrays, default 20
        :return: (decoded_sents, decoded_labels)
        """
        decoded_sents = []
        decoded_labels = []
        pad_id = self.mistakes_to_id[self.PAD]

        for sent in sentences:
            # we do not take too long sentences
            if len(sent) >= max_sent_length:
                continue
            self.lengths.append(len(sent))

            decoded_sent = np.zeros((3, max_sent_length))
            decoded_label = np.zeros(max_sent_length)
            decoded_sent[:] = pad_id
            decoded_label[:] = pad_id

            for position, word_line in enumerate(sent):
                decoded_sent[0][position] = self.word_to_id[word_line[0]]
                decoded_sent[1][position] = self.speech_to_id[word_line[2]]
                decoded_sent[2][position] = self.tree_to_id[word_line[3]]
                decoded_label[position] = self.mistakes_to_id[word_line[1]]

            decoded_sents.append(decoded_sent)
            decoded_labels.append(decoded_label)

        return decoded_sents, decoded_labels

    # is not used in a new version
    def get_batch(self, sentences: list, labels: list, batch_size: int):
        """
        Shuffle the samples and split them into batches.

        :param sentences: numeric samples
        :param labels: numeric labels aligned with ``sentences``
        :param batch_size: the size of batch (the last batch may be smaller)
        :return: list of batches, each a list of ``(sample, label)`` pairs
        """
        n_samples = len(sentences)

        # Convert once instead of once per batch.
        samples = np.array(sentences, "int_")
        targets = np.array(labels, "int_")

        # Shuffle at the start of epoch
        indices = np.arange(n_samples)
        np.random.shuffle(indices)

        batches = []
        for start in range(0, n_samples, batch_size):
            batch_idx = indices[start:min(start + batch_size, n_samples)]
            batches.append(list(zip(samples[batch_idx], targets[batch_idx])))
        return batches

    def nextBatch(self, X, y, start_index, batch_size=None):
        """
        Return the batch that starts at ``start_index``.

        When fewer than ``batch_size`` samples remain, the batch is filled up with
        randomly drawn samples so that it always has ``batch_size`` rows.

        :param X: samples
        :param y: labels aligned with ``X``
        :param start_index: index of the first sample of the batch
        :param batch_size: batch size; defaults to the global ``config["batch_size"]``
        :return: (X_batch, y_batch) as numpy arrays
        """
        if batch_size is None:
            batch_size = config["batch_size"]

        last_index = start_index + batch_size
        stop = min(last_index, len(X))
        X_batch = list(X[start_index:stop])
        y_batch = list(y[start_index:stop])

        if last_index > len(X):
            for _ in range(last_index - len(X)):
                index = np.random.randint(len(X))
                X_batch.append(X[index])
                y_batch.append(y[index])

        return np.array(X_batch), np.array(y_batch)

    def get_train_data(self, data: list, train_size=0.8, val_size=0.05):
        """
        Decode the sentences and split them, in order, into train / validation / test.

        :param data: sentences with labels, as returned by get_sentences
        :param train_size: fraction of the decoded samples used for training
        :param val_size: fraction used for validation; the remainder is the test set
        :return: X_train, y_train, X_val, y_val, X_test, y_test
        """
        X, y = self.decode_sentences(data)
        num_samples = len(X)
        train_end = int(num_samples * train_size)
        val_end = int(num_samples * (train_size + val_size))

        X_train, y_train = X[:train_end], y[:train_end]
        X_val, y_val = X[train_end:val_end], y[train_end:val_end]
        X_test, y_test = X[val_end:], y[val_end:]

        return X_train, y_train, X_val, y_val, X_test, y_test

    def encode_one_sent(self, sent, mode="tokens"):
        """Map the items of ``sent`` to ids using the vocabulary selected by ``mode``."""
        encoder = self._vocab_for_mode(mode)
        return [encoder[w] for w in sent]

    def decode_one_sent(self, sent, mode="tokens"):
        """Map the ids in ``sent`` back to items using the vocabulary selected by ``mode``."""
        id_to_item = {idx: item for item, idx in self._vocab_for_mode(mode).items()}
        return [id_to_item[w] for w in sent]

In [7]:
# Build the vocabularies from the whole corpus, then decode and split it into
# train / validation / test sets (default split fractions of get_train_data).
manager = DataManager()
manager.build_vocabs(data)

x_train, y_train, x_val, y_val, x_test, y_test = manager.get_train_data(data)
assert len(x_train) == len(y_train)
print(f"The size of training set is {len(x_train)}, the size of validation set is {len(x_val)}, the size of testing set is {len(x_test)}")

The size of training set is 322653, the size of validation set is 20166, the size of testing set is 60498


In [8]:
# Keep only row 1 of every decoded sample: the row that DataManager.decode_sentences
# fills with part-of-speech ids. Word ids (row 0) and tree ids (row 2) are not used.
X_train = [i[1] for i in x_train]
X_val = [i[1] for i in x_val]
X_test = [i[1] for i in x_test]

In [9]:
os.chdir("/kaggle/working")  # the notebook was run on Kaggle, so switch to its working directory for development

**Here you can choose what to do: detection or classification.** Both of the next two cells assign `Y_train`, `Y_val` and `Y_test`, so run only the one you need.

In [13]:
def make_detection_errors(data):
    """
    Collapse every label id greater than 1 to the single id 2.

    The input is left untouched: each sentence is copied before it is modified
    (the previous version aliased ``data`` and overwrote the caller's labels).

    :param data: iterable of per-sentence label sequences (lists or numpy arrays)
    :return: list of numpy arrays, one per sentence, with ids > 1 replaced by 2
    """
    collapsed = []
    for sent in data:
        labels = np.array(sent, copy=True)  # copy=True: never write into the caller's array
        labels[labels > 1] = 2
        collapsed.append(labels)
    return collapsed

# Token-level targets: one label per token, with every label id above 1 collapsed to 2.
Y_train = make_detection_errors(y_train)
Y_val = make_detection_errors(y_val)
Y_test = make_detection_errors(y_test)

In [10]:
def make_labels_sents(data):
    """
    Build one label per sentence: 1 if any token label id exceeds 1, otherwise 0.

    :param data: sequence of per-sentence label sequences
    :return: float numpy array of length ``len(data)`` holding 0.0 / 1.0
    """
    sentence_flags = np.zeros(len(data))

    for position, token_labels in enumerate(data):
        if any(label > 1 for label in token_labels):
            sentence_flags[position] = 1
    return sentence_flags

# Sentence-level targets: one 0/1 value per sentence. These assignments rebind
# Y_train / Y_val / Y_test, replacing whatever those names held before.
Y_train = make_labels_sents(y_train)
Y_val = make_labels_sents(y_val)
Y_test = make_labels_sents(y_test)

# Start working with Model

In [12]:
# Hyper-parameters and model settings shared by the cells below.
config = {
    "epoch": 20,
    "hidden_size": 128,
    # Fixed: the previous `3*np.e - 5` used Euler's number and evaluated to ~3.15.
    "lr": 3e-5,
    "n_layers": 4,
    "length": 20,
    "batch_size": 32,
    "optimizer": "Adam",
    "vocab_size": 31,
    "early_stop": True,
    "binary": True,
    "pad_idx": 0,
    "ignore_index": 0,
    "output_dim": 3
}

# The model input is the part-of-speech id sequence, so the vocabulary size is the
# number of entries in that vocabulary (this replaces the placeholder set in `config`).
config["vocab_size"] = len(manager.speech_to_id)

# Number of full batches per epoch; floor division drops a trailing partial batch.
iterations = len(X_train) // config["batch_size"]
val_iterations = len(X_val) // config["batch_size"]

In [17]:
class ModelDetectionBinary(nn.Module):
    """
    Token-level tagger: embedding -> bidirectional LSTM -> linear layer over every time step.

    ``forward`` returns raw (unnormalised) class scores. The notebook trains this model
    with ``nn.CrossEntropyLoss``, which applies log-softmax itself, so the previous
    ``torch.sigmoid`` on the scores squeezed them into (0, 1) before the loss.
    The arg-max over classes is unchanged by removing it.
    """

    def __init__(self, config, manager):
        """
        :param config: dict with "hidden_size", "vocab_size", "n_layers", "lr", "pad_idx"
                       and optionally "output_dim" (number of classes, default 3)
        :param manager: object exposing ``mistakes_to_id``
        """
        # Initialise nn.Module first so submodule registration is set up before any assignment.
        super().__init__()

        self.embedding_dim = 200
        self.hidden_dim = config["hidden_size"]
        self.vocab_size = config["vocab_size"]
        self.num_layers = config["n_layers"]
        self.tag_to_ix = manager.mistakes_to_id
        self.learning_rate = config["lr"]
        self.output_dim = config.get("output_dim", 3)

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=config["pad_idx"])

        self.lstm = nn.LSTM(self.embedding_dim,
                            self.hidden_dim,
                            num_layers=self.num_layers,
                            bidirectional=True,
                            batch_first=True)

        # hidden_dim * 2: forward and backward LSTM states are concatenated.
        self.linear_1 = nn.Linear(self.hidden_dim * 2, self.output_dim)

    def forward(self, text):
        """
        :param text: integer id tensor fed to the embedding layer (batch first)
        :return: class scores with one vector of size ``output_dim`` per input position
        """
        embedded = self.embedding(text)
        outputs, _ = self.lstm(embedded)
        return self.linear_1(outputs)

In [None]:
class SimpleModel(nn.Module):
    """
    Sentence-level classifier: a two-layer bidirectional LSTM over the input features,
    followed by a linear layer and a sigmoid on the final hidden states.
    """

    def __init__(self, config, manager):
        """
        :param config: dict with "hidden_size", "vocab_size", "n_layers", "lr", "output_dim"
        :param manager: object exposing ``mistakes_to_id``
        """
        super().__init__()

        self.embedding_dim = 200
        self.hidden_dim = config["hidden_size"]
        self.vocab_size = config["vocab_size"]
        self.num_layers = config["n_layers"]
        self.tag_to_ix = manager.mistakes_to_id
        self.learning_rate = config["lr"]
        self.output_dim = config["output_dim"]

        # The LSTM depth is fixed at 2 here; `self.num_layers` is stored but not used for it.
        self.lstm = nn.LSTM(
            input_size=self.vocab_size,
            hidden_size=self.hidden_dim,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
        )

        self.linear_1 = nn.Linear(2 * self.hidden_dim, 1)

    def forward(self, text):
        """
        :param text: float tensor passed straight to the LSTM (no embedding layer)
        :return: tensor with one sigmoid score per batch element, shape (batch, 1)
        """
        _, (final_hidden, _) = self.lstm(text)
        # Last two entries of the hidden state: the two directions of the top layer.
        # joined = [batch size, hid dim * num directions]
        joined = torch.cat([final_hidden[-2, :, :], final_hidden[-1, :, :]], dim=1)
        return torch.sigmoid(self.linear_1(joined))

In [15]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def init_weights(model):
    """Re-initialise every parameter of ``model`` in place from a normal distribution (mean 0, std 0.1)."""
    for param in model.parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)

In [18]:
# Build the token-level model; Module.apply calls init_weights on every submodule
# and on the model itself.
model_detection = ModelDetectionBinary(config, manager)
model_detection.apply(init_weights)

# NOTE(review): `optim` rebinds the name imported earlier as `import torch.optim as optim`;
# after this cell it refers to the Adam instance, not the module.
optim = torch.optim.Adam(model_detection.parameters(), lr=config["lr"])
# Targets equal to config["ignore_index"] do not contribute to the loss.
crit = nn.CrossEntropyLoss(ignore_index=config["ignore_index"])

In [19]:
def train_detection(model, optimizer, criterion, x_train, y_train, x_val, y_val):
    """
    Train the token-level model with per-epoch validation and optional early stopping.

    Reads the notebook globals ``config`` ("batch_size", "early_stop"), ``manager``
    (``nextBatch``), ``N_EPOCHS`` and ``count_metrics``.

    Fixes over the previous version:
    - the best-F1 / early-stopping check runs after every epoch (it sat outside the loop);
    - early stopping returns the same four lists as a normal finish
      (it used to reference the undefined name ``batch_losses``);
    - samples and labels are shuffled with the same permutation from the original
      arrays every epoch (labels used to be re-shuffled cumulatively and drifted out
      of alignment with the samples from the second epoch on);
    - validation runs in eval mode without gradient tracking;
    - tensors are placed on the model's own device instead of a hard-coded ``.cuda()``.

    :param model: module mapping an id tensor to per-token class scores
    :param optimizer: optimizer over the model parameters
    :param criterion: loss called as ``criterion(scores_2d, targets_1d)``
    :param x_train: training samples
    :param y_train: training labels aligned with ``x_train``
    :param x_val: validation samples
    :param y_val: validation labels aligned with ``x_val``
    :return: train_losses, val_losses, train_metrics, val_metrics
    :raises ValueError: if the training set holds fewer samples than one batch
    """
    best_f1_val, unprogressed = 0, 0
    best_epoch = None
    train_losses, train_metrics = [], []
    val_losses, val_metrics = [], []
    patient = 5

    batch_size = config["batch_size"]
    n_train_batches = len(x_train) // batch_size
    n_val_batches = len(x_val) // batch_size
    if n_train_batches == 0:
        raise ValueError("x_train holds fewer samples than one batch")

    device = next(model.parameters()).device

    # Converted once; each epoch indexes these unshuffled arrays with a fresh permutation.
    x_train_all = np.array(x_train)
    y_train_all = np.array(y_train)

    def _f1_and_precision(scores):
        # count_metrics returns one value per configured metric; precision is optional.
        scores = list(scores)
        return scores[0], (scores[1] if len(scores) > 1 else None)

    for epoch in range(N_EPOCHS):
        model.train()
        print(f"Current epoch is {epoch}")
        start_time_epoch = time.time()

        # before every epoch shuffle the data
        sh_index = np.arange(len(x_train_all))
        np.random.shuffle(sh_index)
        x_epoch = x_train_all[sh_index]
        y_epoch = y_train_all[sh_index]

        for iteration in range(n_train_batches):
            sentence, tags = manager.nextBatch(x_epoch, y_epoch, start_index=iteration * batch_size)
            sentence = torch.tensor(sentence, dtype=torch.long).to(device)
            tags = torch.tensor(tags, dtype=torch.long).view(-1).to(device)

            optimizer.zero_grad()

            predictions = model(sentence)
            # Flatten to (tokens, classes) to match the flattened targets.
            predictions = predictions.view(-1, predictions.shape[-1])
            loss = criterion(predictions, tags)

            loss.backward()
            optimizer.step()

        # Loss and metrics of the last training batch of the epoch.
        train_losses.append(loss.detach())
        f1, precision = _f1_and_precision(count_metrics(predictions.detach(), tags))
        train_metrics.append(f1)
        print(f'Train Epoch: {epoch} Loss: {loss.data} Precision {precision}, f1 {f1}')
        print(loss.detach())
        print(f"Iime consumed is {-(start_time_epoch - time.time())} \n")

        torch.cuda.empty_cache()

        # validation pool
        val_time = time.time()
        model.eval()
        epoch_val_f1 = []
        loss_val = None
        with torch.no_grad():
            for iterat in range(n_val_batches):
                sentence_val, tags_val = manager.nextBatch(x_val, y_val, start_index=iterat * batch_size)
                sentence_val = torch.tensor(sentence_val, dtype=torch.long).to(device)
                tags_val = torch.tensor(tags_val, dtype=torch.long).view(-1).to(device)

                predictions_val = model(sentence_val)
                predictions_val = predictions_val.view(-1, predictions_val.shape[-1])

                loss_val = criterion(predictions_val, tags_val)
                f1, precision = _f1_and_precision(count_metrics(predictions_val, tags_val))
                epoch_val_f1.append(float(f1))  # check f1 only
                val_metrics.append(float(f1))
                val_losses.append(loss_val.detach())
        if loss_val is not None:
            print(loss_val.detach())
        print(f"validation epoch {epoch} took {-(val_time- time.time())} seconds and f1 is {f1}")

        # Compare this epoch's mean validation F1 with the best seen so far.
        epoch_f1 = float(np.mean(epoch_val_f1)) if epoch_val_f1 else 0.0
        if epoch_f1 > best_f1_val:
            unprogressed = 0
            best_f1_val = epoch_f1
            best_epoch = epoch
            print(f"saved the new best model with f1: {best_f1_val} at epoch {epoch}")
        else:
            unprogressed += 1

        if config["early_stop"] and unprogressed >= patient:
            print(f"early stopped, no progress obtained within {patient} epochs")
            print(f"overall best f1 is {best_f1_val} at {best_epoch} epoch")
            break

    return train_losses, val_losses, train_metrics, val_metrics

In [None]:
def train(model, optimizer, criterion, x_train, y_train, n_epochs=10):
    """
    Train the sentence-level model on one-hot encoded id sequences.

    Reads the notebook globals ``config`` ("batch_size", "vocab_size") and ``manager``
    (``nextBatch``).

    Fixes over the previous version:
    - samples and labels are shuffled with the same permutation from the original
      arrays every epoch (labels used to be re-shuffled cumulatively and drifted out
      of alignment with the samples from the second epoch on);
    - tensors are placed on the model's own device instead of a hard-coded ``.cuda()``;
    - the number of epochs is a parameter (default 10, the former constant).

    :param model: module mapping a one-hot float tensor to one score per sample
    :param optimizer: optimizer over the model parameters
    :param criterion: loss called as ``criterion(predictions, targets)``
    :param x_train: training samples (id sequences)
    :param y_train: one label per sample
    :param n_epochs: number of passes over the training data
    :return: list with the loss of the last batch of every epoch
    :raises ValueError: if the training set holds fewer samples than one batch
    """
    batch_size = config["batch_size"]
    n_batches = len(x_train) // batch_size
    if n_batches == 0:
        raise ValueError("x_train holds fewer samples than one batch")

    device = next(model.parameters()).device

    # Converted once; each epoch indexes these unshuffled arrays with a fresh permutation.
    x_all = np.array(x_train)
    y_all = np.array(y_train)

    train_losses = []
    model.train()  # make sure the model is in training mode before back-propagating

    for epoch in range(n_epochs):
        print(f"Current epoch is {epoch}")
        start_time_epoch = time.time()

        # before every epoch shuffle the data
        sh_index = np.arange(len(x_all))
        np.random.shuffle(sh_index)
        x_epoch = x_all[sh_index]
        y_epoch = y_all[sh_index]

        for iteration in range(n_batches):
            sentence, tags = manager.nextBatch(x_epoch, y_epoch, start_index=iteration * batch_size)
            sentence = torch.tensor(sentence, dtype=torch.long)
            sentence = F.one_hot(sentence, num_classes=config["vocab_size"]).type(torch.float).to(device)
            tags = torch.tensor(tags, dtype=torch.float).to(device)

            optimizer.zero_grad()

            # Drop only the trailing singleton dimension so the batch dimension survives.
            predictions = model(sentence).squeeze(-1)

            loss = criterion(predictions, tags)

            loss.backward()
            optimizer.step()

        train_losses.append(loss.detach())
        print(f'Train Epoch: {epoch} Loss: {loss.data}')
        print(f"Iime consumed is {-(start_time_epoch - time.time())} \n")
        print(predictions)
    return train_losses

In [20]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [21]:
# Move the model and the loss module to the selected device.
model_det = model_detection.to(device)
criterion = crit.to(device)

In [22]:
def count_metrics(y_pred, y_true):
  """
  Evaluate every metric registered in the global ``metrics_kinds`` on one batch.

  Each metric is moved to the global ``device``, called on ``(y_pred, y_true)``, and
  its value is appended both to that metric's history list in ``metrics_kinds`` and
  to the returned list.

  :return: list with one value per registered metric, in dictionary order
  """
  batch_values = []
  for metric_fn, _ in metrics_kinds.items():
    on_device = metric_fn.to(device)
    score = on_device(y_pred, y_true)
    metrics_kinds[on_device].append(score)
    batch_values.append(score)
  return batch_values

In [24]:
# Number of classes passed to the metric constructor below.
config["target_size"] = 3

In [25]:
# Registry used by count_metrics: metric object -> list of the values it produced.
# NOTE(review): `pytorch_lightning.metrics` is, as far as I know, absent from recent
# pytorch_lightning releases (moved to the separate `torchmetrics` package) - confirm
# against the installed version.
metrics_kinds = {
          metrics.F1(num_classes = config["target_size"]) : []}

In [27]:
# train_detection reads N_EPOCHS as a global, so it must be set before the call.
N_EPOCHS = config["epoch"]
train_losses, val_losses, train_metrics, val_metrics = train_detection(model_det, optim, criterion, X_train, Y_train, X_val, Y_val)

Current epoch is 0


KeyboardInterrupt: 

In [28]:
def save_model(model, path = "model.pt"):
  """Write the model's ``state_dict`` to ``path`` with ``torch.save``."""
  weights = model.state_dict()
  torch.save(weights, path)

In [None]:
def load_model(model_class, config, manager, path="model.pt"):
  """
  Build ``model_class(config, manager)``, load the weights stored at ``path`` and
  switch the model to eval mode.

  The checkpoint is mapped to the CPU while loading, so weights saved from a GPU can
  be restored on a machine without one; ``load_state_dict`` then copies the values
  into the freshly built model's parameters.

  :param model_class: callable taking ``(config, manager)`` and returning a module
  :param config: configuration passed to ``model_class``
  :param manager: data manager passed to ``model_class``
  :param path: file written by ``torch.save(model.state_dict(), path)``
  :return: the model in eval mode
  """
  model = model_class(config, manager)
  state = torch.load(path, map_location="cpu")
  model.load_state_dict(state)
  model.eval()
  # Previously placed after `return`, where it could never run.
  print("success")
  return model

In [None]:
!pip install plotly
import plotly.graph_objects as go

In [None]:
def plot_results(result:list, mode="training"):
  """
  Plot a sequence of scalar values (losses or metrics) against their index.

  Each entry is converted with ``float()``, so plain numbers and one-element torch
  tensors are both accepted. The training functions in this notebook append
  ``loss.detach()`` tensors, which ``np.array`` cannot convert while they live on a GPU.

  :param result: sequence of scalars or one-element tensors
  :param mode: legend name of the trace
  """
  y = np.array([float(value) for value in result])
  x = np.arange(len(y))

  fig = go.Figure()

  fig.add_trace(go.Scatter(x=x, y=y, name=mode,
                    line_shape='linear'))
  fig.show()