<a href="https://colab.research.google.com/github/Igor-Tukh/nlproc-hse/blob/master/hw04_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
! pip install stanfordnlp



In [0]:
import os
import stanfordnlp
import nltk.data
import re
import pandas as pd
import numpy as np
import dill
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

In [0]:
from tqdm import tqdm_notebook as tqdm
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestRegressor
from nltk.stem.snowball import SnowballStemmer
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

In [13]:
stanfordnlp.download('ru', force=True)

Using the default treebank "ru_syntagrus" for language "ru".
Would you like to download the models for: ru_syntagrus now? (Y/n)

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.

Downloading models for: ru_syntagrus
Download location: /root/stanfordnlp_resources/ru_syntagrus_models.zip


100%|██████████| 236M/236M [00:03<00:00, 58.1MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/ru_syntagrus_models.zip
Extracting models file for: ru_syntagrus
Cleaning up...Done.


In [89]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
root_path = 'gdrive/My Drive/nlp/'

RESOURCES_PATH = 'gdrive/My Drive/nlp/resources'
TRAIN_DATASET_PATH = os.path.join(RESOURCES_PATH, 'train_qa.csv')
INPUT_FILE = os.path.join(RESOURCES_PATH, 'test_in.csv')
OUTPUT_FILE = os.path.join(RESOURCES_PATH, 'output.txt')

In [0]:
def load_train_dataset():
    data = pd.read_csv(TRAIN_DATASET_PATH) #, sep='\t')
    return data

In [92]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
def find_sentence(text, answer):
  ind = text.find(answer)
  if ind == -1:
    return None
  sentences = nltk.sent_tokenize(text)
  cur_len = 0
  for sentence in sentences:
    if cur_len + len(sentence) > ind - 100:
      word_ind = sentence.find(answer)
      # if (answer == 'в литве') and word_ind != -1:
      #   print(sentence, word_ind)
      if word_ind != -1:
        return sentence, word_ind
    cur_len += len(sentence) + 1
  return None

In [149]:
find_sentence('Привет. Мы с тобой не виделись сто лет.', 'не виделись')

('Мы с тобой не виделись сто лет.', 11)

In [0]:
def preapare_train_dataset():
  data = load_train_dataset()
  dataset = []
  for ind, (q, a, t) in enumerate(zip(data['question'], data['answer'], data['paragraph'])):
    a = a.lower()
    t = t.lower()
    q = q.lower()

    if a[-3:] == '...':
      a = a[:-3]
    if a[-1:] == '.' or a[-1:] == '?':
      a = a[:-1]
    
    answer_pos = find_sentence(t, a)
    if answer_pos is not None:
      q = re.split('(\W)', q)
      s = re.split('(\W)', answer_pos[0])
      dataset.append((q, (answer_pos[1], answer_pos[1] + len(a)), s, ind))
  return np.array(dataset)

In [202]:
data = preapare_train_dataset()

в 1926 году в литве произошёл военный переворот, возглавивший его лидер партии таутининков (от литовского tauta — народ) антанас сметона установил авторитарный режим. 12


In [203]:
print(f'Dataset shape is {data.shape}')

Dataset shape is (49908, 4)


In [0]:
def to_inds(t, q, stemmer, stemmed_dict):
  seq = []

  def process_words(words):
    for word in words:
      stemmed_word = stemmer.stem(word)
      if stemmed_word not in stemmed_dict:
          stemmed_dict[stemmed_word] = len(stemmed_dict)
      seq.append(stemmed_dict[stemmed_word] + 2)
  
  process_words(t)
  seq.append(0)  # sep
  process_words(q)

  return seq

In [0]:
stemmer = SnowballStemmer('russian')

In [206]:
stemmed_dict = {}

dataset = []
for datapoint in tqdm(data):
  dataset.append(torch.tensor(to_inds(datapoint[2], datapoint[0], stemmer, stemmed_dict)))

HBox(children=(IntProgress(value=0, max=49908), HTML(value='')))




In [0]:
datasetp = pad_sequence(dataset, padding_value=1, batch_first=True)

In [0]:
def get_ys(t, a):
  cur_len = 0
  ys = torch.zeros(2, requires_grad=True)
  for ind, word in enumerate(t):
    if len(word) == 0:
      continue
    if cur_len == a[0]:
      ys[0] = ind
    if cur_len == a[1]:
      ys[1] = (ind - 1)
    cur_len += len(word)
  if cur_len == a[1]:
    ys[1] = len(t) - 1
  # if len(ys) != 2:
  #   cur_len = 0
  #   print(a)
  #   for ind, word in enumerate(t):
  #     print(cur_len, word)
  #     cur_len += len(word)
  # assert len(ys) == 2
  return ys

In [0]:
y = []

for ind, datapoint in enumerate(data):
  y.append(get_ys(datapoint[2], datapoint[1]))

In [0]:
class LSTM(nn.Module):
  def __init__(self, data_shape):
    super(LSTM, self).__init__()
    self.word_embeddings = nn.Embedding(data_shape, 64)
    self.lstm = nn.LSTM(64, 64, bidirectional=True, batch_first=True)
    self.fc1 = nn.Linear(128, 2)
    self.softmax = nn.LogSoftmax(dim=2)
    
  def forward(self, x):
    x = self.word_embeddings(x)
    x, _ = self.lstm(x)
    y = self.fc1(x)
    y = torch.transpose(y, 1, 2)
    y = self.softmax(y)
    return y

In [0]:
model = LSTM(len(stemmed_dict) + 2)
optimizer = torch.optim.Adam(model.parameters())

In [0]:
def plot_losses(train_losses, test_losses):
  plt.title('Losses')
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  epoches = range(0, len(train_losses))
  plt.plot(epoches, train_losses, label='train loss')
  plt.plot(epoches, test_losses, label='test loss')
  plt.legend()
  plt.show()

In [0]:
def train_model(model, optimizer, lossf, train_batches, test_batches, epoches_number):
  train_losses = []
  test_losses = []

  for epoch in tqdm(range(epoches_number)):
    losses = []
    for batch in train_batches:
      x, y = batch[0], batch[1]
      optimizer.zero_grad()
      output = model(x.long())
      loss = lossf(output, y)
      loss.backward()
      losses.append(loss.detach().numpy())
    train_losses.append(np.mean(np.array(losses)))

    with torch.no_grad():
      losses = []
      for batch in test_batches:
        x, y = batch[0], batch[1]
        optimizer.zero_grad()
        output = model(x.long())
        loss = lossf(output, y)
        losses.append(loss.detach().numpy())  
      test_losses.append(np.mean(np.array(losses)))

    if epoch % 2 == 0 or epoch == epoches_number - 1:
      print(f'Epoch: {epoch}, train loss: {train_losses[-1]}, test loss: {test_losses[-1]}')

  plot_losses(train_losses, test_losses)

In [0]:
final_dataset = list(zip(datasetp, y))

In [0]:
train, test = train_test_split(final_dataset, test_size=0.15)

In [0]:
bath_size = 64
epoches_number = 8

In [0]:
train_batches = torch.utils.data.DataLoader(train, batch_size=bath_size)
test_batches = torch.utils.data.DataLoader(test, batch_size=bath_size)

In [0]:
base_loss = nn.NLLLoss()

def get_loss(output, y):
  y0, y1 = y[:, 0].reshape(-1), y[:, 1].reshape(-1)
  loss0 = base_loss(output[:, 0], y0.long())
  loss1 = base_loss(output[:, 1], y1.long())
  return loss0 + loss1

In [0]:
train_model(model, optimizer, get_loss, train_batches, test_batches, epoches_number)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

Epoch: 0, train loss: 13.255812644958496, test loss: 13.253984451293945
Epoch: 2, train loss: 13.255812644958496, test loss: 13.253984451293945


In [0]:
def run_task(model):
    with open(OUTPUT_FILE, 'w') as output_file:
        test_data = pd.read_csv(INPUT_FILE, sep='\t')
        for quid, pid, q, p in tqdm(zip(test_data['question_id'], test_data['paragraph_id'], test_data['question'], test_data['paragraph'])):
            candidates, rq, rs, s = generate_candidates(p, q)
            X_test = [build_features(p, q, c, rq=rq, rs=rs, candidate_sentence=s) for c in candidates]
            X_test = discretaze_X(X_test, train=False)
            y_test = model.predict(X_test)
            y_max = None
            ans = None
            for ind in range(len(candidates)):
                if y_max is None or y_test[ind] > y_max:
                    y_max = y_test[ind]
                    ans = candidates[ind]
            output_file.write(f'{quid}\t{ans}\n')