In [1]:
!python -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 6.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
import os
import math
import time
from datetime import timedelta
import spacy
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import spacy

import torchvision
from torchvision.transforms import ToTensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd "/content/drive/My Drive/Personal/Projects/DepressionCounselorBot"

/content/drive/My Drive/Personal/Projects/DepressionCounselorBot


In [None]:
spacy_eng = spacy.load("en")

In [None]:
class VocabularyEnglish:
    def __init__ (self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK >"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold

    def __len__ (self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
      return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary (self, sentence_list):
        idx = 4
        frequencies = {}

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1

                    if frequencies[word] == self.freq_threshold:
                        self.stoi[word] = idx
                        self.itos[idx] = word
                        idx += 1

    def numericalize (self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"] for token in tokenized_text    
        ]

In [None]:
class MentalHealthDataset(Dataset):
    def __init__(self, data_file_dir, transforms=None, freq_threshold=2):
        self.submission_comment_pairs = pd.read_csv(data_file_dir, lineterminator='\n')

        self.submissions = self.submission_comment_pairs["Submission"]
        self.comments = self.submission_comment_pairs["Comment"]
        self.full_pair = self.submissions + " " + self.comments

        self.vocab_en = VocabularyEnglish(freq_threshold)
        self.vocab_en.build_vocabulary(self.full_pair.tolist())

        self.transforms = transforms

    def __len__ (self):
        return len(self.submission_comment_pairs)

    def __getitem__ (self, idx):
        submission = self.submissions[idx]
        comment = self.comments[idx]

        if self.transforms is not None:
            submission = self.transforms(submission)
            comment = self.transforms(comment)

        numericalized_submission = [self.vocab_en.stoi["<SOS>"]]
        numericalized_submission += self.vocab_en.numericalize(submission)
        numericalized_submission.append(self.vocab_en.stoi["<EOS>"])

        numericalized_comment = [self.vocab_en.stoi["<SOS>"]]
        numericalized_comment += self.vocab_en.numericalize(comment)
        numericalized_comment.append(self.vocab_en.stoi["<EOS>"])

        return torch.tensor(numericalized_submission), torch.tensor(numericalized_comment)

In [None]:
class DataCollate:
    def __init__ (self, pad_idx):
      self.pad_idx = pad_idx

    def __call__ (self, batch):
      submissions = [item[0] for item in batch]
      submissions = pad_sequence(submissions, batch_first=False, padding_value=self.pad_idx)

      comments = [item[1] for item in batch]
      comments = pad_sequence(comments, batch_first=False, padding_value=self.pad_idx)

      return submissions, comments

In [None]:
dataset = MentalHealthDataset("./clean_submission_comment_score_pairs_depression_help.csv", transforms=None)

In [None]:
batch_size = 32 # 128
src_vocab_size = len(dataset.vocab_en)
trg_vocab_size = src_vocab_size
src_pad_idx = dataset.vocab_en.stoi["<PAD>"]

num_workers = 8
pin_memory = True

In [None]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True, pin_memory=pin_memory, collate_fn=DataCollate(pad_idx=src_pad_idx))
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True, pin_memory=pin_memory, collate_fn=DataCollate(pad_idx=src_pad_idx))

  cpuset_checked))


In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.scale = nn.Parameter(torch.ones(1))

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(
            0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.scale * self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
class Transformer(nn.Module):
    def __init__(self, embedding_size, src_vocab_size, trg_vocab_size, src_pad_idx, num_heads, num_encoder_layers, num_decoder_layers, forward_expansion, dropout, max_len, device):
        super(Transformer, self).__init__()

        self.embedding_size = embedding_size

        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)

        self.src_positional_encoding = PositionalEncoding(embedding_size, dropout, max_len)
        self.trg_positional_encoding = PositionalEncoding(embedding_size, dropout, max_len)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        src_embedding = self.src_positional_encoding(self.src_word_embedding(src))
        trg_embedding = self.trg_positional_encoding(self.trg_word_embedding(trg))

        src_padding_mask = self.make_src_mask(src)
        trg_padding_mask = self.make_src_mask(trg)
        trg_no_peak_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length)

        out = self.transformer(
            src_embedding,
            trg_embedding,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=trg_padding_mask,
            tgt_mask=trg_no_peak_mask,
        )
        
        out = self.fc(out)
        
        return out

In [None]:
def save_checkpoint(state, filename="checkpoint.pth.tar"):
    print("Saving checkpoint")
    torch.save(state, filename)

def load_checkpoint(checkpoint, model, optimizer):
    print("Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

In [None]:
def generate_comment(model, sentence, device, max_length=50):

    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_eng(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens.insert(0, "<SOS>")
    tokens.append("<EOS>")

    text_to_indices = [dataset.vocab_en.stoi[token] for token in tokens]

    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    outputs = [dataset.vocab_en.stoi["<SOS>"]]

    for i in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == dataset.vocab_en.stoi["<EOS>"]:
            break

    translated_sentence = [dataset.vocab_en.itos[idx] for idx in outputs]

    return translated_sentence[1:]


In [None]:
# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

load_model = False
save_model = True

# Training hyperparameters
num_epochs = 10000
learning_rate = 3e-4

# Model hyperparameters
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 150
forward_expansion = 4



model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

pad_idx = dataset.vocab_en.stoi["<PAD>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
# if load_model:
#     load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

In [None]:
def epoch_train():
  model.train()
  losses = []

  for batch_idx, (inp_data, target) in enumerate(train_dataloader):

    # Get input and targets and get to cuda
    inp_data = inp_data.to(device)
    target = target.to(device)

    # Forward prop
    output = model(inp_data, target[:-1, :])

    # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
    # doesn't take input in that form. For example if we have MNIST we want to have
    # output to be: (N, 10) and targets just (N). Here we can view it in a similar
    # way that we have output_words * batch_size that we want to send in into
    # our cost function, so we need to do some reshapin.
    # Let's also remove the start token while we're at it
    output = output.reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()

    loss = criterion(output, target)
    losses.append(loss.item())

    # Back prop
    loss.backward()
    # Clip to avoid exploding gradient issues, makes sure grads are
    # within a healthy range
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    # Gradient descent step
    optimizer.step()

    # plot to tensorboard
    # writer.add_scalar("Training loss", loss, global_step=step)
    # step += 1
  return sum(losses) / len(losses)

In [None]:
def epoch_test():
  model.eval()
  losses = []

  for batch_idx, (inp_data, target) in enumerate(test_dataloader):
    with torch.no_grad():

      # Get input and targets and get to cuda
      inp_data = inp_data.to(device)
      target = target.to(device)

      # Forward prop
      output = model(inp_data, target[:-1, :])

      # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
      # doesn't take input in that form. For example if we have MNIST we want to have
      # output to be: (N, 10) and targets just (N). Here we can view it in a similar
      # way that we have output_words * batch_size that we want to send in into
      # our cost function, so we need to do some reshapin.
      # Let's also remove the start token while we're at it
      output = output.reshape(-1, output.shape[2])
      target = target[1:].reshape(-1)

      loss = criterion(output, target)
      losses.append(loss.item())


  return sum(losses) / len(losses)

In [None]:
# train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
#     (train_data, valid_data, test_data),
#     batch_size=batch_size,
#     sort_within_batch=True,
#     sort_key=lambda x: len(x.src),
#     device=device,
# )

sentence = "Struggling to cope. I just wish I had the power to change everything."
train_loss_hist = []
test_loss_hist = []

for epoch in range(num_epochs):
  start_time = time.time()

  if save_model:
    checkpoint = {
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
    }
    save_checkpoint(checkpoint)

  mean_train_loss = epoch_train()
  
  mean_test_loss = epoch_test()

  translated_sentence = generate_comment(
    model, sentence, device, max_length=50
  )
  print(f"Translated example sentence: \n {translated_sentence}")

  train_loss_hist.append(mean_train_loss)
  test_loss_hist.append(mean_test_loss)    

  scheduler.step(mean_train_loss)

  elapsed_time = time.time() - start_time
  curr_lr = optimizer.param_groups[0]['lr']
  print(f'Epoch {epoch} Train Loss: {mean_train_loss} Test Loss: {mean_test_loss} LR:{curr_lr} Time: {time.strftime("%H:%M:%S.{}".format(str(elapsed_time % 1)[2:])[:15], time.gmtime(elapsed_time))}')


# running on entire test data takes a while
# score = bleu(test_data[1:100], model, german, english, device)
# print(f"Bleu score {score * 100:.2f}")

Saving checkpoint


  cpuset_checked))


Translated example sentence: 
 ['you', 'you', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']
Epoch 0 Train Loss: 5.726942675454276 Test Loss: 5.794261693954468 LR:0.0003 Time: 00:01:52.039682
Saving checkpoint
Translated example sentence: 
 ['i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i']
Epoch 1 Train Loss: 5.692608946845645 Test Loss: 5.794719457626343 LR:0.0003 Time: 00:01:50.859410
Saving checkpoint
Translated example sentence: 
 ['i', 'm', 'm', 'm', 'm', 'the', 'm', 'the', 'm', 'the', 'm', 'the', 'm', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'i', 'm', 'm', 'm', 'm'

KeyboardInterrupt: ignored

In [None]:
m/