<h1 style="text-align: center; font-weight: bold">Training - Pipeline</h1>

---

# 1 - Notebook Setup

## 1.1 - Imports

In [11]:
# visualization
import matplotlib.pyplot as plt
import tqdm

# data manipulation
from torch.utils.data import random_split, Dataset, DataLoader
from pathlib import Path

import torch.nn as nn
import pandas as pd
import numpy as np
import torch
import json
import os

## 1.2 - Dataset path

In [2]:
# Detect the runtime: mounting Google Drive only works inside Colab, so any
# failure while importing/mounting means we are on a local checkout.
try:
    from google.colab import drive 
    drive.mount('/content/gdrive')
except Exception:
    # Local layout: sibling folders of the 'notebooks' directory
    working_dir = os.getcwd()
    ENV = 'Local Machine'
    df_path = Path(working_dir.replace('notebooks', 'data/processed'))
    figure_path = Path(working_dir.replace('notebooks', 'figures'))
else:
    ENV = 'Google Colab'
    df_path = Path('/content/gdrive/MyDrive/')
    figure_path = Path('/content')

print(f'[\033[1;33mENVIRONMENT\033[0m]: {ENV}')
print(f'Dataset located in {df_path}')

[[1;33mENVIRONMENT[0m]: Local Machine
Dataset located in c:\Users\Matheus\WorkSpace\ChatBot\data\processed


## 1.3 - Device (GPU|CPU)

In [3]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Coloured status label: green when a GPU was selected, red otherwise
gpu_available = (
    '\033[1;32mAvailable\033[0m'
    if device.type == 'cuda'
    else '\033[1;31mNot Available\033[0m'
)
print(f'[\033[1;36mGPU\033[0m]: {gpu_available}')

[[1;36mGPU[0m]: [1;31mNot Available[0m


# 2 - Custom Dataset

## 2.1 - Loading dataset

In [9]:
# Cleaned question/answer table; malformed CSV rows are skipped
data_frame = pd.read_csv(
    df_path / 'cleaned_dataframe.csv',
    encoding='utf-8',
    delimiter=",",
    on_bad_lines='skip',
)

# Vocabulary mapping and sequence-length metadata stored next to the CSV
with open(file=df_path / 'data_info.json', mode='r') as json_file:
    json_content = json.load(fp=json_file)

word_dict = json_content['word_dict']
question_len = json_content['question_length']
answer_len = json_content['answer_length']
vocabulary = json_content['num_vocabulary']

In [10]:
# Preview the first five rows of the loaded table
data_frame.head(n=5)

Unnamed: 0,question,answers
0,first work generally recognized artificial int...,warren mcculloch and walter pitts 1943
1,sources drawn formation first work generally r...,knowledge of the basic physiology and function...
2,created hebbian learning rule,donald hebb 1949
3,first neural network built,1950
4,first neural network called,the snarc


## 2.2 - Splitting dataset

In [13]:
# Fixed seed so the train/test/validation partition is identical after every
# kernel restart (otherwise samples migrate between splits from run to run).
SPLIT_SEED = 42

cleaned_dataframe_np = np.asarray(a=data_frame)

# Fractions of the data assigned to train, test and validation (in that order)
split_size = [0.8, 0.1, 0.1]
train_split, test_split, valid_split = random_split(
    dataset=cleaned_dataframe_np,
    lengths=split_size,
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [14]:
# Turn each Subset's index list into the corresponding rows of the array
train_df, test_df, valid_df = (
    cleaned_dataframe_np[subset.indices]
    for subset in (train_split, test_split, valid_split)
)

In [15]:
# Report the (rows, columns) shape of every split
for df_name, data in (('train', train_df), ('test', test_df), ('valid', valid_df)):
    print(f'{df_name:5}: {data.shape}')

train: (3377, 2)
test : (422, 2)
valid: (422, 2)


## 2.3 - Creating custom dataset class

In [16]:
class CustomDataset(Dataset):
    """Map-style dataset that encodes (question, answer) string pairs as tensors.

    Every sample is returned as a pair ``(x, y)``:

    * ``x`` - 1-D ``int64`` tensor of word indices laid out as
      ``<start>``, question words, ``<stop>``, then ``<pad>`` up to ``max_len_in``.
    * ``y`` - one-hot matrix of shape ``(voc_size, max_len_out)``; column ``t``
      marks the answer word at position ``t``, the column after the last word
      marks ``<stop>`` and all remaining columns mark ``<pad>``.

    Questions and answers that do not fit are truncated so that every sample
    has the same shape and the ``<stop>`` marker is always present.
    """

    def __init__(self, dataset: np.ndarray, word_dict: dict, voc_size: int, max_len_in: int = 30, max_len_out: int = 500) -> None:
        """Store the raw text columns and the encoding parameters.

        Args:
            dataset: 2-D array whose column 0 holds the questions and column 1
                the answers, both as space-separated strings.
            word_dict: Mapping from word to integer index; must contain the
                special tokens ``<pad>``, ``<start>`` and ``<stop>``.
            voc_size: Number of rows of the one-hot answer matrix.
            max_len_in: Length of the encoded question vector.
            max_len_out: Number of columns of the one-hot answer matrix.
        """
        self.x = dataset[:,0]
        self.y = dataset[:,1]

        self.wd = word_dict
        self.vs = voc_size
        self.mli = max_len_in
        self.mlo = max_len_out

    def __len__(self) -> int:
        """Return the number of (question, answer) pairs."""
        return len(self.x)

    def __getitem__(self, index: int) -> tuple:
        """Encode the pair at ``index`` and return it as ``(x, y)`` tensors."""
        x_data, y_data = self.gen_data(x=self.x[index], y=self.y[index])
        return torch.from_numpy(x_data), torch.from_numpy(y_data)

    def gen_data(self, x: str, y: str) -> tuple:
        """Encode one question/answer pair as NumPy arrays.

        Args:
            x: Question as space-separated words.
            y: Answer as space-separated words.

        Returns:
            ``(x_data, y_data)`` where ``x_data`` is an ``int64`` vector of
            length ``max_len_in`` (for ``max_len_in >= 2``) and ``y_data`` is a
            float one-hot matrix of shape ``(voc_size, max_len_out)``.

        Raises:
            KeyError: If a word (or a special token) is missing from the
                word dictionary.
        """
        pad = self.wd['<pad>']
        start = self.wd['<start>']
        stop = self.wd['<stop>']

        # --- question: <start> w1 ... wn <stop> <pad>* ---------------------
        # Reserve two slots for <start>/<stop>; longer questions are cut so
        # every sample has the same length and can be stacked into a batch.
        question_words = x.split(' ')[:max(self.mli - 2, 0)]
        x_tokens = [start]
        x_tokens.extend(self.wd[word] for word in question_words)
        x_tokens.append(stop)
        x_tokens.extend([pad] * (self.mli - len(x_tokens)))
        # Explicit int64 keeps the index dtype identical on every platform.
        x_data = np.asarray(a=x_tokens, dtype=np.int64)

        # --- answer: one-hot columns, <pad> everywhere by default ----------
        y_data = np.zeros(shape=[self.vs, self.mlo])
        y_data[pad,:] = 1

        # Keep one column free for <stop>; without the cut an answer with
        # max_len_out (or more) words would index past the last column.
        answer_words = y.split(' ')[:max(self.mlo - 1, 0)]
        y_tokens = [self.wd[word] for word in answer_words]
        y_tokens.append(stop)
        for position, token in enumerate(y_tokens[:self.mlo]):
            y_data[token,position] = 1
            y_data[pad,position] = 0

        return x_data, y_data

In [19]:
# Encoding parameters shared by the three splits
dataset_kwargs = dict(
    word_dict=word_dict,
    voc_size=vocabulary,
    max_len_in=question_len,
    max_len_out=answer_len,
)

train_custom_df = CustomDataset(dataset=train_df, **dataset_kwargs)
test_custom_df = CustomDataset(dataset=test_df, **dataset_kwargs)
valid_custom_df = CustomDataset(dataset=valid_df, **dataset_kwargs)

In [20]:
# Sanity check: show the tensor sizes of the first test sample (if any)
first_sample = next(iter(test_custom_df), None)
if first_sample is not None:
    x, y = first_sample
    print(f'X: {x.size()}')
    print(f'Y: {y.size()}')

X: torch.Size([30])
Y: torch.Size([5249, 400])


# 3 - Model

In [21]:
class ChatBot(nn.Module):

    def __init__(self, in_f: int, out_shape: tuple, word_dict: dict) -> None:
        
        self.len = out_shape[1]
        self.w = out_shape[0]
        self.lstm = nn.LSTM(input_size=in_f, hidden_size=1024, num_layers=2)
        self.lin = nn.Linear(in_features=1024, out_features=out_shape[0])
        self.emb = nn.Embedding(num_embeddings=len(word_dict), embedding_dim=self.len)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed ``x`` and run it step by step through the LSTM and linear layer.

        A zero tensor of size ``(x.size(0), self.w, self.len)`` is allocated on
        the notebook-level ``device``; the loop runs ``self.len`` times, each
        time passing the slice ``x[:, idx]`` and the carried state through
        ``self.lstm`` and ``self.lin`` and storing the result in ``pred``.

        NOTE(review): several points look inconsistent and should be confirmed
        with an actual forward pass before training:
          * the initial state is the pair of Python ints ``0, 0``; nn.LSTM
            documents tensors for ``(h_0, c_0)`` - verify this is accepted.
          * ``x[:, idx]`` is a 2-D slice of the embedded input; presumably it
            is meant as one time step for the whole batch, but nn.LSTM is
            built without ``batch_first`` - confirm how the slice is read.
          * the loop runs over ``self.len`` while ``x`` is indexed along its
            second axis; confirm that axis is at least ``self.len`` long.
          * ``pred[:, idx]`` indexes the second axis of ``pred`` (size
            ``self.w``) with an index that ranges over ``self.len``;
            presumably ``pred[:, :, idx]`` was intended - confirm.

        Args:
            x: Tensor of word indices; assumed (batch, sequence) - TODO confirm.

        Returns:
            The ``pred`` tensor of size ``(x.size(0), self.w, self.len)``.
        """
        
        # Output buffer, created on CPU and then moved to the selected device
        pred = torch.zeros(size=(x.size(0), self.w, self.len)).to(device=device)

        x = self.emb(x)
        # NOTE(review): integer placeholders for the LSTM state - see docstring
        h, c = 0, 0

        for idx in range(self.len):
            # Carry (h, c) from one iteration to the next
            out, (h, c) = self.lstm(x[:,idx], (h, c))
            out = self.lin(out)
            # NOTE(review): writes along the axis of size self.w - see docstring
            pred[:,idx] = out
        
        return pred
    
    def gen_text(self, x) -> torch.Tensor:
        
        pred = torch.zeros(size=(x.size(0), self.w, self.len)).to(device=device)