# Starter code for the skeleton notebook

If you are running this notebook on **Google Colab**, make sure you are using a GPU runtime.

This notebook mounts Google Drive to load the embeddings and data. You can find them via the Kaggle competition page: https://www.kaggle.com/competitions/quora-insincere-questions-classification/data

When running Colab, it automatically grabs scripts from
https://github.com/LLeon360/aiprojects-nlp-quora-questions

Check out [data/EmbeddingDataset.py](data/EmbeddingDataset.py) for the dataset processing code. \
Check out [networks/LSTMEncoder.py](networks/LSTMEncoder.py) for the neural network code. \
Check out [train_functions/lstm_train.py](train_functions/lstm_train.py) for the training code.

### Mount Drive (Google Colab)

In [None]:
# Colab only: mount Google Drive at /content/drive so the embeddings and CSV
# files read later in this notebook (under /content/drive/MyDrive/...) are available.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Grab scripts from GitHub Repo

In [None]:
# Clone the project repository into ./scripts, then move its top-level contents
# into the working directory so the `constants`, `data`, `networks` and
# `train_functions` modules imported below can be found.
# NOTE(review): the recorded output lists a kaggle.json coming from the repo - if
# that file holds API credentials it should not be committed; please confirm.
!git clone https://github.com/LLeon360/aiprojects-nlp-quora-questions scripts
!mv  -v scripts/* .

Cloning into 'scripts'...
remote: Enumerating objects: 239, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 239 (delta 15), reused 26 (delta 10), pack-reused 203[K
Receiving objects: 100% (239/239), 112.80 KiB | 3.89 MiB/s, done.
Resolving deltas: 100% (109/109), done.
renamed 'scripts/acmprojects.yml' -> './acmprojects.yml'
renamed 'scripts/constants.py' -> './constants.py'
renamed 'scripts/data' -> './data'
renamed 'scripts/kaggle.json' -> './kaggle.json'
renamed 'scripts/main.ipynb' -> './main.ipynb'
renamed 'scripts/networks' -> './networks'
renamed 'scripts/README.md' -> './README.md'
renamed 'scripts/train_functions' -> './train_functions'


### Imports

In [None]:
import os

import constants

from data.StartingDataset import StartingDataset
from networks.StartingNetwork import StartingNetwork
from train_functions.lstm_train import lstm_train

from data.EmbeddingDataset import EmbeddingDataset
from networks.LSTMEncoder import LSTMEncoder

import torch
from torch.utils.data import random_split, WeightedRandomSampler, BatchSampler

import pandas as pd

import csv
import numpy as np

from sklearn.model_selection import train_test_split

### Constants

In [None]:
# Shared defaults live in constants.py. Values that used to be hard-coded in
# this cell: EPOCHS = 100, BATCH_SIZE = 32, N_EVAL = 100, VAL_SPLIT = 0.1
from constants import BATCH_SIZE, EPOCHS, N_EVAL, VAL_SPLIT

# Notebook-specific overrides of the imported defaults.
EPOCHS = 1
VAL_SPLIT = 0.05

### GPU Support


In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Define hyperparameters

In [None]:
# Training hyperparameters handed to the training function.
hyperparameters = dict(epochs=EPOCHS, batch_size=BATCH_SIZE)

### Load Embeddings

You need to have the embeddings downloaded and stored at the file path used in the cell below.

In [None]:
# Read the GloVe text file: one token per row followed by its embedding values,
# separated by whitespace.
#  - header=None: the file has no header row; without it the first token's row
#    is consumed as column names and silently dropped from the vocabulary.
#  - dtype={0: str} + na_filter=False: keep every token as a literal string so
#    words such as "null", "nan" or "none" are not converted to NaN.
#  - sep=r'\s+' replaces the deprecated delim_whitespace=True (same behaviour).
#  - quoting=csv.QUOTE_NONE: quote characters are ordinary tokens, not delimiters.
full_content = pd.read_csv(
    '/content/drive/MyDrive/AI/quora_nlp/glove.6B.300d.txt',
    sep=r'\s+',
    header=None,
    quoting=csv.QUOTE_NONE,
    dtype={0: str},
    na_filter=False,
)

In [None]:
# full_content.head()

In [None]:
# First column holds the tokens; every remaining column is an embedding dimension.
i_embeddings = full_content.iloc[:, 1:]
i_word = full_content.iloc[:, 0]

In [None]:
# Convert the pandas objects into independent numpy arrays.
embs_npa = i_embeddings.to_numpy(copy=True)
vocab_npa = i_word.to_numpy(copy=True)

In [None]:
# Put the padding token at index 0 and the unknown token at index 1,
# ahead of the pretrained vocabulary.
vocab_npa = np.insert(vocab_npa, 0, ['<pad>', '<unk>'])

In [None]:
# <unk> is represented by the mean of all pretrained vectors, <pad> by zeros.
emb_dim = embs_npa.shape[1]
unk_emb_npa = embs_npa.mean(axis=0, keepdims=True)
pad_emb_npa = np.zeros((1, emb_dim))

# Stack in the same order as the vocabulary: <pad>, <unk>, then the pretrained rows.
embs_npa = np.concatenate((pad_emb_npa, unk_emb_npa, embs_npa), axis=0)

In [None]:
# Sanity check: the vocabulary and the embedding matrix should have the same number of rows.
print(vocab_npa.shape, embs_npa.shape, sep="\n")

(400001,)
(400001, 300)


### Split data

In [None]:
# Full labelled training set, read from Drive.
# Local alternative: entire_df = pd.read_csv("train.csv")
entire_df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/AI/quora_nlp/train.csv",
)

In [None]:
# Hold out VAL_SPLIT of the labelled data for validation.
#  - stratify on the label so both splits keep the same positive/negative ratio
#    (the classes are heavily imbalanced).
#  - fixed random_state so the split is reproducible across kernel restarts.
train_df, val_df = train_test_split(
    entire_df,
    test_size=VAL_SPLIT,
    random_state=42,
    stratify=entire_df["target"],
)
# Separate test file, read from Drive.
test_df = pd.read_csv("/content/drive/MyDrive/AI/quora_nlp/test.csv")

In [None]:
# Row counts of the training and validation splits, one per line.
print(len(train_df), len(val_df), sep="\n")
# print(len(test_df))

1240815
65307


#### Class imbalance

In [None]:
# Partition the labelled data by target to measure how skewed the classes are.
target_col = entire_df["target"]
positive_df = entire_df[target_col == 1]
negative_df = entire_df[target_col == 0]

n_negative, n_positive = len(negative_df), len(positive_df)
print(n_negative)
print(n_positive)
# negatives per positive
print(n_negative / n_positive)

1225312
80810
15.16287588169781


#### Weighted Sampler

There is a significant class imbalance (mostly negative cases), so we use a weighted sampler to train the model on a roughly balanced mix of both classes.

In [None]:
# Per-sample sampling weights: negatives get weight 1 and positives get the
# negative/positive ratio of the training split, so both classes are drawn
# about equally often. The ratio is computed from train_df instead of being
# hard-coded, so it stays correct if the data or the split changes.
is_positive = (train_df["target"] == 1).to_numpy()
n_positive = int(is_positive.sum())
n_negative = len(train_df) - n_positive
# With no positives in the split there is nothing to up-weight; sample uniformly.
pos_weight = n_negative / n_positive if n_positive else 1.0

weights = np.ones(len(train_df))
weights[is_positive] *= pos_weight
weights /= (len(train_df)) # PyTorch docs say the weights don't have to sum to 1, but sampling did not work without this scaling

# Draw len(train_df) indices per epoch, with replacement, according to the weights.
sampler = WeightedRandomSampler(weights=weights, num_samples=len(train_df), replacement=True)

### Initialize datasets and model


In [None]:
# Single configuration object consumed by the model and the dataset cells below.
config = dict(
    # model configurations
    batch_size=32,
    max_seq_length=100,
    lr=1e-3,
    label_count=2,
    dropout_prob=0.2,
    hidden_size=256,
    lstm_unit_cnt=2,
    # embeddings configurations
    pretrained_embeddings=embs_npa,
    freeze_embeddings=True,
    vocab=vocab_npa,
    pad_token='<pad>',
    unk_token='<unk>',
    # data
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    # runtime
    device=device,
)

In [None]:
# starting fc network, ignore for embeddings and lstm
# data_path = "mini_train.csv"

# train_dataset = StartingDataset(data_path)
# #val split
# generator1 = torch.Generator().manual_seed(42)
# train_dataset, val_dataset = random_split(train_dataset, [1-VAL_SPLIT, VAL_SPLIT], generator = generator1)
# model = StartingNetwork()


In [None]:
# print(len(train_dataset))
# print(len(val_dataset))

In [None]:
# Build the LSTM encoder from the config and place it on the selected device.
model = LSTMEncoder(config).to(device)
# Bare expression so the notebook displays the module summary.
model

LSTMEncoder(
  (embedding): Embedding(400001, 300)
  (lstm): LSTM(300, 256, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Arguments shared by the training and validation datasets.
dataset_kwargs = {
    key: config[key]
    for key in ('vocab', 'max_seq_length', 'pad_token', 'unk_token')
}

# Same preprocessing for both splits; only the source frame differs.
train_dataset = EmbeddingDataset(df=config['train_df'], **dataset_kwargs)
val_dataset = EmbeddingDataset(df=config['val_df'], **dataset_kwargs)


### Test Sampler

In [None]:
# Loader used to try out the weighted sampler: indices drawn by `sampler` are
# grouped into batches, and the final incomplete batch is dropped.
# The batch size comes from config instead of a duplicated literal 32.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_sampler=BatchSampler(
        sampler,
        batch_size=config['batch_size'],
        drop_last=True,
    ),
)

### Train model


In [None]:
# Run training: model and device first, then the data sources, then the schedule.
lstm_train(
    model=model,
    device=device,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    train_sampler=sampler,
    hyperparameters=hyperparameters,
    n_eval=N_EVAL,
)


In [None]:
# Load the TensorBoard notebook extension and open it inline, pointed at the
# `runs` directory for its log data.
%load_ext tensorboard
%tensorboard --logdir runs

In [None]:
# Destination file for the trained model.
PATH = "entire_model.pt"

# Serialize the whole model object (not just a state_dict) to PATH.
torch.save(obj=model, f=PATH)