<a href="https://colab.research.google.com/github/IainHigh/MLP/blob/main/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pytorch_pretrained_bert --quiet
!pip install datasets --quiet

In [10]:
# Imports
import os
import sys
import math
import nltk
import torch
import random
import string
import datasets
import statistics
import spacy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm.notebook import tqdm
from abc import ABC, abstractmethod

from nltk.corpus import brown
from gensim.models import Word2Vec
from sklearn.manifold import TSNE

import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
# from torchtext.vocab import Vectors
# from transformers import AutoTokenizer
import torch.optim as optim

from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert import BertForMaskedLM

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

In [11]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# @title Set random seed

# @markdown Executing `set_seed(seed=seed)` you are setting the seed

# for DL its critical to set the random seed so that students can have a
# baseline to compare their results to expected results.
# Read more here: https://pytorch.org/docs/stable/notes/randomness.html

# Call `set_seed` function in the exercises to ensure reproducibility.
import random
import torch

def set_seed(seed=None, seed_torch=True):
  """
  Handles variability by controlling sources of randomness
  through set seed values

  Args:
    seed: Integer
      Set the seed value to given integer.
      If no seed, set seed value to random integer in the range 2^32
    seed_torch: Bool
      Seeds the random number generator for all devices to
      offer some guarantees on reproducibility

  Returns:
    Nothing
  """
  if seed is None:
    seed = np.random.choice(2 ** 32)
  random.seed(seed)
  np.random.seed(seed)
  if seed_torch:
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

  print(f'Random seed {seed} has been set.')


# In case that `DataLoader` is used
def seed_worker(worker_id):
  """
  DataLoader will reseed workers following randomness in
  multi-process data loading algorithm.

  Args:
    worker_id: integer
      ID of subprocess to seed. 0 means that
      the data will be loaded in the main process
      Refer: https://pytorch.org/docs/stable/data.html#data-loading-randomness for more details

  Returns:
    Nothing
  """
  worker_seed = torch.initial_seed() % 2**32
  np.random.seed(worker_seed)
  random.seed(worker_seed)

In [6]:
# @title Set device (GPU or CPU). Execute `set_device()`
# especially if torch modules used.

# inform the user if the notebook uses GPU or CPU.

def set_device():
  """
  Set the device. CUDA if available, CPU otherwise

  Args:
    None

  Returns:
    Nothing
  """
  device = "cuda" if torch.cuda.is_available() else "cpu"
  if device != "cuda":
    print("WARNING: For this notebook to perform best, "
        "if possible, in the menu under `Runtime` -> "
        "`Change runtime type.`  select `GPU` ")
  else:
    print("GPU is enabled in this notebook.")

  return device

In [7]:
SEED = 42
set_seed(seed=SEED)
DEVICE = set_device()

Random seed 42 has been set.
GPU is enabled in this notebook.


In [8]:
train_csv_path = "/content/drive/MyDrive/mlp-project/data/train_interactions.csv"
val_csv_path = "/content/drive/MyDrive/mlp-project/data/test_interactions.csv"
test_csv_path = "/content/drive/MyDrive/mlp-project/data/validate_interactions.csv"

In [9]:
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)
val_df = pd.read_csv(val_csv_path)

In [10]:
print(train_df.shape)

(19719025, 5)


In [11]:
train_df.head()

Unnamed: 0,user_id,book_id,date_added,read_at,started_at
0,17277ab9d32482da501c252052235561,25695484,2017-08-15 14:42:19,2017-08-17 17:12:49,2017-08-15 14:42:21
1,17277ab9d32482da501c252052235561,23411534,2017-08-15 14:40:37,2017-03-01 00:00:00,2017-03-01 00:00:00
2,17277ab9d32482da501c252052235561,16130,2017-04-21 19:41:34,,2017-04-21 19:41:34
3,17277ab9d32482da501c252052235561,17562818,2016-10-20 10:52:14,2016-10-22 00:00:00,2016-10-20 00:00:00
4,17277ab9d32482da501c252052235561,27066704,2016-08-10 17:00:25,2016-08-11 21:57:00,2016-08-10 17:00:25


In [12]:
book_csv_path = "/content/drive/MyDrive/mlp-project/data/books_filtered_by_language.csv"

In [13]:
book_data_df = pd.read_csv(book_csv_path)

In [14]:
print(book_data_df.shape)

(434423, 7)


In [15]:
book_data_df.head(10)

Unnamed: 0,isbn,language_code,description,isbn13,book_id,title,num_pages
0,743294297.0,eng,Addie Downs and Valerie Adler were eight when ...,9780743294294.0,6066819,Best Friends Forever,368.0
1,,eng,,,33394837,The House of Memory (Pluto's Snitch #2),318.0
2,,eng,,9781621086949.0,21401188,Glimmering Light,160.0
3,,eng,,,30227122,"The 30s (Fantastic Films of the Decades, #2)",255.0
4,1479174661.0,eng,Arrianna Williams is an ordinary 25 yr. old wo...,9781479174669.0,16037548,Untold Secrets: Fire & Ice,168.0
5,1483985644.0,eng,Embrace the word of God with the inspirational...,9781483985640.0,18628482,Understand God's Word - Walk in the Truth,412.0
6,9780807843.0,eng,"""This critical, historical, and theoretical st...",,13598461,Labor and Desire: Women's Revolutionary Fictio...,236.0
7,980145988.0,eng,"Elfin mercenaries, Lark and her brother Orin, ...",9780980145984.0,13598465,Faminelands: The Carp's Eye (Book 1),100.0
8,1850294607.0,eng,"Restore, revamp, repair, and revitalize your h...",9781850294603.0,427479,Terence Conran's DIY By Design: Over 30 Projec...,256.0
9,,eng,The questions plaguing Captain America's dream...,,13571772,Captain America: Winter Soldier (The Ultimate ...,146.0


In [16]:
mod_book_csv_path = "/content/drive/MyDrive/mlp-project/data/books_filtered_by_language_modified_desc.csv"

In [17]:
mod_book_data_df = pd.read_csv(mod_book_csv_path)

In [18]:
print(mod_book_data_df.shape)

(406913, 8)


In [19]:
mod_book_data_df.head()

Unnamed: 0,isbn,language_code,description,isbn13,book_id,title,num_pages,modified_description
0,743294297,eng,Addie Downs and Valerie Adler were eight when ...,9780743294294.0,6066819,Best Friends Forever,368.0,addie downs valerie adler eight first meet dec...
1,1479174661,eng,Arrianna Williams is an ordinary 25 yr. old wo...,9781479174669.0,16037548,Untold Secrets: Fire & Ice,168.0,arrianna williams ordinary yr. old woman think...
2,1483985644,eng,Embrace the word of God with the inspirational...,9781483985640.0,18628482,Understand God's Word - Walk in the Truth,412.0,embrace word god inspirational book understand...
3,9780807843,eng,"""This critical, historical, and theoretical st...",,13598461,Labor and Desire: Women's Revolutionary Fictio...,236.0,critical historical theoretical study look lit...
4,980145988,eng,"Elfin mercenaries, Lark and her brother Orin, ...",9780980145984.0,13598465,Faminelands: The Carp's Eye (Book 1),100.0,elfin mercenary lark brother orin hunt perfect...


In [20]:
for text in list(mod_book_data_df.head()['modified_description']):
  print(text)

addie downs valerie adler eight first meet decide best friend forever wake tragedy betrayal teenage year everything change val fame fortune addie stay behind small midwestern town destiny however store two twenty-five year later val show addie front door blood coat terror face beginning wild adventure two woman join love history find strength together could find alone
arrianna williams ordinary yr. old woman think stumble across special book soon see special book really least think know book come life arrianna longer read page story shortly read book run old childhood friend damian quickly fall last long hurt find comfort another man thinking life completely normal soon find special gift put life danger archangel name gabriel come rescue turn thing get bad hunt gift man seek comfort desperate need save life meanwhile feeling gabriel grow stronger wonder could true love could angel ever love way love defeat fall angel demon many secret many lie much heartache happen journey discover rea

In [21]:
import re

# Tokenization
tok = spacy.load("en_core_web_sm")
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [22]:
for text in list(mod_book_data_df.head()['modified_description']):
  print(tokenize(text))

['addie', 'downs', 'valerie', 'adler', 'eight', 'first', 'meet', 'decide', 'best', 'friend', 'forever', 'wake', 'tragedy', 'betrayal', 'teenage', 'year', 'everything', 'change', 'val', 'fame', 'fortune', 'addie', 'stay', 'behind', 'small', 'midwestern', 'town', 'destiny', 'however', 'store', 'two', 'twenty', 'five', 'year', 'later', 'val', 'show', 'addie', 'front', 'door', 'blood', 'coat', 'terror', 'face', 'beginning', 'wild', 'adventure', 'two', 'woman', 'join', 'love', 'history', 'find', 'strength', 'together', 'could', 'find', 'alone']
['arrianna', 'williams', 'ordinary', 'yr', ' ', 'old', 'woman', 'think', 'stumble', 'across', 'special', 'book', 'soon', 'see', 'special', 'book', 'really', 'least', 'think', 'know', 'book', 'come', 'life', 'arrianna', 'longer', 'read', 'page', 'story', 'shortly', 'read', 'book', 'run', 'old', 'childhood', 'friend', 'damian', 'quickly', 'fall', 'last', 'long', 'hurt', 'find', 'comfort', 'another', 'man', 'thinking', 'life', 'completely', 'normal', 's

In [23]:
#add_modified_description("/content/goodreads_data/books_filtered_by_language.csv", "/content/goodreads_data/books_filtered_by_language_mod_desc.csv")

In [28]:
from collections import Counter
import re

def preprocess_data(training_df, val_df, test_df):

    print("\n\nPreprocessing data...")

    # Calculate the time_to_start for each interaction
    training_df["time_to_start_seconds"] = pd.to_datetime(training_df["started_at"]) - pd.to_datetime(training_df["date_added"])
    val_df["time_to_start_seconds"] = pd.to_datetime(val_df["started_at"]) - pd.to_datetime(val_df["date_added"])
    test_df["time_to_start_seconds"] = pd.to_datetime(test_df["started_at"]) - pd.to_datetime(test_df["date_added"])

    training_df["time_to_start_seconds"] = training_df["time_to_start_seconds"].dt.total_seconds()
    val_df["time_to_start_seconds"] = val_df["time_to_start_seconds"].dt.total_seconds()
    test_df["time_to_start_seconds"] = test_df["time_to_start_seconds"].dt.total_seconds()

    # Remove time_to_start_seconds that are less than or equal to 0
    training_df = training_df[training_df["time_to_start_seconds"] > 0]
    val_df = val_df[val_df["time_to_start_seconds"] > 0]
    test_df = test_df[test_df["time_to_start_seconds"] > 0]

    # Read the book data csv
    book_data = pd.read_csv(mod_book_csv_path)

    # Tokenization
    tok = spacy.load("en_core_web_sm")
    def tokenize (text):
        #text = re.sub(r"[^\x00-\x7F]+", " ", text)
        # regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
        # nopunct = regex.sub(" ", text.lower())
        return [token.text for token in tok.tokenizer(text)]

    # Count number of occurences of each word
    counts = Counter()
    for text in list(book_data['modified_description']):
      counts.update(tokenize(text))
      # counts.update(text)

    # Deleting infrequent words
    print("num_words before:",len(counts.keys()))
    for word in list(counts):
        if counts[word] < 2:
            del counts[word]
    print("num_words after:",len(counts.keys()))

    # Creating vocabulary
    vocab2index = {"":0, "UNK":1}
    words = ["", "UNK"]
    for word in counts:
        vocab2index[word] = len(words)
        words.append(word)

    def encode_sentence(text, vocab2index, N=200):
        # tokenized = tokenize(text)
        encoded = np.zeros(N, dtype=int)
        # enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
        enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in text])
        length = min(N, len(enc1))
        encoded[:length] = enc1[:length]
        return encoded, length

    book_data['encoded'] = book_data['modified_description'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))

    # Merge the book data with the training and test data
    training_df = training_df.merge(book_data, how="left", on="book_id")
    val_df = val_df.merge(book_data, how="left", on="book_id")
    test_df = test_df.merge(book_data, how="left", on="book_id")


     # Convert the language code into an integer
    lang_map = {
        "eng": 0,
        "en-US": 1,
        "en-GB": 2,
    }

    test_df["language_code"] = test_df["language_code"].map(lang_map)
    val_df["language_code"] = val_df["language_code"].map(lang_map)
    training_df["language_code"] = training_df["language_code"].map(lang_map)

    # For each user, calculate the average time_to_start_seconds for all other books they have read
    user_avg_time_to_start = training_df.groupby("user_id")["time_to_start_seconds"].mean()
    user_avg_time_to_start = user_avg_time_to_start.rename("user_avg_time_to_start")
    training_df = training_df.merge(user_avg_time_to_start, how="left", on="user_id")

    user_avg_time_to_start = val_df.groupby("user_id")["time_to_start_seconds"].mean()
    user_avg_time_to_start = user_avg_time_to_start.rename("user_avg_time_to_start")
    val_df = val_df.merge(user_avg_time_to_start, how="left", on="user_id")

    user_avg_time_to_start = test_df.groupby("user_id")["time_to_start_seconds"].mean()
    user_avg_time_to_start = user_avg_time_to_start.rename("user_avg_time_to_start")
    test_df = test_df.merge(user_avg_time_to_start, how="left", on="user_id")

    # Remove unnecessary columns - ones that are not useful for training the model
    training_df = training_df.drop(
        columns=["isbn", "isbn13", "date_added", "read_at", "started_at", "title", "description"]
    )
    val_df = val_df.drop(
        columns=["isbn", "isbn13", "date_added", "read_at", "started_at", "title", "description"]
    )
    test_df = test_df.drop(
        columns=["isbn", "isbn13", "date_added", "read_at", "started_at", "title", "description"]
    )

    training_df = training_df.dropna()
    val_df = val_df.dropna()
    test_df = test_df.dropna()

    return training_df, val_df, test_df


In [None]:
processed_train_df, processed_val_df, processed_test_df = preprocess_data(train_df, val_df, test_df)



Preprocessing data...


In [28]:
processed_train_df.to_csv("/content/drive/MyDrive/mlp-project/data/processed_train_df_2.csv")
processed_val_df.to_csv("/content/drive/MyDrive/mlp-project/data/processed_val_df_2.csv")
processed_test_df.to_csv("/content/drive/MyDrive/mlp-project/data/processed_test_df_2.csv")

In [29]:
processed_train_df.head(10)

Unnamed: 0,time_to_start_seconds,language_code,num_pages,user_avg_time_to_start
0,2.0,0.0,352.0,43665840.0
1,10846945.0,2.0,241.0,43665840.0
2,3656012.0,0.0,343.0,43665840.0
3,8397258.0,0.0,329.0,43665840.0
4,2370.0,0.0,384.0,43665840.0
6,2265323.0,0.0,113.0,43665840.0
7,1644523.0,0.0,323.0,43665840.0
8,4690108.0,0.0,290.0,43665840.0
11,286504.0,0.0,399.0,43665840.0
12,140313991.0,0.0,110.0,43665840.0


## PyTorch training loop

In [None]:
def train_model_regr(model, epochs=10, lr=0.001):

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.float()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.mse_loss(y_pred, y.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss = validation_metrics_regr(model, val_dl)
        if i % 5 == 1:
            print("train mse %.3f val rmse %.3f" % (sum_loss/total, val_loss))

def validation_metrics_regr (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.float()
        y_hat = model(x, l)
        loss = np.sqrt(F.mse_loss(y_hat, y.unsqueeze(-1)).item())
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total