In [52]:

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split
import os, csv
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import re
from nltk.tokenize import wordpunct_tokenize

# Read data (Datamodule)

In [83]:
movies_train = pd.read_csv('../dataset/movies_train.csv', escapechar='\\', quoting=csv.QUOTE_NONE)
movies_train, movies_val = train_test_split(movies_train, test_size=0.2, random_state=42)
movies_test = pd.read_csv('../dataset/movies_test.csv', escapechar='\\', quoting=csv.QUOTE_NONE)
print(len(movies_train), len(movies_val), len(movies_test))


2484 622 777


In [84]:
movies_test

Unnamed: 0,title,genre
0,the great muppet caper,"[""Children's"", 'Comedy']"
1,doctor zhivago,"['Drama', 'Romance', 'War']"
2,frankenstein meets the wolf man,['Horror']
3,for your eyes only,['Action']
4,the mirror,['Drama']
...,...,...
772,the inheritors,['Drama']
773,"the karate kid, part ii","['Action', 'Adventure', 'Drama']"
774,a league of their own,"['Comedy', 'Drama']"
775,algiers,"['Drama', 'Romance']"


### Title

In [94]:
TITLE_MAX_LEN = 15
pad_token = '<PAD>'
unk_token = '<UNK>'

def tokenize(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = wordpunct_tokenize(text)
    return tokens

def create_vocab(dataset, column='title'):
    df = dataset.copy()
    titles = df[column].tolist()
    vocab = set()
    for title in titles:
        tokens = tokenize(title)
        vocab.update(tokens)
    vocab = list(vocab)
    vocab.append(pad_token)
    vocab.append(unk_token)
    return vocab

def onehot_vectorize(title, title2int):
    tokens = tokenize(title)
    # title_vec = np.zeros(TITLE_MAX_LEN, dtype=np.float32)
    # title_vec.fill(title2int[pad_token])
    # for i, token in enumerate(tokens):
    #     if i >= TITLE_MAX_LEN:  
    #         break
    #     if token in title2int:
    #         title_vec[i] = title2int[token]
    #     else:
    #         title_vec[i] = title2int[unk_token]
    # return title_vec
    tokens = tokens[:TITLE_MAX_LEN]
    while len(tokens) < TITLE_MAX_LEN:
        tokens.append(pad_token)
    title_vec = np.zeros((TITLE_MAX_LEN,len(title2int)), dtype=np.float32)
    for i, token in enumerate(tokens):
        if token in title2int:
            title_vec[i][title2int[token]] = 1
        else:
            title_vec[i][title2int[unk_token]] = 1
    return title_vec

In [89]:
def multi_hot_genres(genres,  genres_dict):
    genres = genres.strip('][').replace("'", "").split(', ')
    multi_hot = np.zeros(len(genres_dict))
    for genre in genres:
        if genre in genres_dict:
                multi_hot[genres_dict[genre]] = 1
    return multi_hot

## Dataset

In [90]:
class titleDataset(Dataset):
    def __init__(self, df, title2int, genre2dict):
        self.df = df
        self.title2int = title2int
        self.genre2dict = genre2dict

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        title = self.df.iloc[idx]['title']
        title_vec = onehot_vectorize(title, self.title2int)
        genres = self.df.iloc[idx]['genre']
        genres_vec = multi_hot_genres(genres, self.genre2dict)
        return title_vec, genres_vec

In [92]:
class title_genres_dataset(pl.LightningDataModule):
    def __init__(self, batch_size=32, data_folder='../dataset/'):
        super().__init__()
        self.batch_size = batch_size
        self.data_folder = data_folder
        self.prepare_data()

    def prepare_data(self):
        # load data
        movies_train = pd.read_csv(self.data_folder + 'movies_train.csv', escapechar='\\', quoting=csv.QUOTE_NONE)
        movies_test = pd.read_csv(self.data_folder + 'movies_test.csv', escapechar='\\', quoting=csv.QUOTE_NONE)

        # title process
        vocab = create_vocab(movies_train.merge(movies_test), column='title')
        self.vocab_size = len(vocab)
        title2int = {word: i for i, word in enumerate(vocab)}

        # genres process
        genres_list = []
        with open(self.data_folder + 'genres.txt', 'r') as f:
            genres_list = [g.replace('\n','') for g in f.readlines()]
        genre2int = {genre: i for i, genre in enumerate(genres_list)} 

        # create dataset
        # split train and val
        movies_train, movies_val = train_test_split(movies_train, test_size=0.2, random_state=42)
        movies_train.reset_index(drop=True, inplace=True)
        movies_test.reset_index(drop=True, inplace=True)
        movies_val.reset_index(drop=True, inplace=True)
        train = titleDataset(movies_train, title2int, genre2int)
        val = titleDataset(movies_val, title2int, genre2int)
        test = titleDataset(movies_test, title2int, genre2int)
        self.movies_train_dataloader = DataLoader(train, batch_size=self.batch_size, shuffle=True)
        self.movies_val_dataloader = DataLoader(val, batch_size=self.batch_size, shuffle=False)
        self.movies_test_dataloader = DataLoader(test, batch_size=self.batch_size, shuffle=False)
        
        # movies_train = movies_train.values.tolist()
        # movies_val = movies_val.values.tolist()
        # movies_test = movies_test.values.tolist()
        # self.movies_train_dataloader = DataLoader(movies_train, batch_size=self.batch_size, shuffle=True)
        # self.movies_val_dataloader = DataLoader(movies_val, batch_size=self.batch_size, shuffle=False)
        # self.movies_test_dataloader = DataLoader(movies_test, batch_size=self.batch_size, shuffle=False)

    def train_dataloader(self):
        return self.movies_train_dataloader

    def val_dataloader(self):
        return self.movies_val_dataloader
    
    def test_dataloader(self):
        return self.movies_test_dataloader
    

# Model (RNN) define

In [87]:
class titleRNN(pl.LightningModule):
    def __init__(self, device="cpu", input_size=15, hidden_size=32, num_layers=2, batch_first=True, bidirectional = True):
        super(titleRNN,self).__init__()
        self.dev = device # device variable was taken, so using dev instead :(
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=batch_first, nonlinearity='relu', bidirectional=bidirectional)
        self.linear = nn.Linear(hidden_size, 18)
        self.softmax = nn.Softmax(dim=1)
        
    
    def forward(self,x):
        x,_ = self.rnn(x)
        x = self.linear(x)
        x = self.softmax(x)
        return x
    
    def configure_optimizers(self):
        return optim.Adam(self.parameters(),lr=0.001)
    
    def cross_entropy_loss(self,logits,labels):
        return F.cross_entropy(logits,labels)
    
    def training_step(self, train_batch, batch_idx):
        title, genres = train_batch
        print("-----------------------------------", title.shape, genres.shape)
        title_tensor = torch.tensor(title).to(self.dev)
        genre_tensor = torch.tensor(genres).to(self.dev)

        output = self.forward(title_tensor)
        loss = self.cross_entropy_loss(output, genre_tensor)
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        title, genres = val_batch
        title_tensor = torch.tensor(title).to(self.dev)
        genre_tensor = torch.tensor(genres).to(self.dev)

        output = self.forward(title_tensor)
        loss = self.cross_entropy_loss(output, genre_tensor)
        self.log('val_loss', loss)
    

# Training

In [95]:
NUM_EPOCHS = 20
BATCH_SIZE = 32
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

titleDATASET = title_genres_dataset(batch_size=BATCH_SIZE)
train_dataloader = titleDATASET.train_dataloader()
val_dataloader = titleDATASET.val_dataloader()

titleModel = titleRNN(device, hidden_size=titleDATASET.vocab_size)

trainer = pl.Trainer(max_epochs=NUM_EPOCHS, num_sanity_val_steps=0)
trainer.fit(titleModel, train_dataloader, val_dataloader)

ParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 5


In [None]:
test_dataloader = titleDATASET.test_dataloader()
trainer.test(test_dataloader)

TypeError: `model` must be a `LightningModule` or `torch._dynamo.OptimizedModule`, got `DataLoader`