<a href="https://colab.research.google.com/github/JadeBenson/Wikipedia_DeepLearning/blob/main/multimodal_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
drive.mount('/content/drive')
import pickle

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

raw_data_path = '/content/drive/MyDrive/deep-learning/movies_and_posters.csv'
destination_folder = '/content/drive/My Drive/multimodal/'

wiki = pd.read_csv(raw_data_path)
wiki = wiki[~wiki['Genre'].isna()]

wiki['Genre'] = wiki['Genre'].apply(lambda x: x.split('|'))
genres = wiki['Genre'].sum()
genres = Counter(genres)
genres = [k for k, v in genres.items() if v > 900]
genres = dict(zip(genres, range(len(genres))))

one_hot_genres = []
for genre_list in wiki['Genre']: 
    one_hot_genre = np.zeros(len(genres))
    for genre in genre_list: 
        try:
            one_hot_genre[genres[genre]] = 1
        except KeyError: 
            continue
    one_hot_genres.append(one_hot_genre)

labels = np.array(one_hot_genres)
genres_i = [k for k, v in sorted(genres.items(), key=lambda x: x[1])]
wiki['label'] = list(labels)
wiki = wiki[['plot', 'label']]

In [3]:
import spacy
import re
import string

#tokenization
tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

#count number of occurences of each word
counts = Counter()
for index, row in wiki.iterrows():
    counts.update(tokenize(row['plot']))

#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 81340
num_words after: 56459


In [4]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [5]:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

wiki['encoded'] = wiki['plot'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
wiki.head()

Unnamed: 0,plot,label,encoded
0,"Set in the second half of the 22nd century, th...","[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[2, 3, 4, 5, 6, 7, 4, 8, 9, 10, 11, 4, 12, 13..."
1,A series of murders of rich young women throug...,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[16, 210, 7, 211, 7, 212, 213, 38, 214, 215, ..."
2,"Adam, a San Francisco-based artist who works a...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[[460, 11, 16, 461, 462, 454, 463, 152, 464, 1..."
3,{{Plot|dateAct 1Act 2Act 3Act 4Act 5 Finally n...,"[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[[8, 534, 1, 11, 535, 11, 535, 11, 535, 11, 53..."
4,The film opens with Mary Poppins perched in a...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[[4, 12, 436, 37, 561, 562, 11, 563, 3, 16, 56..."


## PyTorch Dataset

In [21]:
posters = np.load('/content/drive/MyDrive/deep-learning/formatted_posters.npy')[wiki.index]

X = list(wiki['encoded'])
y = list(wiki['label'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)
posters_train, posters_valid = train_test_split(posters, test_size=0.2, random_state=1)

In [24]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class MovieDataset(Dataset):
    def __init__(self, X, Y, posters):
        self.X = X
        self.posters = posters
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1], self.posters[idx]

train_ds = MovieDataset(X_train, y_train, posters_train)
valid_ds = MovieDataset(X_valid, y_valid, posters_valid)

batch_size = 32
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [92]:
import torch.nn.functional as F
import random

criterion = nn.BCELoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, epochs=10, lr=0.001):
    
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l, p in train_dl:
            x = x.long().to(device)
            y = y.float().to(device)
            p = p.float().to(device)
            y_pred = model(x, l, p)
            optimizer.zero_grad()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss = validation_metrics(model, val_dl)
        print(f"Epoch {i}: train loss {round(sum_loss/total, 4)}, val loss {round(val_loss, 4)}")

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l, p in valid_dl:
        x = x.long().to(device)
        y = y.float().to(device)
        p = p.float().to(device)
        with torch.no_grad():
            y_hat = model(x, l, p)
        loss = criterion(y_hat, y)
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total

In [114]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTM(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=3)
        self.fc0 = nn.Linear(hidden_dim, 16)

        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(43008, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 16)
        
        self.fc4 = nn.Linear(32, 16)
        self.fc5 = nn.Linear(16, len(genres_i))
        self.sigmoid = nn.Sigmoid()

        
    def forward(self, x, s, p):
        x = self.embeddings(x)
        x = self.dropout(x)
        x = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x)
        x = self.fc0(ht[-1])

        p = p.view(-1, 3, 268, 182)
        p = self.pool(F.relu(self.conv1(p)))
        p = self.pool(F.relu(self.conv2(p)))
        p = torch.flatten(p, 1) # flatten all dimensions except batch
        p = F.relu(self.fc1(p))
        p = F.relu(self.fc2(p))
        p = F.relu(self.fc3(p))

        out = torch.cat([x, p], 1)
        out = F.relu(self.fc4(out))
        out = F.relu(self.fc5(out))
        out = self.sigmoid(out)

        return out

In [110]:
model =  LSTM(vocab_size, 64, 32).to(device)

In [111]:
torch.cat([torch.tensor([[1,2,3], [9,9,9]]), 
           torch.tensor([[1,2,3], [9,9,9]])], 1)

tensor([[1, 2, 3, 1, 2, 3],
        [9, 9, 9, 9, 9, 9]])

In [113]:
train_model(model, epochs=50, lr=0.01)

Epoch 0: train loss 0.6931, val loss 0.6931
Epoch 1: train loss 0.6931, val loss 0.6931
Epoch 2: train loss 0.6931, val loss 0.6931


KeyboardInterrupt: ignored

In [98]:
start = True

for x, y, l, p in val_dl:
    x = x.long().to(device)
    y = y.float().to(device)
    p = p.float().to(device)
    with torch.no_grad():
        y_hat = model(x, l, p)
    logits = y_hat.detach().cpu().numpy()
    label_ids = y.to('cpu').numpy()
    if start: 
        probs = logits 
        test_labels = label_ids
        start = False
    else:
        probs = np.concatenate((probs, logits))
        test_labels = np.concatenate((test_labels, label_ids))
    

In [13]:
from sklearn import metrics
# precision, recall, accuracy, f1, roc_auc
genres_i = [k for k, v in sorted(genres.items(), key=lambda x: x[1])]

metric_dict = {
    'f1_score': [],
    'precision': [],
    'recall': [],
    'accuracy': [],
    'roc_auc': []
}
for i, genre in enumerate(genres_i):
    metric_dict['f1_score'].append(metrics.f1_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['precision'].append(metrics.precision_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['recall'].append(metrics.recall_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['accuracy'].append(metrics.accuracy_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['roc_auc'].append(metrics.roc_auc_score(test_labels[:, i], probs[:, i] > .5))

metric_dict['genres'] = genres_i
metric_df = pd.DataFrame(metric_dict)
metric_df.sort_values('roc_auc', ascending=False, inplace=True)
metric_df

Unnamed: 0,f1_score,precision,recall,accuracy,roc_auc,genres
0,0.422744,0.472376,0.38255,0.829313,0.649554,Action
3,0.518863,0.524556,0.513292,0.659722,0.627238,Comedy
8,0.383961,0.463343,0.327801,0.814693,0.623306,Crime
1,0.337539,0.371528,0.309249,0.846491,0.616758,Horror
5,0.633759,0.618182,0.650142,0.612208,0.610947,Drama
7,0.276243,0.380711,0.216763,0.85636,0.582858,Adventure
4,0.293888,0.405018,0.230612,0.801535,0.578352,Romance
6,0.19337,0.207101,0.181347,0.893275,0.564327,Family
9,0.200542,0.256944,0.164444,0.892178,0.560916,Mystery
2,0.208024,0.244755,0.180879,0.80519,0.544462,Thriller


In [99]:
from sklearn import metrics
# precision, recall, accuracy, f1, roc_auc
genres_i = [k for k, v in sorted(genres.items(), key=lambda x: x[1])]

metric_dict = {
    'f1_score': [],
    'precision': [],
    'recall': [],
    'accuracy': [],
    'roc_auc': []
}
for i, genre in enumerate(genres_i):
    metric_dict['f1_score'].append(metrics.f1_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['precision'].append(metrics.precision_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['recall'].append(metrics.recall_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['accuracy'].append(metrics.accuracy_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['roc_auc'].append(metrics.roc_auc_score(test_labels[:, i], probs[:, i] > .5))

metric_dict['genres'] = genres_i
metric_df = pd.DataFrame(metric_dict)
metric_df.sort_values('roc_auc', ascending=False, inplace=True)
metric_df

Unnamed: 0,f1_score,precision,recall,accuracy,roc_auc,genres
0,0.0,0.0,0.0,0.842836,0.5,Action
1,0.0,0.0,0.0,0.880482,0.5,Horror
2,0.0,0.0,0.0,0.87098,0.5,Thriller
3,0.0,0.0,0.0,0.647661,0.5,Comedy
4,0.0,0.0,0.0,0.803728,0.5,Romance
5,0.0,0.0,0.0,0.460526,0.5,Drama
6,0.0,0.0,0.0,0.932018,0.5,Family
7,0.0,0.0,0.0,0.877193,0.5,Adventure
8,0.0,0.0,0.0,0.824927,0.5,Crime
9,0.0,0.0,0.0,0.921784,0.5,Mystery


In [100]:
probs

array([[0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5],
       ...,
       [0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5]], dtype=float32)