<a href="https://colab.research.google.com/github/JadeBenson/Wikipedia_DeepLearning/blob/main/multimodal_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
drive.mount('/content/drive')
import pickle

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

raw_data_path = '/content/drive/MyDrive/deep-learning/movies_and_posters.csv'
character_data_path = '/content/drive/MyDrive/deep-learning/MovieSummaries/character.metadata.tsv'
destination_folder = '/content/drive/My Drive/multimodal/'

wiki = pd.read_csv(raw_data_path)
wiki = wiki[~wiki['Genre'].isna()]
chars = pd.read_csv(character_data_path,
                    sep='\t',
                    names=['wiki_movie_id', 'fb_movie_id', 'release_date', 'char_name', 'actor_dob', 'actor_gender', 'actor_height', 'actor_ethnicity', 'actor_name', 'actor_age', 'fb_map_id', 'fb_char_id', 'fb_actor_id'])


wiki['Genre'] = wiki['Genre'].apply(lambda x: x.split('|'))
genres = wiki['Genre'].sum()
genres = Counter(genres)
genres = [k for k, v in genres.items() if v > 900]
genres = dict(zip(genres, range(len(genres))))

one_hot_genres = []
for genre_list in wiki['Genre']: 
    one_hot_genre = np.zeros(len(genres))
    for genre in genre_list: 
        try:
            one_hot_genre[genres[genre]] = 1
        except KeyError: 
            continue
    one_hot_genres.append(one_hot_genre)

labels = np.array(one_hot_genres)
genres_i = [k for k, v in sorted(genres.items(), key=lambda x: x[1])]
wiki['label'] = list(labels)

mask = ((chars['actor_gender'].isna()) | (chars['fb_actor_id'].isna()) | (chars['actor_age'].isna()))
chars = chars[~mask]

actor_count = chars.groupby('wiki_movie_id')[['fb_actor_id']].count()
chars['male_ratio'] = chars['actor_gender'] == 'M'
male_ratio = chars.groupby('wiki_movie_id')[['male_ratio']].mean()
actor_age = chars.groupby('wiki_movie_id')[['actor_age']].mean()

actors = pd.concat([actor_count, male_ratio, actor_age], axis=1)
wiki = wiki.merge(actors, left_on='wiki_ID', right_index=True, how='inner')

wiki['tabular_input'] = list(wiki[['year','IMDB Score', 'fb_actor_id', 'actor_age', 'male_ratio']].values.round(2))
wiki = wiki[['plot', 'label', 'tabular_input']]

In [3]:
import spacy
import re
import string

#tokenization
tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

#count number of occurences of each word
counts = Counter()
for index, row in wiki.iterrows():
    counts.update(tokenize(row['plot']))

#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 10:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 78780
num_words after: 24138


In [4]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [5]:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

wiki['encoded'] = wiki['plot'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
wiki.head()

Unnamed: 0,plot,label,tabular_input,encoded
0,"Set in the second half of the 22nd century, th...","[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2001.0, 4.9, 13.0, 43.46, 0.54]","[[2, 3, 4, 5, 6, 7, 4, 8, 9, 10, 11, 4, 12, 13..."
1,A series of murders of rich young women throug...,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1987.0, 6.4, 2.0, 29.0, 0.5]","[[16, 206, 7, 207, 7, 208, 209, 37, 210, 211, ..."
2,"Adam, a San Francisco-based artist who works a...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[1997.0, 6.1, 6.0, 35.17, 0.33]","[[449, 11, 16, 450, 451, 443, 452, 149, 453, 1..."
3,{{Plot|dateAct 1Act 2Act 3Act 4Act 5 Finally n...,"[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[1989.0, 7.7, 20.0, 45.9, 0.85]","[[8, 523, 1, 11, 524, 11, 524, 11, 524, 11, 52..."
4,The film opens with Mary Poppins perched in a...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[1964.0, 7.8, 10.0, 37.6, 0.4]","[[4, 12, 426, 36, 550, 551, 11, 552, 3, 16, 55..."


## PyTorch Dataset

In [6]:
posters = np.load('/content/drive/MyDrive/deep-learning/formatted_posters.npy')[wiki.index]

X = list(wiki['encoded'])
y = list(wiki['label'])

tabular = list(wiki['tabular_input'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)
posters_train, posters_valid = train_test_split(posters, test_size=0.2, random_state=1)
tabular_train, tabular_valid = train_test_split(tabular, test_size=0.2, random_state=1)

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class MovieDataset(Dataset):
    def __init__(self, X, Y, posters, tabular):
        self.X = X
        self.posters = posters
        self.y = Y
        self.tabular = tabular
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1], self.posters[idx], self.tabular[idx]

train_ds = MovieDataset(X_train, y_train, posters_train, tabular_train)
valid_ds = MovieDataset(X_valid, y_valid, posters_valid, tabular_valid)

batch_size = 32
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [8]:
import torch.nn.functional as F
import random

criterion = nn.BCELoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, epochs=10, lr=0.001):

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l, p, t in train_dl:
            x = x.long().to(device)
            y = y.float().to(device)
            p = p.float().to(device)
            t = t.float().to(device)

            y_pred = model(x, l, p, t)
            optimizer.zero_grad()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss = validation_metrics(model, val_dl)
        print(f"Epoch {i}: train loss {round(sum_loss/total, 4)}, val loss {round(val_loss, 4)}")

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l, p, t in valid_dl:
        x = x.long().to(device)
        y = y.float().to(device)
        p = p.float().to(device)
        t = t.float().to(device)

        with torch.no_grad():
            y_hat = model(x, l, p, t)
        loss = criterion(y_hat, y)
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total

In [9]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class MultiModalNet(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=2)
        self.fc0 = nn.Linear(hidden_dim, 16)

        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(43008, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 16)
        
        self.fc4 = nn.Linear(37, len(genres_i))
        self.sigmoid = nn.Sigmoid()

        
    def forward(self, x, s, p, t):
        x = self.embeddings(x)
        x = self.dropout(x)
        x = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x)
        x = self.fc0(ht[-1])

        p = p.view(-1, 3, 268, 182)
        p = self.pool(F.relu(self.conv1(p)))
        p = self.pool(F.relu(self.conv2(p)))
        p = torch.flatten(p, 1)
        p = F.relu(self.fc1(p))
        p = F.relu(self.fc2(p))
        p = F.relu(self.fc3(p))

        out = torch.cat([x, p, self.sigmoid(t)], 1)
        out = self.sigmoid(self.fc4(out))

        return out

In [10]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

model =  MultiModalNet(vocab_size, 64, 32).to(device)

In [11]:
train_model(model, epochs=50, lr=0.005)

Epoch 0: train loss 1.8355, val loss 0.4447
Epoch 1: train loss 0.4354, val loss 0.4345
Epoch 2: train loss 0.4146, val loss 0.4235
Epoch 3: train loss 0.3996, val loss 0.4237
Epoch 4: train loss 0.3875, val loss 0.4254
Epoch 5: train loss 0.3772, val loss 0.4375
Epoch 6: train loss 0.367, val loss 0.4456
Epoch 7: train loss 0.3581, val loss 0.4541
Epoch 8: train loss 0.3449, val loss 0.4463
Epoch 9: train loss 0.3314, val loss 0.4591
Epoch 10: train loss 0.3197, val loss 0.4638
Epoch 11: train loss 0.3091, val loss 0.4509
Epoch 12: train loss 0.2982, val loss 0.4785
Epoch 13: train loss 0.2894, val loss 0.4835
Epoch 14: train loss 0.2804, val loss 0.4879
Epoch 15: train loss 0.2702, val loss 0.4942
Epoch 16: train loss 0.2617, val loss 0.5067
Epoch 17: train loss 0.2532, val loss 0.4976
Epoch 18: train loss 0.2434, val loss 0.5346
Epoch 19: train loss 0.2363, val loss 0.5207
Epoch 20: train loss 0.2298, val loss 0.539
Epoch 21: train loss 0.2239, val loss 0.5449
Epoch 22: train loss 0

In [12]:
start = True

for x, y, l, p, t in val_dl:
    x = x.long().to(device)
    y = y.float().to(device)
    p = p.float().to(device)
    t = t.float().to(device)
    with torch.no_grad():
        y_hat = model(x, l, p, t)
    logits = y_hat.detach().cpu().numpy()
    label_ids = y.to('cpu').numpy()
    if start: 
        probs = logits 
        test_labels = label_ids
        start = False
    else:
        probs = np.concatenate((probs, logits))
        test_labels = np.concatenate((test_labels, label_ids))
    

In [14]:
from sklearn import metrics
# precision, recall, accuracy, f1, roc_auc
genres_i = [k for k, v in sorted(genres.items(), key=lambda x: x[1])]

metric_dict = {
    'f1_score': [],
    'precision': [],
    'recall': [],
    'accuracy': [],
    'roc_auc': []
}
for i, genre in enumerate(genres_i):
    metric_dict['f1_score'].append(metrics.f1_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['precision'].append(metrics.precision_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['recall'].append(metrics.recall_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['accuracy'].append(metrics.accuracy_score(test_labels[:, i], probs[:, i] > .5))
    metric_dict['roc_auc'].append(metrics.roc_auc_score(test_labels[:, i], probs[:, i] > .5))

metric_dict['genres'] = genres_i
metric_df = pd.DataFrame(metric_dict)
metric_df.sort_values('roc_auc', ascending=False, inplace=True)
metric_df

Unnamed: 0,f1_score,precision,recall,accuracy,roc_auc,genres
0,0.42454,0.442455,0.408019,0.816078,0.652739,Action
7,0.368664,0.377358,0.36036,0.838824,0.635525,Adventure
3,0.537237,0.510244,0.567245,0.646667,0.629446,Comedy
8,0.384236,0.404145,0.366197,0.803922,0.628955,Crime
1,0.360396,0.4375,0.306397,0.873333,0.627233,Horror
5,0.608209,0.622613,0.594457,0.588235,0.587729,Drama
4,0.326257,0.358722,0.29918,0.763529,0.586302,Romance
9,0.186235,0.377049,0.123656,0.921176,0.553791,Mystery
2,0.223228,0.262411,0.194226,0.798039,0.549164,Thriller
6,0.14433,0.21875,0.107692,0.902353,0.537923,Family
