<a href="https://colab.research.google.com/github/Lolinator3001/MovieWeb/blob/master/MovieWeb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import wandb
import io
import os
from PIL import Image
from torch.utils.data import Dataset
from sklearn.manifold import TSNE
import seaborn as sns
#import plotly.express as px
import asyncio
import aiohttp
from tqdm import trange
import plotly.express as px

import IPython

In [27]:
WANDB_ON = True
SWEEP_ON = True
COUNT = None

### IMDB Dataset

Get the movie titles, genres, ratings, years, and poster urls out of the IMDB website and saving it into a dataframe

In [28]:
class IMDB(Dataset):
    def __init__(self, URL: str = "https://www.imdb.com/search/title/?genres=action&start=0&explore=title_type,genres&ref_=adv_nxt", IMG_SIZE: tuple = (44,64)):
        self.URL = URL
        self.IMG_SIZE = IMG_SIZE
        self.transform = transforms.Compose([
            # transforms.RandomHorizontalFlip(p=0.5),
            transforms.ToTensor(),
            # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        # Parses through the IMDB website and finds the html elements that contain movie information  
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        movie_list = soup.find(class_='lister-list')
        movie_elems = movie_list.find_all('tr')

        # Runs through the elements of the HTML to find the movie information 
        movTitle = []
        movYear = []
        movRating = []
        movPoster = []
        movGenre = []
        movURL = []

        for i in trange(0, 10000, 50):
          response = requests.get(f"https://www.imdb.com/search/title/?start={i+1}&explore=title_type,genres&ref_=adv_nxt&title_type=Movie")
          soup = BeautifulSoup(response.content, 'html.parser')
          lister_list = soup.find(class_='lister-list')
          movies = lister_list.find_all('div', "lister-item mode-advanced")
          for movie in movies:
            try:
              title = movie.a.img['alt']
              genre = movie.find('span', 'genre').decode_contents().strip()
              rating = float(movie.strong.decode_contents())
              url = movie.a.img['loadlate']
              year = int(''.join([c for c in movie.find('span', "lister-item-year text-muted unbold").decode_contents() if c in list(map(str, range(10)))]))
              movURL.append(url)
              movRating.append(rating)
              movGenre.append(genre)
              movTitle.append(title)
              movYear.append(year)
            except:
              pass

        # Take the information and save it into a dataframe
        imdb = {'title':movTitle, 'year':movYear, 'genre': movGenre, 'rating':movRating, 'url': movURL}
        self.df = pd.DataFrame(imdb)
        self.df.to_csv('MovieData.csv')


    def __len__(self):
        return self.df.shape[0] # Gets the length of the dataframe


    # def __getitem__(self, index:int) -> np.ndarray:
    #     image = self.transform(self.df['poster'][index]) # Turns the image into a tensor image
    #     return image
    
    # def displayImage(self, index:int):
    #     image = self.df['Movie Poster'][index]
    #     plt.imshow(image) # Displays image

In [29]:
data = IMDB()

100%|██████████| 200/200 [03:37<00:00,  1.09s/it]


Login to W&B

In [33]:
if WANDB_ON:
    !wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Hyperparameter optimization configuration using W&B Sweeps

In [34]:
if WANDB_ON and SWEEP_ON:
    sweep_config = {
        "name": "MovieWeb",
        "method": "bayes",
        "metric": {
            "name": "test_loss",
            "goal": "minimize"
        },
        "parameters": {
            "learning_rate": {
                "distribution": "uniform",
                "min": 0.0001,
                "max": 0.01
            },
            "optimizer": {
                "values": ["Adam"]
            },
            "epochs":{
                "distribution": "normal",
                "mu": 1000,
                "sigma": 250,
            },
            "batch_size": {
                "distribution": "int_uniform",
                "min": 4,
                "max": 256
            },
            "random_seed": {
                "values": [42]
            },
            "validation_split": {
                "values": [.2]
            },
            "shuffle_dataset": {
                "values": [True]
            }
        }
    }

    sweep_id = wandb.sweep(sweep_config, project='MovieWeb', entity='cmerrill')

Create sweep with ID: xll7dsby
Sweep URL: https://wandb.ai/cmerrill/MovieWeb/sweeps/xll7dsby


# **Define the Convolutional Autoencoder**

In [35]:
class ConvAutoencoder(nn.Module):
    def __init__(self):
        super(ConvAutoencoder, self).__init__()
        self.embedding_size = None
        
        #Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1), 
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(16, 4, 3, padding=1), 
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
            
        )
       
        #Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(4, 16, 2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 3, 2, stride=2),
            nn.Sigmoid(),
        )

    def forward(self, x) -> torch.Tensor:
        x = self.encoder(x)
        if x.size() != self.embedding_size:
            self.embedding_size = x.size()
        x = self.decoder(x)
        return x

    def embedding(self, x) -> torch.Tensor:
        x = torch.flatten(self.encoder(x), start_dim=1).T
        return x

    def count_parameters(self) -> int:
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

## Get's the ratio between the input of the auto encoder and the embedding space.

In [36]:
def compression_ratio():
    temp_db = IMDB()
    x = temp_db.__getitem__(0)
    model = ConvAutoencoder()
    x = torch.reshape(x, (1,*x.shape))
    _ = model.forward(x)
    cr = np.prod(x.shape) / np.prod(model.embedding_size)
    return cr



print(f'Compression ratio: {compression_ratio()}:1')

  6%|▌         | 11/200 [00:10<02:56,  1.07it/s]


KeyboardInterrupt: ignored

In [37]:
hasGPU = torch.cuda.is_available

Trains the model on the dataset extracted from IMDB

In [None]:
def train():
    # get_device()
    with wandb.init(project='MovieWeb', entity='cmerrill') if WANDB_ON else None as run:
        config = {}
        config['learning_rate'] = 0.1
        config['batch_size'] = 4
        config['epochs'] = 2000
        config['validation_split'] = 0.2
        config['shuffle_dataset'] = True
        config['random_seed'] = 42
        config['optimizer'] = "SGD"
        
        if WANDB_ON:
            config.update(wandb.config)
        
        config['epochs'] = int(config['epochs'])
        print(config)
        
        # Creating data indices for training and validation splits:
        imdbDataset = IMDB()
        dataset_size = len(imdbDataset)
        indices = list(range(dataset_size))
        split = int(np.floor(config['validation_split'] * dataset_size))
        if config['shuffle_dataset'] :
            np.random.seed(config['random_seed'])
            np.random.shuffle(indices)
        train_indices, test_indices = indices[split:], indices[:split]

        # Creating PT data samplers and loaders:
        train_sampler = SubsetRandomSampler(train_indices)
        test_sampler = SubsetRandomSampler(test_indices)

        train_loader = torch.utils.data.DataLoader(
            imdbDataset, 
            batch_size=config['batch_size'], 
            sampler=train_sampler
        )

        test_loader = torch.utils.data.DataLoader(
            imdbDataset, 
            batch_size=config['batch_size'],
            sampler=test_sampler
        )

        #Instantiate the model
        model = ConvAutoencoder()
        if hasGPU():
          model.cuda()
        else:
          model.cpu()

        if WANDB_ON:
            wandb.watch(model)

        #Loss function
        criterion = nn.MSELoss()

        #Optimizer
        optimizer = getattr(torch.optim, config['optimizer'])(model.parameters(), lr = config['learning_rate'])

        for epoch in range(1, config['epochs']+1):
            # monitor training loss
            train_loss = 0.0
            
            # Training
            for images in train_loader:
              if hasGPU():
                images = images.cuda()
              else:
                images = images.cpu()
                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, images)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()*images.size(0)
                
            train_loss = train_loss/len(train_loader)
            
            if WANDB_ON:
                wandb.log({"loss": train_loss})

            #monitor testing loss
            test_loss = 0.0

            # Testing
            for images in test_loader:
              if hasGPU():
                images = images.cuda()
              else:
                images = images.cpu()
              outputs = model(images)
              loss = criterion(outputs, images)
              test_loss += loss.item()*images.size(0)
          
            test_loss = test_loss/len(test_loader)
            
            if WANDB_ON:
                wandb.log({"test_loss": test_loss})

            print('Epoch {} | Train Loss: {:.4f} |  Test Loss: {:.4f}'.format(epoch, train_loss, test_loss))

        #Batch of test images
        dataiter = iter(train_loader)
        images = dataiter.next()
        if hasGPU():
          images = images.cuda()
        else:
          images = images.cpu()

        #Sample outputs
        outputs = model(images)
        images = images.cpu().data.numpy()
        outputs = outputs.cpu().data.numpy()

        #Original Images
        f, axarr = plt.subplots(1,4,figsize=(12,12))
        for idx in np.arange(4):
            axarr[idx].imshow(np.moveaxis(images[idx], 0, -1))
            axarr[idx].tick_params(
                left=False, 
                right=False , 
                labelleft=False ,
                labelbottom=False, 
                bottom=False
            )
        if WANDB_ON:
            wandb.log({"true_images": plt})
        else:
            plt.show()

        #Reconstructed Images
        f, axarr = plt.subplots(1,4,figsize=(12,12))
        for idx in np.arange(4):
            axarr[idx].imshow(np.moveaxis(outputs[idx], 0, -1))
            axarr[idx].tick_params(
                left=False, 
                right=False , 
                labelleft=False ,
                labelbottom=False, 
                bottom=False
            )

        if WANDB_ON:
            wandb.log({"reconstructed_images": plt})
        else:
            plt.show()

        if WANDB_ON:
            # Save your model.
            torch.save(model.state_dict(), './saved_models/model.pth')
            # Save as artifact for version control.
            artifact = wandb.Artifact(f'model', type='model')
            artifact.add_file('./saved_models/model.pth')
            run.log_artifact(artifact, aliases=['latest', run.name])
            run.join()
            # Delete the file from the saved_models folder.
            os.remove('./saved_models/model.pth')
            run.finish()

Check if W&B and Sweeps are on; if so, run sweeps, otherwise run without it

In [None]:
if WANDB_ON:
    if SWEEP_ON:
        wandb.agent(sweep_id, function=train, count=COUNT)
    else:
        train()

Comparing the test_loss between each model and taking the lowest value as the BestModel as long as the test_loss isn't 0.

In [None]:
def getBestModel(sweep_id):
    api = wandb.Api()

    sweep = api.sweep(f"cmerrill/MovieWeb/{sweep_id}")
    runs = sorted(sweep.runs,
        key=lambda run: run.summary.get("test_loss", 0))
    runs = list(filter(lambda run: run.summary.get("test_loss") not in [0,None],runs))
    test_loss = runs[0].summary.get("name", 0)
    print(f"Best run {runs[0].name} with {test_loss} test loss")

    run = wandb.init()
    artifact = run.use_artifact(f'cmerrill/MovieWeb/model:{runs[0].name}', type='model')
    artifact_dir = artifact.download()
    model = ConvAutoencoder()
    checkpoint = torch.load(os.path.join(artifact_dir, 'model.pth'))
    model.load_state_dict(checkpoint)
    
    return model

In [None]:
model = getBestModel('wd6dnfqw')

Showing the user what the BestModel has produced in terms of reconstructed image; comparing the test image extracted from IMDB to the Reconstructed Image created by the model.

In [None]:
def validateBestModel(model):
    
    imdbDataset = IMDB()
    # Creating data indices for training and validation splits:
    dataset_size = len(imdbDataset)
    indices = list(range(dataset_size))
    split = int(np.floor(.2 * dataset_size))
    np.random.seed(42)
    np.random.shuffle(indices)
    test_indices = indices[:split]

    # Creating PyTorch data samplers and loaders:
    test_sampler = SubsetRandomSampler(test_indices)

    test_loader = torch.utils.data.DataLoader(
        imdbDataset, 
        batch_size=8,
        sampler=test_sampler
    )

    #Batch of test images
    dataiter = iter(test_loader)
    images = dataiter.next()
    # images = images.cuda()

    #Sample outputs
    outputs = model(images)
    images = images.cpu().data.numpy()
    outputs = outputs.cpu().data.numpy()

    #Original Images
    print("Test Images")
    f, axarr = plt.subplots(1,8,figsize=(12,12))
    for idx in np.arange(8):
        axarr[idx].imshow(np.moveaxis(images[idx], 0, -1))
        axarr[idx].tick_params(
            left=False, 
            right=False , 
            labelleft=False ,
            labelbottom=False, 
            bottom=False
        )
    plt.show()

    #Reconstructed Images
    print('Reconstructed Images')
    f, axarr = plt.subplots(1,8,figsize=(12,12))
    for idx in np.arange(8):
        axarr[idx].imshow(np.moveaxis(outputs[idx], 0, -1))
        axarr[idx].tick_params(
            left=False, 
            right=False , 
            labelleft=False ,
            labelbottom=False, 
            bottom=False
        )
    plt.show()

In [None]:
validateBestModel(model)

Run all of the data through the model to get the embedding for each poster

Taking the embeddings and turning them into 3D points so we can comprehend it and graph it

In [None]:
imdbDataset = IMDB()

data_loader = torch.utils.data.DataLoader(
    imdbDataset, 
    batch_size=len(imdbDataset),
)

#Batch of test images
dataiter = iter(data_loader)
outputs = model.embedding(next(dataiter))
outputs = outputs.T.cpu().data.numpy()

X_embedded = TSNE(n_components=3, learning_rate='auto', init='random').fit_transform(outputs)

Graph the 3D points mentioned above so we can see the clusters

In [None]:
x, y, z = zip(*X_embedded)
fig = px.scatter_3d(x=x, y=y, z=z)
fig.show()