In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# MovieLens Data Enrichment

In this notebook, we will enrich the MovieLens 25M dataset with posters and movie synopses scraped from IMDb.

First, we need to install an extra package (IMDbPY) for IMDb data collection.

In [None]:
%%bash
# Download the IMDbPY sources and install them into the current Python environment.
# Stop at the first failing step so a broken download is never unpacked or installed.
set -euo pipefail
wget -O master.zip https://codeload.github.com/alberanid/imdbpy/zip/master
# -o: overwrite a previously extracted tree so a re-run installs the fresh download.
unzip -o master.zip
cd imdbpy-master && python setup.py install

Note: restart the kernel for the new package to take effect.


In [2]:
import IPython

# Ask the running kernel to shut down with restart=True so that the package
# installed by the previous cell becomes importable.
kernel_app = IPython.Application.instance()
kernel_app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

## Scraping data from IMDB

The IMDb API allows the collection of a rich set of multi-modal metadata from the IMDb database, including links to posters, synopses and plots.

In [15]:
from imdb import IMDb

# Handle through which all IMDb queries in this notebook are issued.
ia = IMDb()

# Fetch a single title by its IMDb ID and list who directed it.
# NOTE(review): despite the variable name, ID 0114709 is the imdbId of movieId 1
# in links.csv, and the output below appears to be Toy Story, not The Matrix.
the_matrix = ia.get_movie('0114709')
for director in the_matrix['directors']:
    print(director['name'])

# Keys of the metadata that is currently available for this title.
available_keys = sorted(the_matrix.keys())
print(available_keys)

# Names of all info sets that can be fetched for a movie.
movie_infosets = ia.get_movie_infoset()
print(movie_infosets)

John Lasseter
['akas', 'animation department', 'art department', 'art directors', 'aspect ratio', 'box office', 'camera department', 'canonical title', 'cast', 'casting department', 'casting directors', 'certificates', 'color info', 'composers', 'countries', 'country codes', 'cover url', 'director', 'directors', 'distributors', 'editorial department', 'editors', 'full-size cover url', 'genres', 'imdbID', 'kind', 'language codes', 'languages', 'localized title', 'long imdb canonical title', 'long imdb title', 'miscellaneous', 'music department', 'original air date', 'other companies', 'plot', 'plot outline', 'producers', 'production companies', 'production managers', 'rating', 'runtimes', 'smart canonical title', 'smart long imdb canonical title', 'sound department', 'sound mix', 'synopsis', 'thanks', 'title', 'top 250 rank', 'visual effects', 'votes', 'writer', 'writers', 'year']
['airing', 'akas', 'alternate versions', 'awards', 'connections', 'crazy credits', 'critic reviews', 'episo

In [3]:
# Print the value stored under 'plot': a list of plot summaries, each one
# ending with '::' followed by its author (see the output below).
print(the_matrix.get('plot'))

["A cowboy doll is profoundly threatened and jealous when a new spaceman figure supplants him as top toy in a boy's room.::Kenneth Chisholm", 'A little boy named Andy loves to be in his room, playing with his toys, especially his doll named "Woody". But, what do the toys do when Andy is not with them, they come to life. Woody believes that his life (as a toy) is good. However, he must worry about Andy\'s family moving, and what Woody does not know is about Andy\'s birthday party. Woody does not realize that Andy\'s mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy\'s new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without them, but they will have to pass through a ruthless toy killer, Sid Phillips.::John Wiggins', "Woody, a good-hearted cowboy doll who belongs to a young boy named A

In [4]:
# Display the long-form synopsis; the value is a list of strings.
the_matrix.get('synopsis')

['A boy called Andy Davis (voice: John Morris) uses his toys to act out a bank robbery. The bank is a cardboard box, the robber is Mr. Potato Head (voice: Don Rickles) assisted by Slinky Dog (voice: Jim Varney), and the bystanders include Bo Peep (voice: Annie Potts) and her sheep. The day is saved by cowboy doll Woody (voice: Tom Hanks) playing the sheriff, with help from Rex the dinosaur (voice: Wallace Shawn). Woody is the only toy who gets to say his own lines because he has a pull-string that makes him say things like "Reach for the sky!" and "You\'re my favorite deputy!"During the opening credits (soundtrack: Randy Newman\'s "You\'ve Got a Friend in Me"), Andy takes Woody downstairs to find his mother (voice: Laurie Metcalf) decorating the dining room for his birthday party. He asks if they can leave the decorations up until they move, and his mom agrees. She says the guests will arrive soon and sends him back upstairs to get his baby sister Molly (voice: Hannah Unkrich), whose c

## Collect synopsis for all movies

Next, we will collect metadata, including the synopsis, for all movies in the dataset. Note that this process will take a while to complete.

In [5]:
from collections import defaultdict
import pandas as pd

In [6]:
# Load the MovieLens link table, which maps each movieId to its imdbId and tmdbId.
links = pd.read_csv("./data/ml-25m/links.csv")

In [7]:
# Preview the first rows to check the column layout.
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
# Count the distinct IMDb IDs in the link table.
links.imdbId.nunique()

62423

In [9]:
from tqdm import tqdm
import pickle

In [10]:
# Metadata fetched so far, keyed by MovieLens movieId.
# Re-running this cell empties the cache and discards in-memory progress.
movies_infos = dict()

In [None]:
# Fetch IMDb metadata for every movie in `links`. Movies already present in
# `movies_infos` are skipped, so the cell can be re-run to retry failed downloads.
CHECKPOINT_EVERY = 500  # number of rows between two intermediate saves

# Iterate over the two ID columns only: iterrows() on the full frame yields
# float rows (tmdbId is a float column), which would turn the IDs into floats.
id_pairs = links[['movieId', 'imdbId']].itertuples(index=False, name=None)

for cnt, (movieID, imdbID) in tqdm(enumerate(id_pairs), total=len(links)):
    movieID, imdbID = int(movieID), int(imdbID)

    if movieID in movies_infos:
        continue

    try:
        movies_infos[movieID] = ia.get_movie(imdbID)
    except Exception as e:
        # Best effort: report the failure and carry on with the next movie.
        print("Movie %d download error"%movieID, e)

    if (cnt+1) % CHECKPOINT_EVERY == 0:
        # cnt is zero-based, so cnt+1 is the number of rows processed so far.
        print("Saving to file, %d/%d"%(cnt+1,len(links)))
        with open('movies_info.pkl', 'wb') as f:
            pickle.dump({"movies_infos": movies_infos}, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Final save: write everything collected so far, wrapped in a dict under
# the "movies_infos" key.
checkpoint = {"movies_infos": movies_infos}
with open('movies_info.pkl', 'wb') as out_file:
    pickle.dump(checkpoint, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Number of movies whose metadata was fetched successfully.
len(movies_infos)

## Scraping movie posters

The movie metadata also contains links to poster images. Next, we collect these posters where available.

Note: this process will take some time to complete.

In [12]:
import pickle

# Reload the scraped metadata written to movies_info.pkl by the collection step.
# NOTE: pickle.load can execute arbitrary code; only open a file this notebook produced.
with open('movies_info.pkl', 'rb') as in_file:
    saved = pickle.load(in_file)
movies_infos = saved['movies_infos']

In [17]:
import subprocess
from tqdm import tqdm
import os

poster_small = {}
poster_large = {}

COLLECT_LARGE_POSTER = False


def download_poster(url, target_path):
    """Download `url` to `target_path` with wget unless the file already exists.

    Args:
        url: Address of the poster image.
        target_path: Destination file; its parent directory is created if missing.

    Returns:
        True if the file was downloaded by this call, False if it already
        existed or the download failed.

    Raises:
        FileNotFoundError: if the `wget` executable is not installed.
    """
    if os.path.exists(target_path):
        return False
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    # Pass an argument list rather than a shell string: the URL comes from
    # scraped data and must not be interpreted by a shell.
    cmd = ['wget', url, '-O', target_path]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, universal_newlines=True)
    print(result.stdout, ' '.join(cmd))
    if result.returncode != 0:
        # wget -O creates the output file even when the transfer fails; remove it
        # so the exists() check above lets a later run retry this poster.
        if os.path.exists(target_path):
            os.remove(target_path)
        return False
    return True


for key, movie in tqdm(movies_infos.items(), total=len(movies_infos)):
    # Membership is tested on .keys(), as before.
    # NOTE(review): `in movie` may not mean key membership for these objects -- confirm before simplifying.
    available = movie.keys()

    if 'cover url' in available:
        download_poster(movie['cover url'], './poster_small/%s.jpg'%(movie['imdbID']))

    # Optionally, collect high-res poster images. This step runs even when the
    # small poster already exists, and the path carries no embedded quotes, so
    # the "already downloaded" check works.
    if COLLECT_LARGE_POSTER and 'full-size cover url' in available:
        download_poster(movie['full-size cover url'], './poster_large/%s.jpg'%(movie['imdbID']))
        

100%|██████████| 62423/62423 [01:24<00:00, 739.65it/s] 


In [4]:
!ls -l poster_small|wc -l

61952
