In [22]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import imdb
from tqdm import tqdm
tqdm.pandas()
from sentence_transformers import SentenceTransformer, util
import numpy as np
import re

import pickle
# Small wrappers around pickle for persisting objects to disk
def save_pickle(model, filename):
    """Serialize ``model`` into the file ``filename`` using ``pickle.dump``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. Returns ``None``.
    """
    with open(filename, 'wb') as out_file:
        pickle.dump(model, out_file)
def load_pickle(filename):
    """Return the object stored in the pickle file ``filename``.

    Unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    with open(filename, 'rb') as in_file:
        restored = pickle.load(in_file)
    return restored

In this notebook, we primarily focus on extracting new movie features using an IMDb instance.

### Instantiate sentence transformer model, IMDb instance, and read movie data

In [3]:
# IMDb client used to search titles and fetch movie records
ia = imdb.IMDb()
# Sentence transformer used to encode plot summaries
model = SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
# Read movie data: u.item is pipe-separated and has no header row,
# so the column names are supplied explicitly.
movie_columns = [
    "movieId", "title", "release_date", 'video_release_date', 'IMDbURL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's",
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]
movie = pd.read_csv("u.item", sep='|', header=None, encoding='latin-1', names=movie_columns)

In [10]:
# Get a brief overview of the movie data: column names, non-null counts, and dtypes
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             1682 non-null   int64  
 1   title               1682 non-null   object 
 2   release_date        1681 non-null   object 
 3   video_release_date  0 non-null      float64
 4   IMDbURL             1679 non-null   object 
 5   unknown             1682 non-null   int64  
 6   Action              1682 non-null   int64  
 7   Adventure           1682 non-null   int64  
 8   Animation           1682 non-null   int64  
 9   Children's          1682 non-null   int64  
 10  Comedy              1682 non-null   int64  
 11  Crime               1682 non-null   int64  
 12  Documentary         1682 non-null   int64  
 13  Drama               1682 non-null   int64  
 14  Fantasy             1682 non-null   int64  
 15  Film-Noir           1682 non-null   int64  
 16  Horror

We can use the IMDb instance to search for each movie title and retrieve valuable information about the movie that is not in the dataset.

In [11]:
def parseMovieTitle(movieTitle):
    """Look up ``movieTitle`` on IMDb and return the extracted features.

    The first hit of ``ia.search_movie`` is taken as the match and its full
    record is fetched with ``ia.get_movie``.

    Returns a five-element ``pd.Series``:
        0. names of the first two cast members (list)
        1. name of the first listed director
        2. IMDb rating
        3. first plot summary
        4. embedding of that plot summary produced by ``model.encode``

    If the lookup fails for any reason (no search hit, a missing field,
    an IMDb error, ...) a Series of five ``None`` values is returned instead.
    """
    # Using try-except block here since error will occur for some movies
    # because search_movie does not return a good result
    try:
        # Search movie title to retrieve its IMDb id
        imdb_id = ia.search_movie(movieTitle)[0].getID()
        # Use IMDb id to retrieve all its data
        movie_data = ia.get_movie(imdb_id)
        plot_summary = movie_data['plot'][0]

        return pd.Series([
            [person['name'] for person in movie_data['cast'][:2]],  # Top2 cast
            movie_data['director'][0]['name'],                      # Director
            movie_data['rating'],                                   # IMDb rating
            plot_summary,                                           # Plot summary
            model.encode(plot_summary),                             # Plot summary embedding
        ])
    except Exception:
        # Deliberately best-effort, but only for ordinary errors: a bare
        # `except:` would also swallow KeyboardInterrupt/SystemExit, making
        # the long-running apply impossible to interrupt.
        return pd.Series([None, None, None, None, None])

In [12]:
# Look up every title on IMDb and store the results as five new columns of movie
imdb_features = movie['title'].progress_apply(parseMovieTitle)
movie[['top2_cast','director','rating','plot summary','plot embedding']] = imdb_features

 25%|██▌       | 428/1682 [22:23<1:08:20,  3.27s/it]2022-12-18 16:40:07,970 CRITICAL [imdbpy] D:\apps\anaconda\lib\site-packages\imdb\_exceptions.py:32: IMDbParserError exception raised; args: ('invalid title: ""',); kwds: {}
NoneType: None
 48%|████▊     | 806/1682 [41:07<44:02,  3.02s/it]  2022-12-18 16:58:51,881 CRITICAL [imdbpy] D:\apps\anaconda\lib\site-packages\imdb\_exceptions.py:32: IMDbParserError exception raised; args: ('invalid title: ""',); kwds: {}
NoneType: None
100%|██████████| 1682/1682 [1:21:14<00:00,  2.90s/it]


In [14]:
# Extract movie entries that encountered an error (no IMDb rating was retrieved).
# .copy() makes movie_na an independent frame, so the column assignment done on
# it afterwards does not write into a slice of movie (SettingWithCopyWarning).
movie_na = movie[movie.rating.isna()].copy()

In [16]:
# Same approach as before, but the IMDb search uses only the part of the title
# that starts at the first opening parenthesis
def parseMovieTitleRegex(movieTitle):
    """Retry the IMDb lookup using the parenthesised part of ``movieTitle``.

    The search query is the text from the first ``(`` to the end of the
    title, e.g. ``"Name (Other name) (1995)"`` -> ``"(Other name) (1995)"``.
    The first hit of ``ia.search_movie`` is fetched with ``ia.get_movie``.

    Returns a five-element ``pd.Series``: top-2 cast names, first director's
    name, IMDb rating, first plot summary, and the ``model.encode`` embedding
    of that summary. If the title contains no ``(`` or the lookup fails, a
    Series of five ``None`` values is returned.

    NOTE(review): the previous comment said the parenthesised info is
    *discarded*, but the code has always searched *with* it; the query is
    left unchanged here -- confirm which behaviour is intended.
    """
    try:
        # Raw string: "\(" in a normal string literal is an invalid escape sequence
        parenthesised = re.search(r"\((.*)", movieTitle)
        if parenthesised is None:
            # No "(" in the title, so there is nothing to retry with
            return pd.Series([None, None, None, None, None])

        imdb_id = ia.search_movie(parenthesised[0])[0].getID()
        movie_data = ia.get_movie(imdb_id)
        plot_summary = movie_data['plot'][0]
        return pd.Series([
            [person['name'] for person in movie_data['cast'][:2]],
            movie_data['director'][0]['name'],
            movie_data['rating'],
            plot_summary,
            model.encode(plot_summary),
        ])
    except Exception:
        # Best-effort lookup, but let KeyboardInterrupt/SystemExit propagate
        # (a bare `except:` would swallow them).
        return pd.Series([None, None, None, None, None])

In [17]:
# Retry the lookup with the regex-based query on the 61 entries that failed the first pass
retried_features = movie_na['title'].progress_apply(parseMovieTitleRegex)
movie_na[['top2_cast','director','rating','plot summary','plot embedding']] = retried_features

100%|██████████| 61/61 [02:33<00:00,  2.52s/it]


In [19]:
# Load the re-processed features back into movie.
# movie_na.index holds row *labels*, so write through the label-based .loc;
# .iloc would treat them as positions, which is only equivalent while movie
# keeps its default 0..n-1 index.
movie.loc[movie_na.index] = movie_na

In [35]:
# 5 movie entries still have no IMDb data after the retry; collect them so they can be discarded
still_missing_mask = movie_na['rating'].isna()
movie_na_still = movie_na[still_missing_mask]

In [37]:
# Remove the unresolved entries from movie; errors='ignore' skips labels that are already gone
movie.drop(index=movie_na_still.index, errors='ignore', inplace=True)

In [34]:
# Read the ratings file (tab-separated, no header row)
ratings = pd.read_csv("u.data", sep='\t', header=None, names=["userId", "movieId", "rating", "timestamp"])
# Remove the ratings that refer to the discarded movies
discarded_rating_rows = ratings.index[ratings['movieId'].isin(movie_na_still['movieId'])]
ratings.drop(index=discarded_rating_rows, inplace=True)

# Read the user file (pipe-separated, no header row)
user = pd.read_csv("u.user", sep='|', header=None, names=['userId', 'age', 'gender', 'occupation', 'zip code'])

In [44]:
# Join ratings with movie features and user attributes, then save everything
# into one pickle file for further preprocessing.
# NOTE: ratings and movie both have a "rating" column, so the merged frame
# carries them as "rating_x" (from ratings) and "rating_y" (from movie).
rating_movie = ratings.merge(movie, on=['movieId'])
rating_movie_user = rating_movie.merge(user, on=['userId'])
save_pickle(rating_movie_user, "rating_movie_user.pickle")