In [1]:
import numpy as np
import pandas as pd
import ast #To convert string to list
import pickle
import requests
from iso639 import languages
from datetime import datetime
from datetime import date

In [2]:
movies = pd.read_csv('../Datasets/tmdb_5000_movies.csv')
credits = pd.read_csv("../Datasets/tmdb_5000_credits.csv")
credits.rename(columns = {'movie_id':'id'}, inplace = True)
movies = movies.merge(credits, on='id')
movies.rename(columns = {'title_x':'title'}, inplace = True)
movies = movies[['id', 'title', 'popularity', 'overview', 'genres', 'release_date','vote_average', 'original_language','cast', 'crew']]

In [3]:
print(movies.isnull().sum())

id                   0
title                0
popularity           0
overview             3
genres               0
release_date         1
vote_average         0
original_language    0
cast                 0
crew                 0
dtype: int64


In [4]:
def find_null(attribute):
    null_attributes = movies.index[movies[attribute].isnull()]
    for i in null_attributes:
        print(f"Movie Id: {movies['id'][i]} Movie Title: {movies['title'][i]}")
print("Null overviews: ")
find_null('overview')
print("Null release dates: ")
find_null('release_date')

Null overviews: 
Movie Id: 292539 Movie Title: Food Chains
Movie Id: 370980 Movie Title: Chiamatemi Francesco - Il Papa della gente
Movie Id: 459488 Movie Title: To Be Frank, Sinatra at 100
Null release dates: 
Movie Id: 380097 Movie Title: America Is Still the Place


In [5]:
#Data collected from IMDb
movies['overview'][movies.index[movies['id'] == 370980]] = 'Following the rise of father Jorge Mario Bergoglio from his early life as a teacher in a Jesuit High School in Argentina, to archbishop and cardinal of Buenos Aires, until he was elected Pope of the Roman Catholic Church.'
movies['overview'][movies.index[movies['id'] == 459488]] = 'An exploration of how singer and actor Frank Sinatra became one of the biggest stars of the 20th century while remaining, in his heart, a normal person.'
movies['overview'][movies.index[movies['id'] == 292539]] = 'To protest their working conditions and poor wages, farmworkers in Immokalee, Florida, start a hunger strike outside the headquarters of Publix supermarkets.'
movies['title'].loc[movies['id'] == 380097] = "I’m Charlie Walker"
movies['release_date'].loc[movies['id'] == 380097] = '10-06-2022'
movies.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['overview'][movies.index[movies['id'] == 459488]] = 'An exploration of how singer and actor Frank Sinatra became one of the biggest stars of the 20th century while remaining, in his heart, a normal person.'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the c

In [6]:
print(movies.dropna(inplace = True))
print(movies.duplicated().sum())
print(movies.isnull().sum())

None
0
id                   0
title                0
popularity           0
overview             0
genres               0
release_date         0
vote_average         0
original_language    0
cast                 0
crew                 0
dtype: int64


In [7]:
for i in range(0, len(movies['release_date'])):
    string = movies['release_date'].iloc[i]
    movies['release_date'].iloc[i] = datetime.strptime(string, '%d-%m-%Y').year
    
movies.rename(columns = {'release_date':'year'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [8]:
movies['original_language'] = movies['original_language'].apply(lambda x: languages.get(alpha2=x).name.lower())

In [9]:
#To convert strings with ids and genre names/keywords to lists of genres/keywords
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

#To fetch the names of top 10 cast of a movie
def convert_trim(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 5:
            L.append(i['name'].lower())
            counter += 1
        else:
            break
    return L

#To fetch the name of the director of a movie
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'].lower())
            break
    return ''.join(L)

In [10]:
movies['genres'] = movies['genres'].apply(convert)
movies['cast'] = movies['cast'].apply(convert_trim)
movies['crew'] = movies['crew'].apply(fetch_director)

In [11]:
print(f"Memory usage before optimization: {movies.memory_usage().sum()}")
movies['id'] = movies['id'].astype('uint32')
movies['year'] = movies['year'].astype('uint16')
movies['vote_average'] = movies['vote_average'].astype('float32')
movies['popularity'] = movies['popularity'].astype('float32')
print(f"Memory after after optimization: {movies.memory_usage().sum()}")

Memory usage before optimization: 419408
Memory after after optimization: 333620


In [12]:
pickle.dump(movies.to_dict(), open('movie_dict.pkl', 'wb'))