In [98]:
import json

import pandas as pd

In [99]:
# Load the movie metadata and the credits table from the local datasets/ folder
movies, credits = (
    pd.read_csv('datasets/movies.csv'),
    pd.read_csv('datasets/credits.csv'),
)

In [100]:
# Join every movie with its credits; rows are matched on the shared 'title' column.
# NOTE(review): titles are not guaranteed to be unique, so films sharing a title
# would be cross-matched and yield extra rows — confirm whether the credits table
# exposes a unique movie id that could be used as the join key instead.
movies = pd.merge(movies, credits, on='title')

In [101]:
# Keep only the columns used by the content-based recommender
movies = movies.loc[:, [
    'id',
    'title',
    'overview',
    'genres',
    'keywords',
    'popularity',
    'cast',
    'crew',
    'production_companies',
    'runtime',
    'release_date',
]]

In [102]:
# Remove rows that have a missing (NaN) value in any of the selected columns.
# Rebinding instead of inplace=True keeps the step easy to chain and re-read.
movies = movies.dropna()

# Duplicated rows check: print the count so it is actually visible
# (a bare expression in the middle of a cell is evaluated and then discarded).
print('Duplicated rows:', int(movies.duplicated().sum()))

# Remove duplicated rows (the first occurrence of each row is kept)
movies = movies.drop_duplicates()

In [103]:
import ast

# Extraction function
def extract_names(obj):
    """Return the ``name`` of every record in a serialized list, comma-separated.

    Parameters
    ----------
    obj : str
        Text encoding a list of dicts that each carry a ``'name'`` key,
        e.g. ``'[{"id": 28, "name": "Action"}]'``.

    Returns
    -------
    str
        The names joined with ``', '``; an empty string for an empty list.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is neither valid JSON nor a valid Python literal.
    KeyError
        If a record has no ``'name'`` key.
    """
    try:
        # The payload is expected to be JSON text; json.loads understands
        # null/true/false, which ast.literal_eval rejects.
        records = json.loads(obj)
    except (TypeError, ValueError):
        # Fall back to the previous parser so Python-literal input
        # (e.g. single-quoted strings) behaves exactly as before.
        records = ast.literal_eval(obj)
    return ', '.join(record['name'] for record in records)

In [104]:
# Turn the serialized 'genres' payload into a readable, comma-separated string.
# The column is overwritten, so this cell is meant to run once per kernel session.
movies['genres'] = movies['genres'].map(extract_names)

# Show the first converted value as a quick sanity check
print(movies['genres'].iat[0])

Action, Adventure, Fantasy, Science Fiction


In [105]:
# Turn the serialized 'keywords' payload into a readable, comma-separated string.
# The column is overwritten, so this cell is meant to run once per kernel session.
movies['keywords'] = movies['keywords'].map(extract_names)

# Show the converted column
movies['keywords']

0       culture clash, future, space war, space colony...
1       ocean, drug abuse, exotic island, east india t...
2       spy, based on novel, secret agent, sequel, mi6...
3       dc comics, crime fighter, terrorist, secret id...
4       based on novel, mars, medallion, space travel,...
                              ...                        
4804    united states–mexico barrier, legs, arms, pape...
4805                                                     
4806    date, love at first sight, narration, investig...
4807                                                     
4808              obsession, camcorder, crush, dream girl
Name: keywords, Length: 4805, dtype: object

In [106]:
# Extraction function with a limit of 5 names
def extract_cast_with_characters(obj, limit=5):
    """Format the first ``limit`` cast entries as ``"Name (Character)"``.

    Parameters
    ----------
    obj : str
        Text encoding a list of dicts that each carry ``'name'`` and
        ``'character'`` keys.
    limit : int, optional
        Maximum number of entries taken from the start of the list
        (default 5).

    Returns
    -------
    str
        The formatted entries joined with ``', '``; an empty string for an
        empty list.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is neither valid JSON nor a valid Python literal.
    KeyError
        If an entry lacks ``'name'`` or ``'character'``.
    """
    try:
        # The payload is expected to be JSON text; json.loads understands
        # null/true/false, which ast.literal_eval rejects.
        cast_list = json.loads(obj)
    except (TypeError, ValueError):
        # Fall back to the previous parser so Python-literal input
        # (e.g. single-quoted strings) behaves exactly as before.
        cast_list = ast.literal_eval(obj)
    return ', '.join(
        f"{member['name']} ({member['character']})" for member in cast_list[:limit]
    )

In [107]:
# Replace the serialized 'cast' payload with "Name (Character)" text for the leading
# cast members. The column is overwritten, so run this cell once per kernel session.
movies['cast'] = movies['cast'].map(extract_cast_with_characters)

# Show the first converted value as a quick sanity check
print(movies['cast'].iat[0])

Sam Worthington (Jake Sully), Zoe Saldana (Neytiri), Sigourney Weaver (Dr. Grace Augustine), Stephen Lang (Col. Quaritch), Michelle Rodriguez (Trudy Chacon)


In [108]:
# Extraction function to fetch the movie director
def extract_director(obj):
    """Return the names of all crew members whose ``job`` is ``'Director'``.

    Parameters
    ----------
    obj : str
        Text encoding a list of dicts that each carry ``'name'`` and
        ``'job'`` keys.

    Returns
    -------
    str
        The director names joined with ``', '`` (a film may list several);
        an empty string when no entry has the job ``'Director'``.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is neither valid JSON nor a valid Python literal.
    KeyError
        If an entry lacks ``'job'``, or a director entry lacks ``'name'``.
    """
    try:
        # The payload is expected to be JSON text; json.loads understands
        # null/true/false, which ast.literal_eval rejects.
        crew_list = json.loads(obj)
    except (TypeError, ValueError):
        # Fall back to the previous parser so Python-literal input
        # (e.g. single-quoted strings) behaves exactly as before.
        crew_list = ast.literal_eval(obj)
    return ', '.join(
        member['name'] for member in crew_list if member['job'] == 'Director'
    )

In [109]:
# Reduce the serialized 'crew' payload to the director name(s).
# The column is overwritten, so this cell is meant to run once per kernel session.
movies['crew'] = movies['crew'].map(extract_director)

# Show the converted column
movies['crew']

0                                James Cameron
1                               Gore Verbinski
2                                   Sam Mendes
3                            Christopher Nolan
4                               Andrew Stanton
                         ...                  
4804                          Robert Rodriguez
4805                              Edward Burns
4806                               Scott Smith
4807                               Daniel Hsia
4808    Brian Herzlinger, Jon Gunn, Brett Winn
Name: crew, Length: 4805, dtype: object

In [110]:
# Preview the table now that genres, keywords, cast and crew hold readable text
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,popularity,cast,crew,production_companies,runtime,release_date
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction","culture clash, future, space war, space colony...",150.437577,"Sam Worthington (Jake Sully), Zoe Saldana (Ney...",James Cameron,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",162.0,2009-12-10
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","Adventure, Fantasy, Action","ocean, drug abuse, exotic island, east india t...",139.082615,"Johnny Depp (Captain Jack Sparrow), Orlando Bl...",Gore Verbinski,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",169.0,2007-05-19
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"Action, Adventure, Crime","spy, based on novel, secret agent, sequel, mi6...",107.376788,"Daniel Craig (James Bond), Christoph Waltz (Bl...",Sam Mendes,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",148.0,2015-10-26
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"Action, Crime, Drama, Thriller","dc comics, crime fighter, terrorist, secret id...",112.312950,"Christian Bale (Bruce Wayne / Batman), Michael...",Christopher Nolan,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",165.0,2012-07-16
4,49529,John Carter,"John Carter is a war-weary, former military ca...","Action, Adventure, Science Fiction","based on novel, mars, medallion, space travel,...",43.926995,"Taylor Kitsch (John Carter), Lynn Collins (Dej...",Andrew Stanton,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",132.0,2012-03-07
...,...,...,...,...,...,...,...,...,...,...,...
4804,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,"Action, Crime, Thriller","united states–mexico barrier, legs, arms, pape...",14.269792,"Carlos Gallardo (El Mariachi), Jaime de Hoyos ...",Robert Rodriguez,"[{""name"": ""Columbia Pictures"", ""id"": 5}]",81.0,1992-09-04
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,"Comedy, Romance",,0.642552,"Edward Burns (Buzzy), Kerry Bishé (Linda), Mar...",Edward Burns,[],85.0,2011-12-26
4806,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...","Comedy, Drama, Romance, TV Movie","date, love at first sight, narration, investig...",1.444476,"Eric Mabius (Oliver O’Toole), Kristin Booth (S...",Scott Smith,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...",120.0,2013-10-13
4807,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,,,0.857008,"Daniel Henney (Sam), Eliza Coupe (Amanda), Bil...",Daniel Hsia,[],98.0,2012-05-03


In [111]:
# Swap 'release_date' for a 'release_year' column: pop() removes the date column
# from the frame and hands it over for parsing, and the extracted year is then
# appended as the last column.
movies['release_year'] = pd.to_datetime(movies.pop('release_date')).dt.year

In [112]:
# Inspect one raw 'production_companies' value to see the format that has to be parsed
movies['production_companies'].iloc[0]

'[{"name": "Ingenious Film Partners", "id": 289}, {"name": "Twentieth Century Fox Film Corporation", "id": 306}, {"name": "Dune Entertainment", "id": 444}, {"name": "Lightstorm Entertainment", "id": 574}]'

In [113]:
# Extraction function to get the first three company names
def extract_production_companies(obj, limit=3):
    """Return the names of the first ``limit`` production companies.

    Parameters
    ----------
    obj : str
        Text encoding a list of dicts that each carry a ``'name'`` key,
        e.g. ``'[{"name": "Walt Disney Pictures", "id": 2}]'``.
    limit : int, optional
        Maximum number of companies taken from the start of the list
        (default 3).

    Returns
    -------
    str
        The company names joined with ``', '``; an empty string for an
        empty list.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is neither valid JSON nor a valid Python literal.
    KeyError
        If a selected entry has no ``'name'`` key.
    """
    try:
        # The payload is expected to be JSON text; json.loads understands
        # null/true/false, which ast.literal_eval rejects.
        companies_list = json.loads(obj)
    except (TypeError, ValueError):
        # Fall back to the previous parser so Python-literal input
        # (e.g. single-quoted strings) behaves exactly as before.
        companies_list = ast.literal_eval(obj)
    return ', '.join(company['name'] for company in companies_list[:limit])

In [114]:
# Replace the serialized 'production_companies' payload with the leading company names.
# The column is overwritten, so this cell is meant to run once per kernel session.
movies['production_companies'] = movies['production_companies'].map(extract_production_companies)

# Show the first converted value as a quick sanity check
print(movies['production_companies'].iat[0])

Ingenious Film Partners, Twentieth Century Fox Film Corporation, Dune Entertainment


In [115]:
# Preview the final table (release_year added, production_companies converted)
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,popularity,cast,crew,production_companies,runtime,release_year
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction","culture clash, future, space war, space colony...",150.437577,"Sam Worthington (Jake Sully), Zoe Saldana (Ney...",James Cameron,"Ingenious Film Partners, Twentieth Century Fox...",162.0,2009
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","Adventure, Fantasy, Action","ocean, drug abuse, exotic island, east india t...",139.082615,"Johnny Depp (Captain Jack Sparrow), Orlando Bl...",Gore Verbinski,"Walt Disney Pictures, Jerry Bruckheimer Films,...",169.0,2007
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"Action, Adventure, Crime","spy, based on novel, secret agent, sequel, mi6...",107.376788,"Daniel Craig (James Bond), Christoph Waltz (Bl...",Sam Mendes,"Columbia Pictures, Danjaq, B24",148.0,2015
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"Action, Crime, Drama, Thriller","dc comics, crime fighter, terrorist, secret id...",112.31295,"Christian Bale (Bruce Wayne / Batman), Michael...",Christopher Nolan,"Legendary Pictures, Warner Bros., DC Entertain...",165.0,2012
4,49529,John Carter,"John Carter is a war-weary, former military ca...","Action, Adventure, Science Fiction","based on novel, mars, medallion, space travel,...",43.926995,"Taylor Kitsch (John Carter), Lynn Collins (Dej...",Andrew Stanton,Walt Disney Pictures,132.0,2012


In [116]:
import pickle

# Persist the processed frame for later reuse. The context manager guarantees the
# file handle is flushed and closed, even if serialization fails part-way; a bare
# open() passed straight to pickle.dump is never explicitly closed.
with open('movies_dataset.pkl', 'wb') as pkl_file:
    pickle.dump(movies, pkl_file)

In [117]:
# Save as CSV (index=False leaves the row index out of the file)
# NOTE(review): text columns such as 'keywords' can hold empty strings; pandas.read_csv
# parses empty fields as NaN by default, so readers of this file may need
# keep_default_na=False or a fillna('') — confirm against the consumer.
movies.to_csv('movies_dataset.csv', index=False)