In [26]:
import ast
import json

import numpy as np
import pandas as pd

## Reading the data files

In [27]:
# Load the two TMDB 5000 tables: movie metadata and the matching credits.
movies_csv = "./data/tmdb_5000_movies.csv"
credits_csv = "./data/tmdb_5000_credits.csv"

movies, credits = pd.read_csv(movies_csv), pd.read_csv(credits_csv)

In [28]:
# Preview the first row of the movies table to inspect its columns.
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [29]:
# Preview the first row of the credits table to inspect its columns.
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Merge the data frames

In [30]:
# Join each movie to its credits through the TMDB id (`id` in movies,
# `movie_id` in credits). Titles are not guaranteed to be unique, and merging
# on a non-unique key pairs every movie with every same-titled credits row,
# which produces spurious extra rows.
# The credits copy of `title` is dropped first so the result keeps a single,
# unsuffixed `title` column and the same column layout as before.
movies = movies.merge(
    credits.drop(columns="title"),
    left_on="id",
    right_on="movie_id",
)

movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Keep only useful columns

In [31]:
# Keep only the columns this notebook works with:
# movie_id, title, overview, genres, cast, crew, keywords.
# .copy() makes the selection an independent frame, so the later in-place
# clean-up and column assignments do not trigger SettingWithCopyWarning.
useful_columns = ["movie_id", "title", "overview", "genres", "cast", "crew", "keywords"]
movies = movies[useful_columns].copy()

movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."


## Preprocessing

### Remove rows with missing values

In [32]:
# Count the missing (null) values in each column.
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
cast        0
crew        0
keywords    0
dtype: int64

In [33]:
# Drop every row that has a missing value in any column.
# The result is rebound to `movies` instead of using inplace=True: this avoids
# mutating a frame that was derived from a column selection (which can raise
# SettingWithCopyWarning) and keeps the step chainable.
movies = movies.dropna()

In [34]:
# Re-count the missing values per column to confirm that none remain.
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
cast        0
crew        0
keywords    0
dtype: int64

### Remove the duplicate rows

In [35]:
# Count the rows that are exact duplicates of an earlier row.
movies.duplicated().sum()

0

In [36]:
# Keep the first occurrence of each fully identical row and discard the rest
# (all columns are compared; these are the pandas defaults, spelled out).
movies = movies.drop_duplicates(subset=None, keep="first")

In [37]:
# Re-count duplicated rows to confirm that none remain.
movies.duplicated().sum()

0

### Genres
Convert each movie's JSON list of genre objects into a plain list of genre names.

In [38]:
# Each `genres` cell is a JSON array of {"id": ..., "name": ...} objects;
# keep just the genre names. json.loads is the parser for this format:
# ast.literal_eval only accepts Python literals and raises on valid JSON
# tokens such as null, true or false.
movies["genres"] = movies["genres"].apply(
    lambda raw_genres: [genre["name"] for genre in json.loads(raw_genres)]
)


In [39]:
# Preview the first row to check the converted `genres` column.
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."


### Keywords

In [40]:
# Each `keywords` cell is a JSON array of {"id": ..., "name": ...} objects;
# keep just the keyword names. json.loads is the parser for this format:
# ast.literal_eval only accepts Python literals and raises on valid JSON
# tokens such as null, true or false.
movies["keywords"] = movies["keywords"].apply(
    lambda raw_keywords: [keyword["name"] for keyword in json.loads(raw_keywords)]
)


In [41]:
# Preview the first row to check the converted `keywords` column.
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[culture clash, future, space war, space colon..."


### Cast

In [42]:
# Number of cast entries (taken from the start of each list) kept per movie.
TOP_CAST_COUNT = 3

# Each `cast` cell is a JSON array of cast-member objects; keep the names of
# the first TOP_CAST_COUNT entries. The list is sliced before the names are
# extracted so the remaining entries are never touched. json.loads is used
# because the column holds JSON, which ast.literal_eval rejects as soon as it
# contains null, true or false.
movies["cast"] = movies["cast"].apply(
    lambda raw_cast: [member["name"] for member in json.loads(raw_cast)[:TOP_CAST_COUNT]]
)

In [43]:
# Preview the first row to check the shortened `cast` column.
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[culture clash, future, space war, space colon..."


### Crew Member

Make a list of directors only

In [44]:
# Each `crew` cell is a JSON array of crew-member objects; keep only the names
# of the members whose job is exactly "Director". json.loads is used because
# the column holds JSON, which ast.literal_eval rejects as soon as it contains
# null, true or false.
movies["crew"] = movies["crew"].apply(
    lambda raw_crew: [
        member["name"]
        for member in json.loads(raw_crew)
        if member["job"] == "Director"
    ]
)
       

In [45]:
# Preview the first row to check that `crew` now lists only directors.
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[culture clash, future, space war, space colon..."


### Overview

In [46]:
# Show the raw overview text of the first row.
# .iloc selects by position, so this keeps working even if the row labelled 0
# was removed by the earlier row-dropping steps (the index is never reset).
movies["overview"].iloc[0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [47]:
# Break every overview into a list of whitespace-separated words.
movies["overview"] = movies["overview"].map(str.split)

In [48]:
# Preview the first row to check the tokenised `overview` column.
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[culture clash, future, space war, space colon..."
