In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the two TMDB 5000 CSV files; the relative paths are resolved against
# the notebook's current working directory.
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
# List the columns available in the movies table.
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [5]:
# List the columns available in the credits table.
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

## Merging both datasets

In [6]:
# Join on the TMDB id rather than on the title: titles are not guaranteed to
# be unique, so a title join pairs every same-titled movie with every
# same-titled credits row and produces spurious rows with the wrong cast/crew.
# The credits copy of `title` is dropped first so the merged frame keeps a
# single, unsuffixed `title` column (same column set as a join on title).
movies = movies.merge(
    credits.drop(columns="title"),
    left_on="id",
    right_on="movie_id",
)

In [7]:
# Peek at the first row of the merged frame.
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Feature engineering

In [8]:
# Review the merged frame's columns before choosing which ones to keep.
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [9]:
# Keep only the identifier plus the descriptive columns used from here on.
feature_columns = [
    'id',
    'genres',
    'keywords',
    'title',
    'overview',
    'cast',
    'crew',
]
movies = movies[feature_columns]

In [10]:
# Peek at the first row after the column selection.
movies.head(1)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


---

## Checking Missing values

In [11]:
# Count the missing values in each column.
movies.isnull().sum()

id          0
genres      0
keywords    0
title       0
overview    3
cast        0
crew        0
dtype: int64

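* Dropping the rows that contain null values
* The result has to be saved back to `movies` (by reassignment or with `inplace=True`); otherwise the rows are not actually removed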

In [13]:
# Reassigning the result saves the change just like inplace=True does, but
# without mutating a frame that was derived by selecting columns from another
# one (pandas can raise SettingWithCopyWarning for that in-place mutation).
movies = movies.dropna()

## To check duplicate values

In [14]:
# Count the rows that are exact duplicates of an earlier row.
movies.duplicated().sum()

np.int64(0)

---

## Formatting columns according to needs

* <b>genres:</b> [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}] => ['Action', 'Adventure']
* <b>keywords:</b> [{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}] => ["culture clash", "future", "space war"]
* <b>cast:</b> Extracting only the names of the first 4 cast members
* <b>crew:</b> Extracting the director's name
* All of these columns should hold lists

In [19]:
# Inspect the first row to see the raw format of every column.
movies.iloc[0]

id                                                      19995
genres      [{"id": 28, "name": "Action"}, {"id": 12, "nam...
keywords    [{"id": 1463, "name": "culture clash"}, {"id":...
title                                                  Avatar
overview    In the 22nd century, a paraplegic Marine is di...
cast        [{"cast_id": 242, "character": "Jake Sully", "...
crew        [{"credit_id": "52fe48009251416c750aca23", "de...
Name: 0, dtype: object

## What is .iloc?

* .iloc = integer location
* Used to access rows by index number (position)
* Index starts from 0

In [20]:
# Look at the raw genres value of the first row: it is a string, not a list.
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

### Formatting of genres column

In [52]:
import ast
# ast = Abstract Syntax Trees (a standard-library module)
# Its literal_eval() helper safely converts strings into Python objects
def convert(obj):
    """Parse a stringified list of dicts and return each dict's "name" value.

    `obj` is a string holding a Python-literal list such as
    '[{"id": 28, "name": "Action"}]'; for that input the result is ['Action'].
    """
    # literal_eval only evaluates literals (lists, dicts, strings, numbers, ...),
    # so it is a safe way to turn the text into real Python objects.
    parsed_items = ast.literal_eval(obj)
    return [item["name"] for item in parsed_items]

In [24]:
# Replace each stringified genre list with a plain list of genre names.
movies['genres'] = movies['genres'].apply(convert)

### What is .apply()?
.apply() runs a function on each row value of a column. <br>
*Meaning: "Take each value in the genres column and pass it to the convert() function."*

In [26]:
# Check the first row again: genres is now a list of names.
movies.iloc[0].genres

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

### Formatting of keywords column

In [None]:
# Same conversion as for genres: replace each stringified keyword list with a
# plain list of keyword names.
movies['keywords'] = movies['keywords'].apply(convert)

In [28]:
# Verify the keywords conversion on the first row.
movies.head(1)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Formatting of cast column

In [54]:
def convert3(obj, limit=4):
    """Return the names of the first `limit` entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        A Python-literal list of dicts, each having a "name" key, e.g.
        '[{"character": "Jake Sully", "name": "Sam Worthington"}]'.
    limit : int, optional
        Maximum number of names to keep. Defaults to 4, the number the
        original hard-coded counter kept.

    Returns
    -------
    list
        The "name" values (cast member names, not character names) of the
        leading entries, in their original order. Shorter inputs simply
        yield fewer names.
    """
    # list() keeps the slice valid for any iterable literal_eval may return.
    entries = list(ast.literal_eval(obj))
    return [entry["name"] for entry in entries[:limit]]

In [None]:
# Replace each stringified cast list with the leading cast names that
# convert3 returns.
movies['cast'] = movies['cast'].apply(convert3)

In [39]:
# Verify the cast conversion on the first row.
movies.head(1)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Formatting of crew column

In [55]:
def extractDirector(obj):
    """Return a one-element list with the first director's name, or [].

    `obj` is a string holding a Python-literal list of crew dicts, each with
    "job" and "name" keys. Only the first entry whose job is exactly
    'Director' is used; later directors are ignored.
    """
    for crew_member in ast.literal_eval(obj):
        if crew_member['job'] != 'Director':
            continue
        # First director found: stop scanning and hand back just this name.
        return [crew_member["name"]]
    # No crew entry carried the Director job.
    return []

In [49]:
# Replace each stringified crew list with the list extractDirector returns
# (the first director's name, or an empty list when there is none).
movies['crew'] = movies['crew'].apply(extractDirector)

In [51]:
# Show the first three rows after the crew conversion.
movies.head(3)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond‚Äôs past sends him o...,"[Daniel Craig, Christoph Waltz, L√©a Seydoux, R...",[Sam Mendes]
