In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [4]:
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

## Mearging both datasets

In [5]:
movies = movies.merge(credits, on="title")

In [6]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Feature engineering

In [7]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [8]:
movies = movies[['id', 'genres', 'keywords', 'title', 'overview', 'cast', 'crew']]

In [9]:
movies.head(1)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


---

## Checking Missing values

In [10]:
movies.isnull().sum()

id          0
genres      0
keywords    0
title       0
overview    3
cast        0
crew        0
dtype: int64

* Droping the null rows
* inplace true is used to save the changes

In [11]:
movies.dropna(inplace=True)

## To check duplicate values

In [12]:
movies.duplicated().sum()

np.int64(0)

---

## Formatting columns according to needs

* <b>genres:</b> [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}] => ['Actions', 'Adventures']
* <b>keywords:</b> [{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}] => ["culture clash", "future", "space war"]
* <b>cast:</b> Extarcting only first 4 character
* <b>crew:</b> Extarcting director name
* All should be in list

In [13]:
movies.iloc[0]

id                                                      19995
genres      [{"id": 28, "name": "Action"}, {"id": 12, "nam...
keywords    [{"id": 1463, "name": "culture clash"}, {"id":...
title                                                  Avatar
overview    In the 22nd century, a paraplegic Marine is di...
cast        [{"cast_id": 242, "character": "Jake Sully", "...
crew        [{"credit_id": "52fe48009251416c750aca23", "de...
Name: 0, dtype: object

## What is .iloc?

* .iloc = integer location
* Used to access rows by index number (position)
* Index starts from 0

In [14]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

### Formating of genres column 

In [15]:
import ast
#ast = Abstract Syntax Tree
#It helps Python safely convert strings into Python objects
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        #üëâ ast.literal_eval() safely converts strings containing Python literals into actual Python data types 
        #like lists, dictionaries, tuples, etc.
        L.append(i["name"])
    return L    

In [16]:
movies['genres'] = movies['genres'].apply(convert)

### What is .apply()?
.apply() runs a function on each row value of a column. <br>
*Means: ‚ÄúTake each value in the genres column and pass it to the convert() function‚Äù*

In [17]:
movies.iloc[0].genres

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

### Formating of keywords column

In [18]:
# Doing the same thing on the keywords column
movies['keywords'] = movies['keywords'].apply(convert)

In [19]:
movies.head(1)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Formating of cast column

In [20]:
def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if (counter > 3): break
        L.append(i["name"])
        counter += 1
    return L  
# This function will extract the first 3 characters from the cast column.    

In [21]:
movies['cast'] = movies['cast'].apply(convert3)

In [22]:
movies.head(1)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Formating of crew column

In [23]:
def extractDirector(obj):
    L = []
    for i in ast.literal_eval(obj):
        if (i['job'] == 'Director'): # extract only those obj which has job : Director
            L.append(i["name"]) # extract the name of the director and append it to the list L.
            break
    return L 

In [24]:
movies['crew'] = movies['crew'].apply(extractDirector)

In [25]:
movies.head(3)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond‚Äôs past sends him o...,"[Daniel Craig, Christoph Waltz, L√©a Seydoux, R...",[Sam Mendes]


---

## Now converting a overview string into List
<i>Strings are converted into a list of words so the computer can compare movies based on their content.<br>
A computer cannot understand meaning, but it can compare words.<br>
<b>With words computer can:</b>
* Count words
* Compare common words
* Measure similarity</i>

In [26]:
movies['overview']

0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bond‚Äôs past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4804    El Mariachi just wants to play his guitar and ...
4805    A newlywed couple's honeymoon is upended by th...
4806    "Signed, Sealed, Delivered" introduces a dedic...
4807    When ambitious New York attorney Sam is sent t...
4808    Ever since the second grade when he first saw ...
Name: overview, Length: 4806, dtype: object

In [27]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())
# lambda function is similar to arrow function in javascript 'short form of writng a function'
# this lamda function takes x as a string from overview and apply a split() fun. on it 
# split() = this fun. split the string into a list of words.

In [28]:
movies['overview'].head(2)

0    [In, the, 22nd, century,, a, paraplegic, Marin...
1    [Captain, Barbossa,, long, believed, to, be, d...
Name: overview, dtype: object

---

### <i>Now removing space from the cast, crew names in order to remove confusion.</i>

In [29]:
movies['cast']

0       [Sam Worthington, Zoe Saldana, Sigourney Weave...
1       [Johnny Depp, Orlando Bloom, Keira Knightley, ...
2       [Daniel Craig, Christoph Waltz, L√©a Seydoux, R...
3       [Christian Bale, Michael Caine, Gary Oldman, A...
4       [Taylor Kitsch, Lynn Collins, Samantha Morton,...
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805    [Edward Burns, Kerry Bish√©, Marsha Dietlein, C...
4806    [Eric Mabius, Kristin Booth, Crystal Lowe, Geo...
4807    [Daniel Henney, Eliza Coupe, Bill Paxton, Alan...
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldm...
Name: cast, Length: 4806, dtype: object

In [30]:
comet = 'Christian Bale'
comet1 = comet.replace(' ', '')

In [31]:
comet1

'ChristianBale'

In [32]:
# Creating a function for removing space

In [33]:
def remSpace(li):
    L = []
    for i in li:
        word = i.replace(' ', '')
        L.append(word)
    return L   

In [34]:
movies['cast'] = movies['cast'].apply(remSpace)

In [36]:
movies['cast'].head(2)

0    [SamWorthington, ZoeSaldana, SigourneyWeaver, ...
1    [JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...
Name: cast, dtype: object

In [37]:
movies.columns


Index(['id', 'genres', 'keywords', 'title', 'overview', 'cast', 'crew'], dtype='object')

In [38]:
movies['keywords'] = movies['keywords'].apply(remSpace)
movies['genres'] = movies['genres'].apply(remSpace)
movies['crew'] = movies['crew'].apply(remSpace)

In [39]:
movies.head(2)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]


---

## Concatinating each column into one column

In [44]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [45]:
movies['tags'].head(3)

0    [In, the, 22nd, century,, a, paraplegic, Marin...
1    [Captain, Barbossa,, long, believed, to, be, d...
2    [A, cryptic, message, from, Bond‚Äôs, past, send...
Name: tags, dtype: object

### Creating new dataframe which will have columns
* id
* title
* tags

In [47]:
new_df = movies[['id', 'title', 'tags']]

In [48]:
new_df.head(5)

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond‚Äôs, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


### Converting tags column into string from list

In [50]:
ex = new_df['tags'].apply(lambda x:" ".join(x))

In [53]:
ex.iloc[0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver StephenLang JamesCameron'