## Steps

- Import libraries
- Import the dataset
- Data Analysis - DE, DM, DC, DV, EDA [OPTIONAL], Hyper Parameter Tuning
- Feature Engineering - Encoders, Feature Scaling
- Split the data into two sets using the CV
- Model Selection - KNN
- Training the model
- Test the model
- Performance - Confusion Metrix

## Import libraries

In [None]:
import numpy as np
import pandas as pd
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Importing the Dataset

In [None]:
credit = 'https://drive.google.com/file/d/1nWdt8_Hh-M35N4GwRixPmb3AsHNQVZKQ/view'
movie = 'https://drive.google.com/file/d/1HxZXAkPajH40pChWvpAseOi71VW0k3wS/view'

In [4]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [5]:
# Data Analysis
movies.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [6]:
credits.head(3)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [7]:
# Both the DataFrames are not structured data
# Those columns are lists of dictionaries

In [8]:
movies.describe()

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,29045040.0,57165.484281,21.492301,82260640.0,106.875859,6.092172,690.217989
std,40722390.0,88694.614033,31.81665,162857100.0,22.611935,1.194612,1234.585891
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,790000.0,9014.5,4.66807,0.0,94.0,5.6,54.0
50%,15000000.0,14629.0,12.921594,19170000.0,103.0,6.2,235.0
75%,40000000.0,58610.5,28.313505,92917190.0,118.0,6.8,737.0
max,380000000.0,459488.0,875.581305,2787965000.0,338.0,10.0,13752.0


In [9]:
credits.describe()

Unnamed: 0,movie_id
count,4803.0
mean,57165.484281
std,88694.614033
min,5.0
25%,9014.5
50%,14629.0
75%,58610.5
max,459488.0


In [10]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [11]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [12]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [13]:
movies.genres[0]
# Name represents the genre and this id is related to movie name

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [14]:
# Actual Output
# [{"id": 28, "name": "Action"},
#  {"id": 12, "name": "Adventure"},
#  {"id": 14, "name": "Fantasy"}, 
#  {"id": 878, "name": "Science Fiction"}]

# Expected output for movies.genres[0] is
# ['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [15]:
import json

x = movies.genres[0]
y = []
genres_list = json.loads(x)  #Converting into dictionary --> unpacking from json
for genre in genres_list:
    y.append(genre['name'])
y

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [16]:
a = eval(movies.genres[0])
for i in range(len(a)):
    print(a[i]['name'])

Action
Adventure
Fantasy
Science Fiction


In [17]:
movie = movies.copy()

In [18]:
# for i, k in zip(movies.genres, range(len(movies.genres))):
#     movies.genres[k] = [eval(i)[j]['name'] for j in range(len(eval(i)))]

# for index in range(len(movies)):
#     movies.loc[index, 'genres'] = [genre['name'] for genre in eval(movies.loc[index, 'genres'])]

import ast

# Convert each genre string to a list of names directly
movies['genres'] = [", ".join([d['name'] for d in ast.literal_eval(i)]) for i in movies['genres']]

In [19]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"Action, Adventure, Fantasy, Science Fiction",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"Adventure, Fantasy, Action",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"Action, Adventure, Crime",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"Action, Crime, Drama, Thriller",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"Action, Adventure, Science Fiction",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [20]:
# Keywords
# production_companies
# production_countires
# spoken_languages

In [21]:
movies.keywords[0]
# Extract name

movies.production_companies[0]
# Extract name

movies.production_countries[0]
# Extract name


'[{"iso_3166_1": "US", "name": "United States of America"}, {"iso_3166_1": "GB", "name": "United Kingdom"}]'

In [22]:
# for i, k in zip(movies.keywords, range(len(movies.keywords))):
#     movies.keywords[k] = [eval(i)[j]['name'] for j in range(len(eval(i)))]

# for i, k in zip(movies.production_companies, range(len(movies.production_companies))):
#     movies.production_companies[k] = [eval(i)[j]['name'] for j in range(len(eval(i)))]

# for i, k in zip(movies.production_countires, range(len(movies.production_countires))):
#     movies.production_countires[k] = [eval(i)[j]['name'] for j in range(len(eval(i)))]

# for i, k in zip(movies.spoken_languages, range(len(movies.spoken_languages))):
#     movies.spoken_languages[k] = [eval(i)[j]['name'] for j in range(len(eval(i)))]


In [23]:
movies['keywords'] = [", ".join([d['name'] for d in ast.literal_eval(i)]) for i in movies['keywords']]
movies['production_companies'] = [", ".join([d['name'] for d in ast.literal_eval(i)]) for i in movies['production_companies']]
movies['production_countries'] = [", ".join([d['name'] for d in ast.literal_eval(i)]) for i in movies['production_countries']]
movies['spoken_languages'] = [", ".join([d['name'] for d in ast.literal_eval(i)]) for i in movies['spoken_languages']]

In [24]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"Action, Adventure, Fantasy, Science Fiction",http://www.avatarmovie.com/,19995,"culture clash, future, space war, space colony...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"Ingenious Film Partners, Twentieth Century Fox...","United States of America, United Kingdom",2009-12-10,2787965087,162.0,"English, Español",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"Adventure, Fantasy, Action",http://disney.go.com/disneypictures/pirates/,285,"ocean, drug abuse, exotic island, east india t...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,2007-05-19,961000000,169.0,English,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"Action, Adventure, Crime",http://www.sonypictures.com/movies/spectre/,206647,"spy, based on novel, secret agent, sequel, mi6...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"Columbia Pictures, Danjaq, B24","United Kingdom, United States of America",2015-10-26,880674609,148.0,"Français, English, Español, Italiano, Deutsch",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"Action, Crime, Drama, Thriller",http://www.thedarkknightrises.com/,49026,"dc comics, crime fighter, terrorist, secret id...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"Legendary Pictures, Warner Bros., DC Entertain...",United States of America,2012-07-16,1084939099,165.0,English,Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"Action, Adventure, Science Fiction",http://movies.disney.com/john-carter,49529,"based on novel, mars, medallion, space travel,...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,Walt Disney Pictures,United States of America,2012-03-07,284139100,132.0,English,Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [25]:
movies.iloc[140]

budget                                                          150000000
genres                                            Action, Drama, Thriller
homepage                                                              NaN
id                                                                 117251
keywords                usa president, conspiracy, secret service, the...
original_language                                                      en
original_title                                           White House Down
overview                Capitol Policeman John Cale has just been deni...
popularity                                                      39.004588
production_companies    Columbia Pictures, Centropolis Entertainment, ...
production_countries                             United States of America
release_date                                                   2013-06-27
revenue                                                         205366737
runtime                               

In [26]:
movies.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [27]:
# The home page column is null for more than 60% rows..... So we can drop it
movies.drop(['homepage'], axis = 1, inplace = True)
movies.head(2)

Unnamed: 0,budget,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"Action, Adventure, Fantasy, Science Fiction",19995,"culture clash, future, space war, space colony...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"Ingenious Film Partners, Twentieth Century Fox...","United States of America, United Kingdom",2009-12-10,2787965087,162.0,"English, Español",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"Adventure, Fantasy, Action",285,"ocean, drug abuse, exotic island, east india t...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,2007-05-19,961000000,169.0,English,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [28]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   id                    4803 non-null   int64  
 3   keywords              4803 non-null   object 
 4   original_language     4803 non-null   object 
 5   original_title        4803 non-null   object 
 6   overview              4800 non-null   object 
 7   popularity            4803 non-null   float64
 8   production_companies  4803 non-null   object 
 9   production_countries  4803 non-null   object 
 10  release_date          4802 non-null   object 
 11  revenue               4803 non-null   int64  
 12  runtime               4801 non-null   float64
 13  spoken_languages      4803 non-null   object 
 14  status                4803 non-null   object 
 15  tagline              

In [29]:
movies.shape

(4803, 19)

In [30]:
cop = movies.copy()

In [31]:
# Tagline doesn't help us to predict the ratings
movies.drop('tagline', axis = 1, inplace = True)

In [32]:
movies.isnull().sum()

budget                  0
genres                  0
id                      0
keywords                0
original_language       0
original_title          0
overview                3
popularity              0
production_companies    0
production_countries    0
release_date            1
revenue                 0
runtime                 2
spoken_languages        0
status                  0
title                   0
vote_average            0
vote_count              0
dtype: int64

In [33]:
cop[cop.runtime.isnull()]

Unnamed: 0,budget,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
2656,15000000,Drama,370980,"pope, biography",it,Chiamatemi Francesco - Il Papa della gente,,0.738646,Taodue Film,Italy,2015-12-03,0,,Español,Released,,Chiamatemi Francesco - Il Papa della gente,7.3,12
4140,2,Documentary,459488,"music, actors, legendary perfomer, classic hol...",en,"To Be Frank, Sinatra at 100",,0.050625,Eyeline Entertainment,United Kingdom,2015-12-12,0,,,Released,,"To Be Frank, Sinatra at 100",0.0,0


In [34]:
# Dropping all the null values
movies.dropna(inplace = True)

In [35]:
# Instead of release date ->  3 columns at the exact position
# release_day, release_month, release_year

In [36]:
movies['release_date'] = pd.to_datetime(movies['release_date'])
movies['release_day'] = movies['release_date'].dt.day
movies['release_month'] = movies['release_date'].dt.month
movies['release_year'] = movies['release_date'].dt.year

cols = movies.columns.tolist()  # Get current column order
release_date_index = cols.index('release_date')  # Find the index of 'release_date'

# Reorder columns: keep columns before 'release_date', then add new columns, then columns after 'release_date'
new_order = (
    cols[:release_date_index + 1] +  # Columns before and including 'release_date'
    ['release_day', 'release_month', 'release_year'] +  # New columns
    cols[release_date_index + 1:]  # Columns after 'release_date'
)

In [37]:
movies.head(2)

Unnamed: 0,budget,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,...,revenue,runtime,spoken_languages,status,title,vote_average,vote_count,release_day,release_month,release_year
0,237000000,"Action, Adventure, Fantasy, Science Fiction",19995,"culture clash, future, space war, space colony...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"Ingenious Film Partners, Twentieth Century Fox...","United States of America, United Kingdom",...,2787965087,162.0,"English, Español",Released,Avatar,7.2,11800,10,12,2009
1,300000000,"Adventure, Fantasy, Action",285,"ocean, drug abuse, exotic island, east india t...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,...,961000000,169.0,English,Released,Pirates of the Caribbean: At World's End,6.9,4500,19,5,2007


In [38]:
movies.isnull().sum()

budget                  0
genres                  0
id                      0
keywords                0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
title                   0
vote_average            0
vote_count              0
release_day             0
release_month           0
release_year            0
dtype: int64

In [39]:
movies.columns

Index(['budget', 'genres', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'title', 'vote_average', 'vote_count',
       'release_day', 'release_month', 'release_year'],
      dtype='object')

In [40]:
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [41]:
# Joining these 2 datasets with the movie_id and id
merged_df = pd.merge(movies, credits, left_on='id', right_on='movie_id')

In [42]:
merged_df.head(2)

Unnamed: 0,budget,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,...,title_x,vote_average,vote_count,release_day,release_month,release_year,movie_id,title_y,cast,crew
0,237000000,"Action, Adventure, Fantasy, Science Fiction",19995,"culture clash, future, space war, space colony...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"Ingenious Film Partners, Twentieth Century Fox...","United States of America, United Kingdom",...,Avatar,7.2,11800,10,12,2009,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"Adventure, Fantasy, Action",285,"ocean, drug abuse, exotic island, east india t...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,...,Pirates of the Caribbean: At World's End,6.9,4500,19,5,2007,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [43]:
merged_df.cast[0] 
# Character -> name

# Get all the cast names

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [44]:
# movies.crew[0] -> name
# departments -> crew departments
# director -> crew director(where the job is director)  (Only director.... not animation director and all i.e., no thokas necessary)

In [45]:
crew_names = [", ".join([d['name'] for d in ast.literal_eval(i)]) for i in credits['cast']]

In [46]:
crew_names

['Sam Worthington, Zoe Saldana, Sigourney Weaver, Stephen Lang, Michelle Rodriguez, Giovanni Ribisi, Joel David Moore, CCH Pounder, Wes Studi, Laz Alonso, Dileep Rao, Matt Gerald, Sean Anthony Moran, Jason Whyte, Scott Lawrence, Kelly Kilgour, James Patrick Pitt, Sean Patrick Murphy, Peter Dillon, Kevin Dorman, Kelson Henderson, David Van Horn, Jacob Tomuri, Michael Blain-Rozgay, Jon Curry, Luke Hawker, Woody Schultz, Peter Mensah, Sonia Yee, Jahnel Curfman, Ilram Choi, Kyla Warren, Lisa Roumain, Debra Wilson, Chris Mala, Taylor Kibby, Jodie Landau, Julie Lamm, Cullen B. Madden, Joseph Brady Madden, Frankie Torres, Austin Wilson, Sara Wilson, Tamica Washington-Miller, Lucy Briant, Nathan Meister, Gerry Blair, Matthew Chamberlain, Paul Yates, Wray Wilson, James Gaylyn, Melvin Leno Clark III, Carvon Futrell, Brandon Jelkes, Micah Moch, Hanniyah Muhammad, Christopher Nolen, Christa Oliver, April Marie Thomas, Bravita A. Threatt, Colin Bleasdale, Mike Bodnar, Matt Clayton, Nicole Dionne, J

In [47]:
#  COmpleted till 2 hr 30 mins

In [50]:
merged_df.crew[0]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [52]:
crew_names[1]

'Johnny Depp, Orlando Bloom, Keira Knightley, Stellan Skarsgård, Chow Yun-fat, Bill Nighy, Geoffrey Rush, Jack Davenport, Kevin McNally, Tom Hollander, Naomie Harris, Jonathan Pryce, Keith Richards, Lee Arenberg, Mackenzie Crook, Greg Ellis, David Bailie, Martin Klebba, David Schofield, Lauren Maher, Vanessa Branch, Angus Barnett, Giles New, Reggie Lee, Dominic Scott Kay, Takayo Fischer, David Meunier, Ho-Kwan Tse, Andy Beckwith, Peter Donald Badalamenti II, Christopher S. Capp, Keith Richards, Hakeem Kae-Kazim, Ghassan Massoud'

In [54]:
movies = merged_df.copy()

In [56]:
merged_df.crew = crew_names

ValueError: Length of values (4803) does not match length of index (4799)

In [57]:
len(merged_df.crew), len(crew_names)

(4799, 4803)

In [60]:
# for i, l in zip(merged_df, tange(len(merged_df.crew))):
#     merged_df.crew[l] = [eval(i)[j]]['department'] for j in range(len(eval(i)))

In [59]:
for i, l in zip(merged_df['crew'], range(len(merged_df['crew']))):
    # Evaluate 'i' to get a list-like structure
    evaluated_list = eval(i)
    # Extract the 'department' field from each element in the evaluated list
    departments = [item['department'] for item in evaluated_list]
    # Assign the result to the corresponding row in 'merged_df'
    merged_df.at[l, 'crew'] = departments