# MOVIES OF 2021

In [1]:
import pandas as pd
import numpy as np

# Read the Data

In [2]:
def get_wikipedia(link, start, stop): # Start and stop are the range of required tables from wiki
    df = pd.DataFrame()
    for i in range(start, stop+1):
        df1 = pd.read_html(link, header=0)[i]
        df = df.append(df1, ignore_index=True)
    return df

link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2021"
df_2021 = get_wikipedia(link, 3, 4) # Get 3rd and 4th tables from the given Wikipedia page
df_2021

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref,Ref.
0,JANUARY,1,Shadow in the Cloud,Vertical Entertainment,Roseanne Liang (director/screenplay); Max Land...,[2],
1,JANUARY,13,The White Tiger,Netflix,Ramin Bahrani (director/screenplay); Adarsh Go...,,
2,JANUARY,14,Locked Down,HBO Max / Warner Bros. Pictures,Doug Liman (director); Steven Knight (screenpl...,[3],
3,JANUARY,15,The Dig,Netflix / Clerkenwell Films,Simon Stone (director); Moira Buffini (screenp...,[4],
4,JANUARY,15,Outside the Wire,Netflix,"Mikael Håfström (director); Rob Yescombe, Rowa...",[5],
...,...,...,...,...,...,...,...
140,JUNE,25,Mary J. Blige's My Life,Amazon Prime Video,Vanessa Roth (director/screenplay); Mary J. Bl...,,
141,JUNE,25,Fathom,Apple TV+,Drew Xanthopoulos (director/screenplay); Miche...,,
142,JUNE,25,Rollers,Level 33 Entertainment,"Isaiah Smallman (director); Johnny Ray Gill, K...",,
143,JUNE,30,America: The Motion Picture,Netflix / Floyd County Productions,Matt Thompson (director); Dave Callaham (scree...,,


In [3]:
df_2021 = df_2021[['Title','Cast and crew']]
df_2021

Unnamed: 0,Title,Cast and crew
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...
3,The Dig,Simon Stone (director); Moira Buffini (screenp...
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa..."
...,...,...
140,Mary J. Blige's My Life,Vanessa Roth (director/screenplay); Mary J. Bl...
141,Fathom,Drew Xanthopoulos (director/screenplay); Miche...
142,Rollers,"Isaiah Smallman (director); Johnny Ray Gill, K..."
143,America: The Motion Picture,Matt Thompson (director); Dave Callaham (scree...


# Feature Engineering

In [4]:
from tmdbv3api import TMDb
from tmdbv3api import Movie
tmdb = TMDb()
tmdb_movie = Movie() 
tmdb.api_key = 'PUT YOUR API KEY'

import json
import requests

def get_genre(x):
    genres = []
    result = tmdb_movie.search(x)
    if not result:
      return np.NaN
    else:
      movie_id = result[0].id
      response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))
      data_json = response.json()
      if data_json['genres']:
          genre_str = " " 
          for i in range(0,len(data_json['genres'])):
              genres.append(data_json['genres'][i]['name'])
          return genre_str.join(genres)
      else:
          return np.NaN

In [5]:
df_2021['genres'] = df_2021['Title'].map(lambda x: get_genre(str(x)))
df_2021

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['genres'] = df_2021['Title'].map(lambda x: get_genre(str(x)))


Unnamed: 0,Title,Cast and crew,genres
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...,Horror Action War
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...,Crime Drama
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...,Comedy Crime Drama
3,The Dig,Simon Stone (director); Moira Buffini (screenp...,Drama History
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa...",Thriller Action Science Fiction
...,...,...,...
140,Mary J. Blige's My Life,Vanessa Roth (director/screenplay); Mary J. Bl...,Documentary Music
141,Fathom,Drew Xanthopoulos (director/screenplay); Miche...,Documentary
142,Rollers,"Isaiah Smallman (director); Johnny Ray Gill, K...",Comedy
143,America: The Motion Picture,Matt Thompson (director); Dave Callaham (scree...,Action Comedy History Animation Fantasy


In [6]:
def get_director(x):
    if " (director)" in x:
        return x.split(" (director)")[0]
    elif " (directors)" in x:
        return x.split(" (directors)")[0]
    else:
        return x.split(" (director/screenplay)")[0]

def get_actor1(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0])

def get_actor2(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[1])
    
def get_actor3(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[2])

In [7]:
df_2021['director_name'] = df_2021['Cast and crew'].map(lambda x: get_director(str(x)))
df_2021['actor_1_name'] = df_2021['Cast and crew'].map(lambda x: get_actor1(str(x)))
df_2021['actor_2_name'] = df_2021['Cast and crew'].map(lambda x: get_actor2(str(x)))
df_2021['actor_3_name'] = df_2021['Cast and crew'].map(lambda x: get_actor3(str(x)))
df_2021

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['director_name'] = df_2021['Cast and crew'].map(lambda x: get_director(str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['actor_1_name'] = df_2021['Cast and crew'].map(lambda x: get_actor1(str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['actor_2_name'] = df_202

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...,Horror Action War,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...,Crime Drama,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...,Comedy Crime Drama,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant
3,The Dig,Simon Stone (director); Moira Buffini (screenp...,Drama History,Simon Stone,Carey Mulligan,Ralph Fiennes,Lily James
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa...",Thriller Action Science Fiction,Mikael Håfström,Anthony Mackie,Damson Idris,Emily Beecham
...,...,...,...,...,...,...,...
140,Mary J. Blige's My Life,Vanessa Roth (director/screenplay); Mary J. Bl...,Documentary Music,Vanessa Roth,Mary J. Blige,Taraji P. Henson,Alicia Keys
141,Fathom,Drew Xanthopoulos (director/screenplay); Miche...,Documentary,Drew Xanthopoulos,Michelle Fournet,Ellen Garland,
142,Rollers,"Isaiah Smallman (director); Johnny Ray Gill, K...",Comedy,Isaiah Smallman,Isaiah Smallman (director); Johnny Ray Gill,Kate Cobb,Vicky Jeudy
143,America: The Motion Picture,Matt Thompson (director); Dave Callaham (scree...,Action Comedy History Animation Fantasy,Matt Thompson,Channing Tatum,Jason Mantzoukas,Olivia Munn


In [8]:
df_2021 = df_2021.rename(columns={'Title':'movie_title'})

In [9]:
new_df21 = df_2021.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]
new_df21

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale,Horror Action War,Shadow in the Cloud
1,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas,Crime Drama,The White Tiger
2,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant,Comedy Crime Drama,Locked Down
3,Simon Stone,Carey Mulligan,Ralph Fiennes,Lily James,Drama History,The Dig
4,Mikael Håfström,Anthony Mackie,Damson Idris,Emily Beecham,Thriller Action Science Fiction,Outside the Wire
...,...,...,...,...,...,...
140,Vanessa Roth,Mary J. Blige,Taraji P. Henson,Alicia Keys,Documentary Music,Mary J. Blige's My Life
141,Drew Xanthopoulos,Michelle Fournet,Ellen Garland,,Documentary,Fathom
142,Isaiah Smallman,Isaiah Smallman (director); Johnny Ray Gill,Kate Cobb,Vicky Jeudy,Comedy,Rollers
143,Matt Thompson,Channing Tatum,Jason Mantzoukas,Olivia Munn,Action Comedy History Animation Fantasy,America: The Motion Picture


# Handling the Null values

In [10]:
new_df21.isna().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     8
genres           1
movie_title      0
dtype: int64

In [11]:
new_df21 = new_df21.dropna(how='any')
new_df21.isna().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
dtype: int64

In [12]:
new_df21['movie_title'] = new_df21['movie_title'].str.lower()
new_df21

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df21['movie_title'] = new_df21['movie_title'].str.lower()


Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale,Horror Action War,shadow in the cloud
1,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas,Crime Drama,the white tiger
2,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant,Comedy Crime Drama,locked down
3,Simon Stone,Carey Mulligan,Ralph Fiennes,Lily James,Drama History,the dig
4,Mikael Håfström,Anthony Mackie,Damson Idris,Emily Beecham,Thriller Action Science Fiction,outside the wire
...,...,...,...,...,...,...
139,Heidi Ewing,Armando Espitia,Christian Vázquez,Michelle Rodríguez,Drama Romance,i carry you with me
140,Vanessa Roth,Mary J. Blige,Taraji P. Henson,Alicia Keys,Documentary Music,mary j. blige's my life
142,Isaiah Smallman,Isaiah Smallman (director); Johnny Ray Gill,Kate Cobb,Vicky Jeudy,Comedy,rollers
143,Matt Thompson,Channing Tatum,Jason Mantzoukas,Olivia Munn,Action Comedy History Animation Fantasy,america: the motion picture


# Save the File

In [13]:
new_df21.to_csv("prep4data.csv", index=False)