In [1]:
import pandas as pd
import numpy as np
import requests
import json
import bs4 as bs
import urllib.request
from tmdbv3api import TMDb, Movie
from utils import *

## Note: In this notebook we will build a whole pipeline based on the previous notebook - prepro_3

In [2]:
# Wikipedia "List of American films" pages, one URL per release year.
link_2019, link_2020, link_2021 = (
    "https://en.wikipedia.org/wiki/List_of_American_films_of_2019",
    "https://en.wikipedia.org/wiki/List_of_American_films_of_2020",
    "https://en.wikipedia.org/wiki/List_of_American_films_of_2021",
)

### Scrape the data from Wikipedia

In [3]:
def scrape_data_to_df(link):
    """Download a Wikipedia film-list page and return its sortable wikitables.

    Despite the name, this function does not build a DataFrame: it returns the
    raw ``<table>`` tags, which ``tables_to_df`` turns into a DataFrame.

    Parameters
    ----------
    link : str
        URL of the page to download.

    Returns
    -------
    bs4.element.ResultSet
        Every ``<table>`` whose class attribute is ``"wikitable sortable"``.

    Notes
    -----
    Prints the quarter headings found on the page and the number of tables.
    A quarter heading that is missing from the page is reported in a warning
    line instead of raising ``AttributeError``.
    """
    # The context manager closes the HTTP response once the body has been read.
    with urllib.request.urlopen(link) as response:
        source = response.read()

    soup = bs.BeautifulSoup(source, "lxml")
    tables = soup.find_all("table", class_="wikitable sortable")

    # Element ids of the four quarterly section headings.
    quarter_ids = ("January–March", "April–June", "July–September", "October–December")
    found_titles = []
    missing_ids = []
    for quarter_id in quarter_ids:
        heading = soup.find(id=quarter_id)
        if heading is None:
            # soup.find returns None when no element carries this id.
            missing_ids.append(quarter_id)
        else:
            found_titles.append(heading.get_text())

    print(f"Scraped movies from the following months: {', '.join(found_titles)}.")
    if missing_ids:
        print(f"Warning: no section heading found for: {', '.join(missing_ids)}.")
    print(f"{len(tables)} tables in total.")

    return tables

In [4]:
# Scrape the 2019 page; the result is the list of tables returned by the scraper.
scraped_data = scrape_data_to_df(link=link_2019)

Scraped movies from the following months: January–March, April–June, July–September, October–December.
4 tables in total.


### Create a dataframe from the scraped tables

In [5]:
def tables_to_df(tables):
    """Parse scraped HTML tables and stack them into a single DataFrame.

    Parameters
    ----------
    tables : sequence
        HTML ``<table>`` elements (or their markup as strings). Each item is
        converted with ``str()`` before parsing. Any number of tables is
        accepted, not only four.

    Returns
    -------
    pandas.DataFrame
        The rows of every table, in the order given, re-indexed from 0.

    Raises
    ------
    ValueError
        If ``tables`` is empty.
    """
    # Function-scope import so this cell does not rely on an extra top-level import.
    from io import StringIO

    if len(tables) == 0:
        raise ValueError("tables_to_df() needs at least one table to parse.")

    # read_html returns one frame per <table> in the markup; each item holds a
    # single table, so take element [0]. Wrapping the markup in StringIO avoids
    # the deprecated "literal HTML string" input form.
    frames = [pd.read_html(StringIO(str(table)))[0] for table in tables]

    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    return pd.concat(frames, ignore_index=True)

In [6]:
# Stack the scraped tables into one frame.
df = tables_to_df(tables=scraped_data)

In [7]:
# Display the combined frame to check the scraped columns and the row count.
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,JANUARY,4,Escape Room,Columbia Pictures,"Adam Robitel (director); Bragi F. Schut, Maria...",[2]
1,JANUARY,4,Rust Creek,IFC Films,Jen McGowan (director); Julie Lipson (screenpl...,[3]
2,JANUARY,4,American Hangman,Hangman Justice Productions,Wilson Coneybeare (director/screenplay); Donal...,[4]
3,JANUARY,11,A Dog's Way Home,Columbia Pictures,Charles Martin Smith (director); W. Bruce Came...,[5]
4,JANUARY,11,The Upside,STX Entertainment,Neil Burger (director); Jon Hartmere (screenpl...,[6]
...,...,...,...,...,...,...
237,DECEMBER,25,Spies in Disguise,20th Century Fox / Blue Sky Studios / Chernin ...,"Nick Bruno, Troy Quane (directors); Brad Copel...",[132]
238,DECEMBER,25,Little Women,Columbia Pictures / Regency Enterprises,Greta Gerwig (director/screenplay); Saoirse Ro...,[221]
239,DECEMBER,25,1917,Universal Pictures / DreamWorks Pictures / Ent...,Sam Mendes (director/screenplay); Krysty Wilso...,[222]
240,DECEMBER,25,Just Mercy,Warner Bros. Pictures / Participant Media,"Destin Daniel Cretton (director/screenplay), A...",[223]


### Getting the genre from TMDb API

In [8]:
# Stringify each title first, then look up its genres with get_genre
# (presumably provided by the star import from utils - confirm).
df["genres"] = df["Title"].map(str).map(get_genre)

In [9]:
# Display the frame again to check the newly added genres column.
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,JANUARY,4,Escape Room,Columbia Pictures,"Adam Robitel (director); Bragi F. Schut, Maria...",[2],Thriller Action Mystery Adventure Horror
1,JANUARY,4,Rust Creek,IFC Films,Jen McGowan (director); Julie Lipson (screenpl...,[3],Thriller Drama
2,JANUARY,4,American Hangman,Hangman Justice Productions,Wilson Coneybeare (director/screenplay); Donal...,[4],Thriller
3,JANUARY,11,A Dog's Way Home,Columbia Pictures,Charles Martin Smith (director); W. Bruce Came...,[5],Drama Adventure Family
4,JANUARY,11,The Upside,STX Entertainment,Neil Burger (director); Jon Hartmere (screenpl...,[6],Comedy Drama
...,...,...,...,...,...,...,...
237,DECEMBER,25,Spies in Disguise,20th Century Fox / Blue Sky Studios / Chernin ...,"Nick Bruno, Troy Quane (directors); Brad Copel...",[132],Animation Action Adventure Comedy Family
238,DECEMBER,25,Little Women,Columbia Pictures / Regency Enterprises,Greta Gerwig (director/screenplay); Saoirse Ro...,[221],Drama Romance
239,DECEMBER,25,1917,Universal Pictures / DreamWorks Pictures / Ent...,Sam Mendes (director/screenplay); Krysty Wilso...,[222],War Drama Action Thriller
240,DECEMBER,25,Just Mercy,Warner Bros. Pictures / Participant Media,"Destin Daniel Cretton (director/screenplay), A...",[223],Drama Crime History


### Filtering the director and the top 3 actors of each movie 

In [10]:
# Stringify each "Cast and crew" entry, then extract the director with get_director.
df["director_name"] = df["Cast and crew"].map(str).map(get_director)

In [11]:
# Return to this once you have finished
#df["director_name"].unique()

In [12]:
# Stringify each "Cast and crew" entry, then extract the first actor with get_actor1.
df["actor_1_name"] = df["Cast and crew"].map(str).map(get_actor1)

In [13]:
# Return to this once you have finished
#df["actor_1_name"].unique()

In [14]:
# Stringify each "Cast and crew" entry, then extract the second actor with get_actor2.
df["actor_2_name"] = df["Cast and crew"].map(str).map(get_actor2)

In [15]:
# Return to this once you have finished
#df["actor_2_name"].unique()

In [16]:
# Stringify each "Cast and crew" entry, then extract the third actor with get_actor3.
df["actor_3_name"] = df["Cast and crew"].map(str).map(get_actor3)

### Feature selection

In [17]:
# Columns kept for the final dataset, in output order.
feature_list = [
    "Title",
    "genres",
    "director_name",
    "actor_1_name",
    "actor_2_name",
    "actor_3_name",
]

In [43]:
# Keep every row but only the selected feature columns, then display the result.
df_2019 = df.loc[:, feature_list]
df_2019

Unnamed: 0,Title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Escape Room,Thriller Action Mystery Adventure Horror,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll
1,Rust Creek,Thriller Drama,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan
2,American Hangman,Thriller,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis
3,A Dog's Way Home,Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos,Alexandra Shipp
4,The Upside,Comedy Drama,Neil Burger,Bryan Cranston,Kevin Hart,Nicole Kidman
...,...,...,...,...,...,...
237,Spies in Disguise,Animation Action Adventure Comedy Family,"Nick Bruno, Troy Quane",Will Smith,Tom Holland,Rashida Jones
238,Little Women,Drama Romance,Greta Gerwig,Saoirse Ronan,Emma Watson,Florence Pugh
239,1917,War Drama Action Thriller,Sam Mendes,George MacKay,Dean-Charles Chapman,Mark Strong
240,Just Mercy,Drama Crime History,Destin Daniel Cretton,Andrew Lanham (screenplay); Michael B. Jordan,Jamie Foxx,Brie Larson


### Renaming the title column and lower-casing movie titles to be consistent with previous datasets

In [19]:
# Rename the scraped "Title" column to the name used from here on.
df_2019 = df_2019.rename({"Title": "movie_title"}, axis="columns")

In [20]:
# Lower-case every title and write the result back into the same column.
lowercased_titles = df_2019["movie_title"].str.lower()
df_2019["movie_title"] = lowercased_titles

### Checking for missing values

In [21]:
# Count missing values per column.
# NOTE(review): the combined_info output further down contains literal "None"
# tokens, so missing cast members appear to be stored as the string "None",
# which this check does not count - confirm against the helper functions.
df_2019.isna().sum()

movie_title      0
genres           0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
dtype: int64

### Combine cast and genre columns for future steps in Tfidf vectorizer

In [23]:
# Build one space-separated text field per movie: director, three actors, then genres.
combined_info = df_2019["director_name"]
for column in ("actor_1_name", "actor_2_name", "actor_3_name", "genres"):
    combined_info = combined_info + " " + df_2019[column]
df_2019["combined_info"] = combined_info

In [26]:
# Show the distinct combined strings as a sanity check.
pd.unique(df_2019["combined_info"])

array(['Adam Robitel Taylor Russell Logan Miller Deborah Ann Woll Thriller Action Mystery Adventure Horror',
       "Jen McGowan Hermione Corfield Jay Paulson Sean O'Bryan Thriller Drama",
       'Wilson Coneybeare Donald Sutherland Vincent Kartheiser Oliver Dennis Thriller',
       'Charles Martin Smith Bryce Dallas Howard Edward James Olmos Alexandra Shipp Drama Adventure Family',
       'Neil Burger Bryan Cranston Kevin Hart Nicole Kidman Comedy Drama',
       'Jeffrey Nachmanoff Keanu Reeves None None Science Fiction Thriller',
       'Batán Silva Natalia Dyer Kyra Sedgwick Tim Daly Drama Family Thriller Science Fiction',
       'M. Night Shyamalan James McAvoy Bruce Willis Samuel L. Jackson Thriller Drama Science Fiction',
       'Vicky Jewson Noomi Rapace Sophie Nélisse Indira Varma Action Thriller',
       'Henry Dunham James Badge Dale Brian Geraghty None Drama Thriller Mystery',
       'Timothy Woodward Jr. Lin Shaye Michael Welch Melissa Bolona Horror',
       'Joe Cornish Lo