In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
import datetime
import pytz
import scrapy
from scrapy.crawler import CrawlerProcess
import pickle
import json

# Validate movie data
- To validate the movie data collected, I will check to make sure that all of the best Black movies are present. 
- To do this I will scrape two webpages that present a list of the best Black movies (Rotten Tomatoes and IMDb). 
- I will then merge these lists of titles and compare it to the current list of titles in the movie data dataframe. 
- If several movie titles are missing, I will search for the possible cause and amend the dataset accordingly.  

## 1. Scrape movie titles and merge lists

In [None]:
# Scrape Rotten Tomatoes' "123 Best Black Movies of the 21st Century" webpage 

# Initialise empty list to store scraped titles
scraped_movie_list_1 = []  

page_url = "https://editorial.rottentomatoes.com/guide/best-black-movies-21st-century/"
    
# Create the Spider Class
class SCRAPE123movies(scrapy.Spider):
    name = "123movies"

    def start_requests(self):
        yield scrapy.Request(url=page_url, callback=self.parse)
    
    def parse(self, response):
        # Extract the 123 movie titles
        movie_titles = response.xpath('//h2/a/text()')
        for title in movie_titles.extract():
            scraped_movie_list_1.append(title.strip(" \n"))

process = CrawlerProcess()
process.crawl(SCRAPE123movies)
process.start()

In [None]:
# Check scraped_movie_list_1
scraped_movie_list_1 

In [None]:
# Remove last two elements that are not film titles
scraped_movie_list_1 = scraped_movie_list_1[:-2]
scraped_movie_list_1

In [9]:
# Save scraped_movie_list_1
pd.DataFrame(scraped_movie_list_1).to_pickle('scraped_movie_list_1.pkl')

In [None]:
# Scrape IMDb's 'The 100 best Black / urban movies of all time' webpage

# Initialise empty list to store scraped titles
scraped_movie_list_2 = []

page_url = "https://www.imdb.com/list/ls054431555/?sort=list_order,asc&st_dt=&mode=simple&page=1&ref_=ttls_vw_smp"

# Create the Spider Class
class SCRAPEIMDbBestBlack(scrapy.Spider):
    name = "IMDbBestBlack"

    def start_requests(self):
        yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        # Extract the movie titles
        movie_titles = response.xpath('//a/img/@alt')
        for title in movie_titles.extract():
            scraped_movie_list_2.append(title.strip(" \n"))

process = CrawlerProcess()
process.crawl(SCRAPEIMDbBestBlack)
process.start()

In [None]:
# Check scraped_movie_list_2
scraped_movie_list_2

In [8]:
print(scraped_movie_list_2[-10:])

# Remove last 5 elements
scraped_movie_list_2 = scraped_movie_list_2[:-5]
scraped_movie_list_2

["Big Momma's House 2", 'Nutty Professor II: The Klumps', 'ATL', "Get Rich or Die Tryin'", 'State Property: Blood on the Streets', 'list image', 'list image', 'list image', 'list image', 'list image']


In [10]:
# Save scraped_movie_list_2
pd.DataFrame(scraped_movie_list_2).to_pickle("scraped_movie_list_2.pkl")

In [24]:
# Merge both lists
list1 = pd.read_pickle("scraped_movie_list_1.pkl")
list2 = pd.read_pickle("scraped_movie_list_2.pkl")
merged_list = pd.concat([list1, list2])[0].to_list()

In [None]:
print(f'The length of the merged list is: {len(merged_list)}')
merged_list

## 2. Compare with current movie data dataframe

In [29]:
# Load movie data
movie_data_df = pd.read_pickle("movie_data_df_2.pkl")
movie_data_df.head()

Unnamed: 0,imdbID,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Lead_Actors,...,Total_Awards_Lead_Actors,Total_Awards_Supporting_Actors,Total_Awards_Movie_Cast,Total_Awards_Director,Total_Awards_Writer,Total_Awards_Movie_Crew,Total_Awards_Soundtrack_Credits,Black_Lead_Proportion,Black_Support_Proportion,Black_Cast_Proportion
0,tt0056952,The Cool World,1963,,1964-11-02 00:00:00+00:00,105,"[Crime, Drama]",[SHIRLEY CLARKE],"[SHIRLEY CLARKE, CARL LEE, WARREN MILLER]","[RONY CLANTON, CARL LEE, YOLANDA RODRÍGUEZ]",...,0,0,0,0,0,2,0,0.0,0.2,0.15
1,tt0065944,King: A Filmed Record... Montgomery to Memphis,1970,,1970-03-24 00:00:00+00:00,185,"[Documentary, Biography, History]","[SIDNEY LUMET, JOSEPH L. MANKIEWICZ]","[MITCHELL GRAYSON, ELY A. LANDAU]","[PAUL NEWMAN, JOANNE WOODWARD, RUBY DEE, JAMES...",...,10,15,25,1,0,4,0,0.5,0.08,0.18
2,tt0066559,What Do You Say to a Naked Lady?,1970,X,1970-02-18 00:00:00+00:00,85,"[Documentary, Comedy]",[ALLEN FUNT],[ALLEN FUNT],"[JOIE ADDISON, LAURA HUSTON, MARTIN MEYERS, KA...",...,0,0,0,0,0,0,0,0.0,0.12,0.08
3,tt0067741,Shaft,1971,R,1971-07-02 00:00:00+00:00,100,"[Action, Crime, Thriller]",[GORDON PARKS],"[ERNEST TIDYMAN, JOHN D.F. BLACK, ERNEST TIDYMAN]","[RICHARD ROUNDTREE, MOSES GUNN, CHARLES CIOFFI...",...,0,0,0,0,0,3,0,0.5,0.0,0.13
4,tt0068358,Charley-One-Eye,1973,R,1973-04-18 00:00:00+00:00,96,[Western],[DON CHAFFEY],[KEITH LEONARD],"[RICHARD ROUNDTREE, ROY THINNES, NIGEL DAVENPO...",...,1,0,1,0,0,2,0,0.25,0.0,0.12


In [None]:
current_list = movie_data_df["Title"].to_list()
current_list

In [50]:
# Normalise titles in both lists
current_list = [title.strip().lower() for title in current_list]
merged_list = [title.strip().lower() for title in merged_list]

# Get the difference in titles
list_difference = [title for title in merged_list if title not in current_list]
print(f'There seems to be {len(list_difference)} titles missing from the movie data.') 

There seems to be 144 titles missing from the movie data.


In [52]:
list_difference

['13th',
 '20 feet from stardom',
 'akeelah and the bee',
 'all about the benjamins',
 'all in: the fight for democracy',
 'amazing grace',
 'antwone fisher',
 'are we there yet?',
 'atl',
 'attica',
 'b*a*p*s',
 'baadasssss!',
 'bad boys',
 'bad boys for life',
 'bad boys ii',
 'bad trip',
 'beasts of the southern wild',
 'beats, rhymes & life: the travels of a tribe called quest',
 'beauty shop',
 "big momma's house 2",
 'black dynamite',
 'black is king',
 'blackkklansman',
 'blindspotting',
 'booty call',
 'burning cane',
 'candyman',
 'cb4',
 'chi-raq',
 'class act',
 'clemency',
 'copshop',
 "dave chappelle's block party",
 'dear white people',
 'dolemite is my name',
 "don't be a menace to south central while drinking your juice in the hood",
 'dreamgirls',
 'drumline',
 'drumline',
 "eve's bayou",
 'farewell amor',
 'fast color',
 'foxy brown',
 'fresh',
 'fruitvale station',
 'george washington',
 'get on up',
 'get out',
 'ghost dad',
 'god grew tired of us',
 'good hair',
 '