# Scraping 100 Movie Titles

### Imports

In [1]:
from gevent import monkey
monkey.patch_socket()
import grequests
import time
import re
from bs4 import BeautifulSoup
import pandas as pd

  with loop.timer(seconds, ref=ref) as t:


### Genre to be scraped

In [2]:
genre_list = ['action']

### Functions for scraping

In [3]:
# Creates the base urls for each genre and to be used for the scraping process
def generate_genre_urls():
    genre_urls = []
    for genre in genre_list:
        genre_urls.append(f'https://letterboxd.com/films/ajax/genre/{genre}')
    return genre_urls

# Grabs total number of films for each genre to calculate how many requests for each genre must be sent out
def get_film_num_by_genre():
    film_genre_counts = []
    urls = generate_genre_urls()
    page_reqs = (grequests.get(link) for link in urls)
    page_responses = grequests.map(page_reqs)
    for r in page_responses:
        film_page = BeautifulSoup(r.text, 'lxml')
        film_num = re.findall(r'are ([\d,]+)', str(film_page.find(
            'h2', class_='ui-block-heading').text))[0].replace(',', '')
        film_genre_counts.append(film_num)
    film_num_by_genre = create_genre_dict(film_genre_counts)
    return film_num_by_genre

# Creates a dictionary that separates the film urls by genre
def create_genre_dict(film_num):
    film_num_by_genre = {}
    for idx in range(len(genre_list)):
        film_num_by_genre.update({genre_list[idx]: film_num[idx]})
    return film_num_by_genre

# Creates list of all pages for each genre (ex. .../action/page/1/ THROUGH .../action/page/235/)
def generate_genre_page_urls(film_num_by_genre):
    final_genre_dict = {}
    for genre in genre_list:
        genre_urls = []
        total_pages = (int(film_num_by_genre[genre])//72) + 1
        url = f'https://letterboxd.com/films/ajax/genre/{genre}/size/small/page/'
        for page in range(total_pages):
            genre_urls.append(url+str(page))
        final_genre_dict.update({genre: genre_urls})
    return final_genre_dict

# Goes through every page of every film genre (72 films per page) and grabs the url fragment (eg. the-incredible-hulk) for each film
def get_film_titles():
    film_url_fragments = []
    film_num_by_genre = get_film_num_by_genre()
    genre_url_dict = generate_genre_page_urls(film_num_by_genre)
    for genre in genre_list:
        genre_start = time.perf_counter()
        print(f'Genre: {genre} ({film_num_by_genre[genre]} films, {(int(film_num_by_genre[genre])//72) + 1} pages)')
        page_reqs = (grequests.get(link) for link in genre_url_dict[genre])
        page_responses = grequests.map(page_reqs, size=75)
        for r in page_responses:
            film_page = BeautifulSoup(r.text, 'lxml')
            film_url_fragments += re.findall(r'data-film-slug="/film/([\w-]+)/"', str(film_page.find(
                'ul', class_='poster-list -p70 -grid')))
        genre_end = (time.perf_counter() - genre_start)
        print(f'Len: {len(film_url_fragments)} {genre_end.__round__(2)}s\n')
    return film_url_fragments

# Removes duplicate url fragments
def remove_duplicates(url_fragments):
    return list(dict.fromkeys(url_fragments))

# Combines letterboxd url with url fragments to form complete, requestable url
def create_full_film_urls(uniq_films):
    uniq_urls = []
    for url in uniq_films:
        uniq_urls.append("https://letterboxd.com/film/" + url)
    return uniq_urls

# Tries to grab all pieces of information from each url and returns info in a dictionary (for translating data to Pandas Dataframe later)
def find_info(url_response):
    try:
        film_url = re.findall(r'slug="/film/([\w-]+)/"', str(url_response.find('section', class_='poster-list -p230 -single no-hover el col')))[0]
    except Exception:
        film_url = None
    try:
        film_name = url_response.find('h1', class_='headline-1 js-widont prettify').text
    except Exception:
        film_name = None
    try:
        film_year = url_response.find('small', class_='number').text
    except Exception:
        film_year = None
    try:
        film_rating = re.findall(r'"ratingValue":(.+?),', str(url_response.find_all()))[0]
    except Exception:
        film_rating = None
    try:
        film_cast = re.findall(r'"/actor/([\w-]+)/"', str(url_response.find('div', class_='cast-list text-sluglist')))
    except Exception:
        film_cast = None
    try:
        film_director = url_response.find('span', class_='prettify').text
    except Exception:
        film_director = None
    try:
        film_studios = re.findall(r'studio/([\w-]+)/"', str(url_response.find('div', id='tab-details')))
    except Exception:
        film_studios = None
    try:
        film_country = re.findall(r'country/([\w-]+)/"', str(url_response.find('div', id='tab-details')))[0]
    except Exception:
        film_country = None
    try:
        film_language = re.findall(r'language/([\w-]+)/"', str(url_response.find('div', id='tab-details')))[0]
    except Exception:
        film_language = None
    try:
        film_genres = re.findall(r'genre/([\w-]+)/"', str(url_response.find('div', class_='text-sluglist capitalize')))
    except Exception:
        film_genres = None

    film_info = {
        "url": film_url,
        "name": film_name,
        "year": film_year,
        "rating": film_rating,
        "cast": film_cast,
        "director": film_director,
        "studios": film_studios,
        "country": film_country,
        "language": film_language,
        "genres": film_genres
    }
    return film_info

# Requests final list of urls and returns list of dictionaries
def send_requests(uniq_urls):
    formatted_url_data = []
    page_reqs = (grequests.get(link) for link in uniq_urls)
    page_responses = grequests.map(page_reqs, size=75)
    for r in page_responses:
        film_page = BeautifulSoup(r.text, 'lxml')
        formatted_url_data.append(find_info(film_page))
    return formatted_url_data

### Scrape all action movies titles (15-20 seconds)

In [4]:
start = time.perf_counter()

film_urls = get_film_titles()

fin = (time.perf_counter() - start)
print(f'Time to scrape and assemble url list: {fin.__round__(2)}s')

Genre: action (31373 films, 436 pages)
Len: 30993 16.16s

Time to scrape and assemble url list: 16.58s


### Display first ten action movie titles

In [5]:
film_urls[:10]

['inception',
 'spider-man-into-the-spider-verse',
 'the-dark-knight',
 'baby-driver',
 'inglourious-basterds',
 'avengers-infinity-war',
 'black-panther',
 'avengers-endgame',
 'mad-max-fury-road',
 'guardians-of-the-galaxy']

### Removes duplicate movie titles (movies with more than one genre)

In [6]:
start = time.perf_counter()

uniq = remove_duplicates(film_urls)
uniq_url_list = create_full_film_urls(uniq)

fin = (time.perf_counter() - start)
print(f'\nTime to clean list: {fin.__round__(2)}s, New List Length: {len(uniq_url_list)})')


Time to clean list: 0.01s, New List Length: 30921)


### Scrape info for first 100 titles (15-20 seconds)

In [7]:
start = time.perf_counter()

titles_to_scrape = uniq_url_list[:100]
kinodf = pd.DataFrame.from_records(send_requests(titles_to_scrape))

fin = (time.perf_counter() - start)
print(f'Time to process {len(titles_to_scrape)} titles: {fin.__round__(2)}s')

Time to process 100 titles: 14.1s


### Display DataFrame containing scraped film info

In [8]:
kinodf

Unnamed: 0,url,name,year,rating,cast,director,studios,country,language,genres
0,inception,Inception,2010,4.17,"[leonardo-dicaprio, joseph-gordon-levitt, ken-...",Christopher Nolan,"[legendary-pictures-1, syncopy, warner-bros-pi...",uk,english,"[science-fiction, action, adventure]"
1,spider-man-into-the-spider-verse,Spider-Man: Into the Spider-Verse,2018,4.43,"[shameik-moore, jake-johnson-1, hailee-steinfe...",Rodney Rothman,"[sony-pictures, columbia-pictures, marvel-ente...",usa,english,"[action, adventure, animation, science-fiction..."
2,the-dark-knight,The Dark Knight,2008,4.42,"[christian-bale, heath-ledger, michael-caine, ...",Christopher Nolan,"[dc-comics, legendary-pictures-1, syncopy, iso...",uk,english,"[action, thriller, drama, crime]"
3,baby-driver,Baby Driver,2017,3.79,"[ansel-elgort, kevin-spacey, lily-james, jon-h...",Edgar Wright,"[big-talk-productions, working-title-films]",uk,english,"[crime, action]"
4,inglourious-basterds,Inglourious Basterds,2009,4.28,"[brad-pitt, melanie-laurent, christoph-waltz, ...",Quentin Tarantino,"[the-weinstein-company, universal-pictures, a-...",germany,english,"[thriller, drama, action, war]"
...,...,...,...,...,...,...,...,...,...,...
95,x-men-days-of-future-past,X-Men: Days of Future Past,2014,3.57,"[hugh-jackman, james-mcavoy, michael-fassbende...",Bryan Singer,"[marvel-entertainment, bad-hat-harry-productio...",uk,english,"[adventure, action, fantasy, science-fiction]"
96,casino-royale-2006,Casino Royale,2006,3.92,"[daniel-craig, eva-green, mads-mikkelsen, judi...",Martin Campbell,"[eon-productions, stillking-films, columbia-pi...",bahamas-the,english,"[action, adventure, thriller]"
97,okja,Okja,2017,3.69,"[ahn-seo-hyun, tilda-swinton, paul-dano, jake-...",Bong Joon-ho,"[kate-street-picture-company, plan-b-entertain...",south-korea,english,"[action, adventure, drama, science-fiction]"
98,terminator-2-judgment-day,Terminator 2: Judgment Day,1991,4.21,"[arnold-schwarzenegger, linda-hamilton, edward...",James Cameron,"[tristar-pictures-1, studiocanal, valhalla-ent...",usa,english,"[thriller, action, science-fiction]"


  with loop.timer(seconds, ref=ref) as t:
