Let's get started! 

As always, we start by importing our libraries. 

In [1]:
from bs4 import BeautifulSoup as bs
import json
import matplotlib as mpl
import pandas
import re
import requests
import seaborn as sns
import unicodedata as uni

First, let's setup the URLs that'll be driving our data analytics project here.

In [2]:
base_url = 'https://letterboxd.com/'
account_name = 'JoshLinneburg'
ratings_url = base_url + account_name + '/films/ratings/by/rating/'

And let's just scrape the webpage of interest; the films by rating on my account.

In [3]:
response = requests.get(ratings_url).text
soup = bs(response)

First thing we're going to do: Figure out how to itemize the films that have been rated *on a single page*

In [4]:
soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')

[<li class="poster-container">
 <div class="poster film-poster really-lazy-load" data-film-id="50713" data-film-slug="/film/the-empire-strikes-back/" data-image-height="225" data-image-width="150" data-linked="linked" data-menu="menu" data-target-link="/film/the-empire-strikes-back/" data-target-link-target=""> <img alt="The Empire Strikes Back" class="image" height="225" src="https://s.ltrbxd.com/static/img/empty-poster-150.469d00af.png" width="150"/><span class="frame"><span class="frame-title"></span></span> </div>
 <p class="poster-viewingdata">
 <span class="rating rated-10"> ★★★★★ </span>
 <time class="localtime-mmm-dd" datetime="2019-12-06T04:24:11Z"></time>
 </p>
 </li>, <li class="poster-container">
 <div class="poster film-poster really-lazy-load" data-film-id="51896" data-film-slug="/film/the-dark-knight/" data-image-height="225" data-image-width="150" data-linked="linked" data-menu="menu" data-target-link="/film/the-dark-knight/" data-target-link-target=""> <img alt="The Da

Looks like we get a ranking (in star characters, will need to clean that up to use it later on), along with a link to the film's page itself where we will be able to access more info (actors, director, etc.). 

Okay, how do we attack the specific elements of a given entry in this list so we can extract the movie's URL and star rating?

In [5]:
movies_on_page = soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')

In [6]:
for i in range(len(movies_on_page)):
    print(movies_on_page[i].find('div', class_='poster film-poster really-lazy-load')['data-target-link'])
    print(movies_on_page[i].find('p').find('span').text.strip())

/film/the-empire-strikes-back/
★★★★★
/film/the-dark-knight/
★★★★★
/film/the-shawshank-redemption/
★★★★★
/film/the-lord-of-the-rings-the-return-of-the-king/
★★★★½
/film/the-lord-of-the-rings-the-fellowship-of-the-ring/
★★★★★
/film/star-wars/
★★★★½
/film/the-lion-king/
★★★★½
/film/the-lord-of-the-rings-the-two-towers/
★★★★½
/film/raiders-of-the-lost-ark/
★★★★½
/film/toy-story/
★★★★★
/film/terminator-2-judgment-day/
★★★★★
/film/mad-max-fury-road/
★★★★★
/film/the-departed/
★★★★★
/film/saving-private-ryan/
★★★★★
/film/the-iron-giant/
★★★★½
/film/die-hard/
★★★★★
/film/jurassic-park/
★★★★
/film/blade-runner/
★★★★★


Looks like we're able to get the user's rating and a URL for the film so we'll be able to get more info about it.

Next up: Let's start scraping from a movie's page on here so we can access that detailed info about the film. Let's start with attributes that are on the Letterboxd website (title, actors, director, audience rating, genres) and then we can get crazy with items we'll need to scrape IMDb for (box office, release date, awards, etc.).

We'll use *The Fellowship of the Ring* as our working example here.

In [7]:
film_to_scrape = '/film/the-lord-of-the-rings-the-fellowship-of-the-ring/'
response = requests.get(base_url+film_to_scrape).text
soup = bs(response)

Let's figure out how we can grab the 

In [27]:
available_role_list

NameError: name 'available_role_list' is not defined

In [39]:
# Categories (Director, Producers, Writers, etc.) of crewmembers available
crew_roles_avail = soup.find('div', class_='tabbed-content-block column-block').findAll('span')

# Same information in a list without HTML tags
crew_roles_avail_list = [tag.text.lower() for tag in crew_roles_avail]

# Init the crew list
crew_list = []

for i in range(len(crew_roles_avail_list)):
    crew_dict = {}
    
    # Role is just whatever role we're on 
    crew_dict['crew_role'] = crew_roles_avail_list[i]
    
    # Find the list of names and URLs for a given role
    crew_attributes_list = soup.find('div', class_='tabbed-content-block column-block').findAll('div')[i].findAll('a')
    
    # Init an "inner" list - a list of dictionaries containing the name and URL of each crewmember
    inner_crew_list = [] 
    
    for i in range(len(crew_attributes_list)):
        inner_crew_dict = {}
        
        # Crewmember name
        inner_crew_dict['name'] = crew_attributes_list[i].text
        
        # Crewmember URL
        inner_crew_dict['url'] = crew_attributes_list[i]['href']
        
        # Append this inner list to the inner dictionary
        inner_crew_list.append(inner_crew_dict)
    
    # Add our completed inner list to the outer dictionary
    crew_dict['crew_attributes'] = inner_crew_list
    
    # Append to our growing list
    crew_list.append(crew_dict)

In [45]:
crew_list

[{'crew_role': 'director',
  'crew_attributes': [{'name': 'Peter Jackson',
    'url': '/director/peter-jackson/'}]},
 {'crew_role': 'producers',
  'crew_attributes': [{'name': 'Barrie M. Osborne',
    'url': '/producer/barrie-m-osborne/'},
   {'name': 'Peter Jackson', 'url': '/producer/peter-jackson/'},
   {'name': 'Bob Weinstein', 'url': '/producer/bob-weinstein/'},
   {'name': 'Harvey Weinstein', 'url': '/producer/harvey-weinstein/'},
   {'name': 'Mark Ordesky', 'url': '/producer/mark-ordesky/'},
   {'name': 'Michael Lynne', 'url': '/producer/michael-lynne/'},
   {'name': 'Fran Walsh', 'url': '/producer/fran-walsh/'},
   {'name': 'Robert Shaye', 'url': '/producer/robert-shaye/'},
   {'name': 'Tim Sanders', 'url': '/producer/tim-sanders/'}]},
 {'crew_role': 'writers',
  'crew_attributes': [{'name': 'Peter Jackson',
    'url': '/writer/peter-jackson/'},
   {'name': 'J.R.R. Tolkien', 'url': '/writer/jrr-tolkien/'},
   {'name': 'Fran Walsh', 'url': '/writer/fran-walsh/'},
   {'name': 'Ph

Let's grab more information about the movie first. How about the cast?

In [9]:
# Let's maybe limit this to the first 25 cast members
max_cast_returned = 25
available_cast = soup.find('div', class_='cast-list text-sluglist').findAll('a', class_='text-slug tooltip')

cast_list = []

for i in range(min(len(available_cast), max_cast_returned)):
    cast_dict = {}
    cast_dict['actor_name'] = available_cast[i].text
    cast_dict['actor_role'] = available_cast[i]['title']
    cast_dict['actor_url'] = available_cast[i]['href']
    cast_list.append(cast_dict)

In [46]:
cast_list

[{'actor_name': 'Elijah Wood',
  'actor_role': 'Frodo Baggins',
  'actor_url': '/actor/elijah-wood/'},
 {'actor_name': 'Ian McKellen',
  'actor_role': 'Gandalf the Grey',
  'actor_url': '/actor/ian-mckellen/'},
 {'actor_name': 'Viggo Mortensen',
  'actor_role': 'Aragorn',
  'actor_url': '/actor/viggo-mortensen/'},
 {'actor_name': 'Sean Astin',
  'actor_role': 'Samwise "Sam" Gamgee',
  'actor_url': '/actor/sean-astin/'},
 {'actor_name': 'Liv Tyler',
  'actor_role': 'Arwen Evenstar',
  'actor_url': '/actor/liv-tyler/'},
 {'actor_name': 'Orlando Bloom',
  'actor_role': 'Legolas',
  'actor_url': '/actor/orlando-bloom/'},
 {'actor_name': 'John Rhys-Davies',
  'actor_role': 'Gimli',
  'actor_url': '/actor/john-rhys-davies/'},
 {'actor_name': 'Dominic Monaghan',
  'actor_role': 'Meriadoc "Merry" Brandybuck',
  'actor_url': '/actor/dominic-monaghan/'},
 {'actor_name': 'Billy Boyd',
  'actor_role': 'Peregrin "Pippin" Took',
  'actor_url': '/actor/billy-boyd/'},
 {'actor_name': 'Sean Bean',
  'a

Let's get the title next and then the average rating among all reviewers.

In [10]:
movie_title = soup.find('h1', class_='headline-1 js-widont prettify').text.strip()

In [11]:
small_soup = bs(requests.get(base_url + soup.find('aside', class_='sidebar').find('div', class_='js-csi')['data-src']).text)
movie_avg_rating = small_soup.find('span', class_='average-rating').find('a').text