Let's get started! 

As always, we start by importing the libraries we'll need for this little exercise. 

In [98]:
from bs4 import BeautifulSoup as bs
from functools import reduce
from pandas.io.json import json_normalize
import json
import matplotlib as mpl
import pandas as pd
import re
import requests
import seaborn as sns
import unicodedata as uni
import os

And we'll set up some constants that will be driving our data analytics project here. The base URL for the website won't change, and we'll be using my username as the running example. 

In [2]:
# Site root (with trailing slash) and the account used as the running example
base_url = 'https://letterboxd.com/'
account_name = 'JoshLinneburg'

# Page listing the account's rated films, ordered by rating
ratings_url = '{0}{1}/films/ratings/by/rating/'.format(base_url, account_name)

Let's start by scraping the webpage of interest: films by rating on my account.

In [3]:
def get_html_soup(url, parser='html.parser', timeout=30):
    
    '''
    
    Downloads a web page and parses it into a BeautifulSoup object.
    
    Parameters:
        url (str): Full URL of the page to download.
        
        parser (str): Optional parameter; name of the parser BeautifulSoup should use. 
        Naming the parser explicitly avoids bs4's "no parser was explicitly specified" warning and 
        keeps the parse independent of which optional parsers happen to be installed.
        
        timeout (int/float): Optional parameter; number of seconds requests waits on the server before giving up.
        
    Returns:
        soup (bs4 BeautifulSoup): Parsed HTML of the page.
        
    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    
    '''
    
    # Without a timeout a stalled connection would block the notebook indefinitely
    response = requests.get(url, timeout=timeout).text
    soup = bs(response, parser)
    return soup

In [4]:
soup = get_html_soup(ratings_url)

First thing we're going to do: Figure out how to itemize the films that have been rated *on a single page*.

Start by splitting the page up into a list of movies.

In [5]:
movies_on_page = soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')

Now to extract the stars and the URL from this HTML.

In [6]:
# Preview at most three films (fewer if the page is shorter than that)
movies_to_show = min(len(movies_on_page), 3)

for i, movie_item in enumerate(movies_on_page[:movies_to_show]):
    
    # The poster image's alt text is used as the movie name
    movie_name = movie_item.find('img')['alt'].strip()
    
    # Relative link to the movie, stored on the poster <div>
    movie_url = movie_item.find('div', class_='poster film-poster really-lazy-load')['data-target-link']
    
    # Star rating exactly as it is displayed (e.g. '★★★★½')
    user_movie_star_rating = movie_item.find('p').find('span').text.strip()
    
    # Numeric rating: each '★' contributes 1.0 and a '½' contributes 0.5
    user_movie_nbr_rating = sum(
        float(symbol.replace('★', '1.0').replace('½', '0.5'))
        for symbol in user_movie_star_rating
    )
    
    # Report what was extracted for this film
    print(
        'User {0} gave {1}, which can be accessed here: {2}, a rating of {3} which translates to {4} on a numeric scale'
        .format(account_name, movie_name, movie_url, user_movie_star_rating, user_movie_nbr_rating)
    )
    
    # After the final previewed film (i is zero-based, hence the +1), note that the rest is hidden
    if i + 1 == movies_to_show:
        print('\nRemaining results hidden, you get the idea. \n...')

User JoshLinneburg gave Chernobyl, which can be accessed here: /film/chernobyl/, a rating of ★★★★½ which translates to 4.5 on a numeric scale
User JoshLinneburg gave The Empire Strikes Back, which can be accessed here: /film/the-empire-strikes-back/, a rating of ★★★★★ which translates to 5.0 on a numeric scale
User JoshLinneburg gave The Shawshank Redemption, which can be accessed here: /film/the-shawshank-redemption/, a rating of ★★★★★ which translates to 5.0 on a numeric scale

Remaining results hidden, you get the idea. 
...


In [7]:
def get_user_movie_rating(html_soup):
    
    '''
    
    Reads the star rating a user gave a film from that film's entry on the user's ratings page
    (https://letterboxd.com/YOURACCOUNTHERE/films/ratings/) and converts it to a number.
    
    Parameters:
        html_soup (bs4 BeautifulSoup): A single film entry from the ratings page, i.e. one item of
        soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')
        
    Returns:
        movie_rating (float): Sum of the star symbols, counting each '★' as 1.0 and '½' as 0.5.
        
    Example:
        url = 'https://letterboxd.com/joshlinneburg/films/ratings/'
        soup = get_html_soup(url)
        movies_on_page = soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')
        get_user_movie_rating(movies_on_page[0])
        
        ...
        
        movie_rating = 5.0
    
    '''
    
    # Text of the first <span> inside the entry's first <p>, e.g. '★★★★½'
    star_text = html_soup.find('p').find('span').text.strip()
    
    # Score each character on its own, then add the scores up
    symbol_values = (float(symbol.replace('★', '1.0').replace('½', '0.5')) for symbol in star_text)
    return sum(symbol_values)

In [8]:
get_user_movie_rating(movies_on_page[0])

4.5

In [9]:
def get_movie_url(html_soup):
    
    '''
    
    Reads the link to a film from that film's entry on a user's ratings page
    (https://letterboxd.com/YOURACCOUNTHERE/films/ratings/).
    
    Parameters:
        html_soup (bs4 BeautifulSoup): A single film entry from the ratings page, i.e. one item of
        soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')
    
    Returns:
        movie_url (str): Value of the poster's 'data-target-link' attribute. 
        Note: This is a site-relative path starting at /film/; it does not include 'https://letterboxd.com'.
    
    Example:
        url = 'https://letterboxd.com/joshlinneburg/films/ratings/'
        soup = get_html_soup(url)
        movies_on_page = soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')
        get_movie_url(movies_on_page[0])
        
        ...
        
        movie_url = /film/the-empire-strikes-back/
    
    '''
    
    # The poster <div> carries the film link as a data attribute
    poster_div = html_soup.find('div', class_='poster film-poster really-lazy-load')
    return poster_div['data-target-link']

In [10]:
get_movie_url(movies_on_page[0])

'/film/chernobyl/'

Next up: Let's start scraping from a movie's page on here so we can access that detailed info about the film. Let's start with attributes that are on the Letterboxd website (title, actors, director, audience rating, genres) and then we can get crazy with items we'll need to scrape IMDb for (box office, release date, awards, etc.).

We'll use *The Fellowship of the Ring* as our working example here.

In [11]:
lotr_fotr_url = 'https://letterboxd.com//film/the-lord-of-the-rings-the-fellowship-of-the-ring/'
lotr_fotr_url

'https://letterboxd.com//film/the-lord-of-the-rings-the-fellowship-of-the-ring/'

Next up: scraping some data about the crew on a film. 

In [12]:
def get_crew_data(movie_url):
    
    '''
    
    Scrapes the crew section of a movie page on letterboxd.com and returns it as a nested dictionary.
    
    Parameters:
        movie_url (str): URL to a film on letterboxd.com.
        Note: Must be the full path of the URL including 'https://letterboxd.com/'
        
    Returns:
        crew_dict (dict): Dictionary with three keys:
            'total_crew' (int): Number of crew links found across all roles.
            'total_unique_crew' (int): Number of distinct crew names across all roles.
            'crew_list' (list): One dictionary per role with the keys 'crew_role' (lower-cased role label), 
            'number_assigned' (int) and 'crew_attributes' (list of {'name': ..., 'url': ...} dictionaries).
        
    Example: 
        movie_url = 'https://letterboxd.com/film/the-lord-of-the-rings-the-fellowship-of-the-ring/'
        get_crew_data(movie_url)
        
        ...
        
        Output:
        {'total_crew': 73,
         'total_unique_crew': 70,
         'crew_list': [
                       {'crew_role': 'director',
                        'number_assigned': 1,
                        'crew_attributes': [
                                            {'name': 'Peter Jackson',
                                             'url': '/director/peter-jackson/'}
                                           ]
                       }, 
                       
                       ... 
                      
                      ]
        }
    
    '''
    
    soup = get_html_soup(movie_url)
    
    # Block holding the crew information; its <span> tags are the role labels
    crew_block = soup.find('div', class_='tabbed-content-block column-block')
    role_labels = [span.text.lower() for span in crew_block.findAll('span')]
    
    # The i-th <div> inside the block is paired with the i-th role label
    role_divs = crew_block.findAll('div')
    
    crew_list = []
    all_crew_names = []
    
    for position, role_name in enumerate(role_labels):
        
        # Links (name + URL) of everyone listed under this role
        crew_links = role_divs[position].findAll('a')
        
        members = []
        for link in crew_links:
            all_crew_names.append(link.text)
            members.append({'name': link.text, 'url': link['href']})
        
        crew_list.append({
            'crew_role': role_name,
            'number_assigned': len(crew_links),
            'crew_attributes': members,
        })
    
    # A person credited under several roles counts once in the unique total
    return {
        'total_crew': len(all_crew_names),
        'total_unique_crew': len(set(all_crew_names)),
        'crew_list': crew_list,
    }

In [13]:
crew_dict = get_crew_data(lotr_fotr_url)
crew_dict

{'total_crew': 73,
 'total_unique_crew': 70,
 'crew_list': [{'crew_role': 'director',
   'number_assigned': 1,
   'crew_attributes': [{'name': 'Peter Jackson',
     'url': '/director/peter-jackson/'}]},
  {'crew_role': 'producers',
   'number_assigned': 9,
   'crew_attributes': [{'name': 'Barrie M. Osborne',
     'url': '/producer/barrie-m-osborne/'},
    {'name': 'Peter Jackson', 'url': '/producer/peter-jackson/'},
    {'name': 'Bob Weinstein', 'url': '/producer/bob-weinstein/'},
    {'name': 'Harvey Weinstein', 'url': '/producer/harvey-weinstein/'},
    {'name': 'Mark Ordesky', 'url': '/producer/mark-ordesky/'},
    {'name': 'Michael Lynne', 'url': '/producer/michael-lynne/'},
    {'name': 'Fran Walsh', 'url': '/producer/fran-walsh/'},
    {'name': 'Robert Shaye', 'url': '/producer/robert-shaye/'},
    {'name': 'Tim Sanders', 'url': '/producer/tim-sanders/'}]},
  {'crew_role': 'writers',
   'number_assigned': 4,
   'crew_attributes': [{'name': 'Peter Jackson',
     'url': '/writer/pe

How about some prototype code for when we need to convert this into a Pandas dataframe? Let's assume we only care about a few different crew roles and only want the "main" person (the first person listed) in that role returned. How would we do something like this? 

In [14]:
# Roles to keep; matching is by substring, so 'producer' also matches the role label 'producers'
crew_wanted = ['director', 'producer', 'writer', 'composer']

condensed_crew_dict = {}
condensed_crew_list = crew_dict['crew_list']

for i, role_entry in enumerate(condensed_crew_list):
    for wanted_role in crew_wanted:
        if wanted_role not in role_entry['crew_role']:
            continue
        
        # The first person listed under the role is treated as the "main" one
        lead_name = role_entry['crew_attributes'][0]['name']
        condensed_crew_dict[wanted_role] = lead_name
        print(lead_name + ': ' + wanted_role + ' ' + str(i))
            
condensed_crew_dict

Peter Jackson: director 0
Barrie M. Osborne: producer 1
Peter Jackson: writer 2
Howard Shore: composer 7


{'director': 'Peter Jackson',
 'producer': 'Barrie M. Osborne',
 'writer': 'Peter Jackson',
 'composer': 'Howard Shore'}

Seems like that would work!

Let's grab more information about the movie... How about the cast?

In [15]:
def get_cast_data(movie_url, max_cast_to_return = 15):
    
    '''
    
    Given a URL to a movie on letterboxd.com, this function returns a nested dictionary containing data about the cast 
    that played in the movie. 
    
    Parameters:
        movie_url (str): URL to a film on letterboxd.com.
        Note: Must be the full path of the URL including 'https://letterboxd.com/'
        
        max_cast_to_return (int): Max number of cast members to include in output. Default is 15. 
        
    Returns:
        cast_dict (dict): Dictionary with two keys:
            'total_cast' (int): Number of cast links found on the page (not capped by max_cast_to_return).
            'cast_list' (list): Up to max_cast_to_return dictionaries with the keys 'actor_seq_nbr' (position on 
            the page, starting at 0), 'actor_name', 'actor_role' (the link's title attribute) and 'actor_url'.
            A cast link that cannot be read is reported with print() and left out of the list.
        
    Example: 
        movie_url = 'https://letterboxd.com/film/the-lord-of-the-rings-the-fellowship-of-the-ring/'
        get_cast_data(movie_url)
        
        ...
        
        Output: 
        {'total_cast': 80,
         'cast_list': [
                       {'actor_seq_nbr': 0,
                        'actor_name': 'Elijah Wood',
                        'actor_role': 'Frodo Baggins',
                        'actor_url': '/actor/elijah-wood/'
                       },

                       {'actor_seq_nbr': 1,
                        'actor_name': 'Ian McKellen',
                        'actor_role': 'Gandalf the Grey',
                        'actor_url': '/actor/ian-mckellen/'
                       },
                       
                       ...
                      
                      ]
        }    
    
    '''
    
    soup = get_html_soup(movie_url)
    
    available_cast = soup.find('div', class_='cast-list text-sluglist').findAll('a', class_='text-slug tooltip')

    cast_list = []

    for i in range(min(len(available_cast), max_cast_to_return)):
        
        cast_link = available_cast[i]
        
        try:
            cast_attr_dict = {
                'actor_seq_nbr': i,
                'actor_name': cast_link.text,
                'actor_role': cast_link['title'],
                'actor_url': cast_link['href'],
            }
        except Exception as error:
            # Best effort: skip the entry, but say so instead of silently dropping it
            string_to_print = 'Failed on the ' + str(i) + "'th execution of the loop.\n"
            string_to_print = string_to_print + 'The movie passed in:' + movie_url +'.\n'
            string_to_print = string_to_print + 'Reason: ' + repr(error) + '\n'
            print(string_to_print)
            continue
        
        cast_list.append(cast_attr_dict)

    # total_cast counts every castmember on the page, cast_list only the ones returned
    cast_dict = {
        'total_cast': len(available_cast),
        'cast_list': cast_list,
    }
    
    return cast_dict

In [16]:
cast_dict = get_cast_data(lotr_fotr_url)
cast_dict

{'total_cast': 80,
 'cast_list': [{'actor_seq_nbr': 0,
   'actor_name': 'Elijah Wood',
   'actor_role': 'Frodo Baggins',
   'actor_url': '/actor/elijah-wood/'},
  {'actor_seq_nbr': 1,
   'actor_name': 'Ian McKellen',
   'actor_role': 'Gandalf the Grey',
   'actor_url': '/actor/ian-mckellen/'},
  {'actor_seq_nbr': 2,
   'actor_name': 'Viggo Mortensen',
   'actor_role': 'Aragorn',
   'actor_url': '/actor/viggo-mortensen/'},
  {'actor_seq_nbr': 3,
   'actor_name': 'Sean Astin',
   'actor_role': 'Samwise "Sam" Gamgee',
   'actor_url': '/actor/sean-astin/'},
  {'actor_seq_nbr': 4,
   'actor_name': 'Liv Tyler',
   'actor_role': 'Arwen Evenstar',
   'actor_url': '/actor/liv-tyler/'},
  {'actor_seq_nbr': 5,
   'actor_name': 'Orlando Bloom',
   'actor_role': 'Legolas',
   'actor_url': '/actor/orlando-bloom/'},
  {'actor_seq_nbr': 6,
   'actor_name': 'John Rhys-Davies',
   'actor_role': 'Gimli',
   'actor_url': '/actor/john-rhys-davies/'},
  {'actor_seq_nbr': 7,
   'actor_name': 'Dominic Monagha

We can probably do something similar to above and only return the first 3, 5, 10, etc. members of the cast when we go to toss this into a Pandas dataframe.

In [82]:
# Print the first five cast members; slicing (rather than indexing 0..4) 
# avoids an IndexError when fewer than five were scraped
for cast_member in cast_dict['cast_list'][:5]:
    print(cast_member['actor_name'])

Elijah Wood
Ian McKellen
Viggo Mortensen
Sean Astin
Liv Tyler


Let's get the title next.

In [17]:
def get_movie_title(movie_url):
    
    '''
    Fetches a movie page on letterboxd.com and returns the movie's title.
    
    Parameters:
        movie_url (str): URL to a film on letterboxd.com.
        
    Returns:
        movie_title (str): Text of the page's headline <h1>, with surrounding whitespace removed.
    
    '''
    
    page = get_html_soup(movie_url)
    
    # The title is the text of the headline element
    headline = page.find('h1', class_='headline-1 js-widont prettify')
    return headline.text.strip()

In [18]:
movie_title = get_movie_title(lotr_fotr_url)
movie_title

'The Lord of the Rings: The Fellowship of the Ring'

And we'll also grab the movie genres while we're at it too - this one is going to return a list of genres instead of a single value or a dictionary. Again, a good candidate for breaking this out into multiple columns in a dataframe. 

In [19]:
def get_movie_genres(movie_url):
    
    '''
    Fetches a movie page on letterboxd.com and returns the genres listed for the movie.
    
    Parameters:
        movie_url (str): URL to a film on letterboxd.com.
        
    Returns:
        movie_genres_list (list): Text of every link found in the page's first
        'text-sluglist capitalize' block, e.g. ['action', 'adventure', 'fantasy'].
    
    '''
    
    genre_block = get_html_soup(movie_url).find('div', class_='text-sluglist capitalize')
    
    # One entry per link in the block, in page order
    movie_genres_list = []
    for genre_link in genre_block.findAll('a'):
        movie_genres_list.append(genre_link.text)
    
    return movie_genres_list

In [20]:
movie_genres = get_movie_genres(lotr_fotr_url)
movie_genres

['action', 'adventure', 'fantasy']

This function is for the average rating among all raters of the movie. 

In [21]:
def get_movie_avg_rating(movie_url):
    
    '''
    
    Given a URL to a movie on letterboxd.com, this function returns the average rating of the movie among all users.
    
    The rating is not part of the movie page itself: the page's sidebar references a separate 
    fragment (the 'data-src' attribute of its 'js-csi' div), which is fetched with a second request.
    
    Parameters:
        movie_url (str): URL to a film on letterboxd.com.
        
    Returns:
        movie_avg_rating (str): Average rating of the movie in question, as text (e.g. '4.3'). 
        The string 'Not available' is returned when the fragment contains no average rating.
    
    
    '''
    
    movie_page = get_html_soup(movie_url)
    
    # Relative address of the fragment that holds the rating; relies on the module-level base_url
    stats_path = movie_page.find('aside', class_='sidebar').find('div', class_='js-csi')['data-src']
    stats_soup = get_html_soup(base_url + stats_path)
    
    try:
        movie_avg_rating = stats_soup.find('span', class_='average-rating').find('a').text
    except AttributeError:
        # find() returned None somewhere along the chain: no average rating is shown
        return 'Not available'
    
    return movie_avg_rating

In [22]:
movie_avg_rating = get_movie_avg_rating(lotr_fotr_url)
movie_avg_rating

'4.3'

And here we're actually getting data on all the ratings for the movie - so we can tell what percentage of people rated it above or below our user or the average. 

In [23]:
def get_movie_rating_counts(movie_url):

    '''

    Given a URL to a movie on letterboxd.com, this function returns how many users gave the movie each star rating.

    The counts are read from the fragment referenced by the movie page's sidebar 
    (the 'data-src' attribute of its 'js-csi' div), which is fetched with a second request.

    Parameters:
        movie_url (str): URL to a film on letterboxd.com.

    Returns:
        movie_rating_list (list): One dictionary per star value found, in page order, with the keys:
            'rating' (float): Star value, counting each '★' as 1.0 and '½' as 0.5.
            'number_of_ratings' (int): Number of users who gave that rating.
            'perc_of_total' (float): Share of all ratings, rounded to 4 decimal places 
            (0.0 for every entry when the movie has no ratings at all).

    Example:
        get_movie_rating_counts('https://letterboxd.com/film/the-lord-of-the-rings-the-fellowship-of-the-ring/')

        ...

        [{'rating': 0.5, 'number_of_ratings': 405, 'perc_of_total': 0.0019},
         {'rating': 1.0, 'number_of_ratings': 1095, 'perc_of_total': 0.0052},
         ...
        ]

    '''

    movie_page = get_html_soup(movie_url)

    # Relative address of the fragment that holds the counts; relies on the module-level base_url
    stats_path = movie_page.find('aside', class_='sidebar').find('div', class_='js-csi')['data-src']
    stats_soup = get_html_soup(base_url + stats_path)

    # Maps star value -> number of ratings, in the order the <li> items appear
    counts_by_rating = {}

    for li in stats_soup.findAll('li'):

        # If there are ratings in this star value, the title sits on a link inside the <li>
        try: 
            title = li.find('a')['title']

        # If there are not, there is no link (find() returns None) and the title is on the <li> itself
        except TypeError:
            title = li['title']

        # Titles use non-breaking spaces; the first word is the count, the second the stars
        rating_string = title.replace(u'\xa0', u' ').split(' ')

        # 'No' means zero ratings; larger counts carry thousands separators
        number_of_ratings = int(rating_string[0].replace('No', '0').replace(',', ''))

        # The lowest star value is spelled 'half-★'; bring it in line with the '½' notation
        star_rating = rating_string[1].replace('half-★', '½')

        # Replace the stars with 1.0 and the '½' with 0.5, convert to floats, sum them
        rating = sum(float(symbol.replace('★', '1.0').replace('½', '0.5')) for symbol in star_rating)

        counts_by_rating[rating] = number_of_ratings

    total = sum(counts_by_rating.values())

    movie_rating_list = []

    for rating, number_of_ratings in counts_by_rating.items():

        # Guard against dividing by zero for a movie nobody has rated yet
        perc_of_total = round(number_of_ratings / total, 4) if total else 0.0

        movie_rating_list.append({
            'rating': rating,
            'number_of_ratings': number_of_ratings,
            'perc_of_total': perc_of_total,
        })

    return movie_rating_list

In [24]:
movie_rating_counts = get_movie_rating_counts(lotr_fotr_url)
movie_rating_counts

[{'rating': 0.5, 'number_of_ratings': 405, 'perc_of_total': 0.0019},
 {'rating': 1.0, 'number_of_ratings': 1095, 'perc_of_total': 0.0052},
 {'rating': 1.5, 'number_of_ratings': 475, 'perc_of_total': 0.0023},
 {'rating': 2.0, 'number_of_ratings': 3119, 'perc_of_total': 0.0149},
 {'rating': 2.5, 'number_of_ratings': 2450, 'perc_of_total': 0.0117},
 {'rating': 3.0, 'number_of_ratings': 15446, 'perc_of_total': 0.074},
 {'rating': 3.5, 'number_of_ratings': 15391, 'perc_of_total': 0.0738},
 {'rating': 4.0, 'number_of_ratings': 54193, 'perc_of_total': 0.2597},
 {'rating': 4.5, 'number_of_ratings': 33010, 'perc_of_total': 0.1582},
 {'rating': 5.0, 'number_of_ratings': 83083, 'perc_of_total': 0.3982}]

Snagging the movie description while we're at it!

In [25]:
def get_movie_desc(movie_url):
    
    '''
    
    Fetches a movie page on letterboxd.com and returns the movie's description.
    
    Parameters:
        movie_url (str): URL to a film on letterboxd.com.
        
    Returns:
        movie_desc (str): The 'content' attribute of the page's fourth <meta> tag, 
        with surrounding whitespace removed.
    
    '''
    
    meta_tags = get_html_soup(movie_url).findAll('meta')
    
    # NOTE(review): the description is picked by position (fourth <meta> tag), which
    # presumably breaks if the page's <head> layout changes - confirm against the live page
    description_tag = meta_tags[3]
    return description_tag['content'].strip()

In [26]:
movie_desc = get_movie_desc(lotr_fotr_url)
movie_desc

'Young hobbit Frodo Baggins, after inheriting a mysterious ring from his uncle Bilbo, must leave his home in order to keep it from falling into the hands of its evil creator. Along the way, a fellowship is formed to protect the ringbearer and make sure that the ring arrives at its final destination: Mt. Doom, the only place where it can be destroyed.'

Okay, now that we've got a good selection of information about *The Fellowship of the Ring* let's put it together in one large dictionary. 

Then we can get started on scraping IMDb for the last few attributes we'll want:
* The movie's budget and revenue
* The movie's release date

Additionally, we're going to want to scrape the movie's genre(s) from either Letterboxd or IMDb, I'm not sure which one just yet though. 

In [27]:
# Collect what has been scraped so far for the example film in a single dictionary
movie = {
    'title': movie_title,
    'description': movie_desc,
    'avg_rating': movie_avg_rating,
    'cast': cast_dict,
    'crew': crew_dict,
}

In [28]:
# Uncomment the line below to see what the movie dictionary looks like
# movie

Now, let's put everything together into a *bigger* dictionary and add a loop to scrape every film on the page!

In [29]:
get_movie_desc(base_url + get_movie_url(movies_on_page[0]))

'The true story of one of the worst man-made catastrophes in history: the catastrophic nuclear accident at Chernobyl. A tale of the brave men and women who sacrificed to save Europe from unimaginable disaster.'

We're going to need some functions here:
1. A function that operates at the "account-level" to zero-in on a user's ratings
2. Get a list of pages of movies to scrape 
3. Get a list of movies on a single page to scrape
4. Scrape a single movie's data given a URL 
5. Write everything out to a JSON file

In [30]:
def scrape_movie_data(movie_url, user_movie_rating = None):
    
    '''
    Given a URL to a movie on letterboxd.com, this function returns a dictionary of data about the movie.
    
    Parameters:
        movie_url (str): URL to a film on letterboxd.com.
        
        user_movie_rating (str/float): Optional parameter; if scraping from a user's ratings page you can pass in that user's rating for the film in question.
        Otherwise, if you are only scraping a film's page for its attributes and are not interested in a specific user's rating, disregard. 
    
    Returns:
        movie_dict (dict): Nested dictionary that contains data about the movie passed in, with the keys 
        'title', 'url', 'genres', 'description', 'avg_rating', 'rating_counts', 'cast' and 'crew'. 
        The key 'user_rating' is included only when user_movie_rating is not None.
    
    '''
    
    movie_dict = {}

    # Note: each helper below downloads the movie page again on its own
    movie_dict['title'] = get_movie_title(movie_url)
    movie_dict['url'] = movie_url
    movie_dict['genres'] = get_movie_genres(movie_url)
    movie_dict['description'] = get_movie_desc(movie_url)
    movie_dict['avg_rating'] = get_movie_avg_rating(movie_url)
    movie_dict['rating_counts'] = get_movie_rating_counts(movie_url)
    
    # Compare against None explicitly so that a rating of 0 is still recorded
    if user_movie_rating is not None:
        movie_dict['user_rating'] = user_movie_rating
        
    movie_dict['cast'] = get_cast_data(movie_url)
    movie_dict['crew'] = get_crew_data(movie_url)
    
    return movie_dict
    

In [31]:
def scrape_page_ratings(rating_page_url):
    
    '''
    
    Scrapes every movie listed on one page of a user's ratings on letterboxd.com.
    
    Parameters: 
        rating_page_url (str): URL to a page of a user's movie ratings on letterboxd.com.
        
    Returns:
        page_ratings_list (list): One nested dictionary per movie on the page, as produced by 
        scrape_movie_data, including the user's rating for the movie. 
        None is returned instead when the page contains no poster list.
    
    '''

    site_root = 'https://letterboxd.com/'
    
    poster_lists = get_html_soup(rating_page_url).findAll('ul', class_='poster-list -p150 -grid')
    
    # No poster list on the page means there is nothing to scrape
    if not poster_lists:
        return None

    page_ratings_list = []
    
    for movie_item in poster_lists[0].findAll('li'):
        
        # Full URL of the movie page and the rating the user gave the movie
        movie_url = site_root + get_movie_url(movie_item)
        user_movie_rating = get_user_movie_rating(movie_item)
        
        # Movie data, with the user's rating included
        page_ratings_list.append(scrape_movie_data(movie_url, user_movie_rating))

    return page_ratings_list

In [32]:
def get_pages_to_scrape(account_name):
    
    '''
    
    Determines how many pages of movie ratings an account on letterboxd.com has.
    
    Parameters:
        account_name (str): Name of an account on letterboxd.com to gather movie ratings from. 
        The name is lower-cased and stripped of surrounding whitespace before use.
        
    Returns: 
        pages_to_scrape (int): Number of ratings pages to scrape. This is 1 when the first page shows 
        no pagination block, otherwise the number shown in the last pagination entry. 
    
    '''
    
    site_root = 'https://letterboxd.com/'
    normalized_name = account_name.lower().strip()
    
    # The first page is requested even for an account without any ratings
    first_page_url = site_root + normalized_name + '/films/ratings/by/rating/page/1'
    pagination_blocks = get_html_soup(first_page_url).findAll('div', class_='paginate-pages')
    
    # No pagination block: everything fits on a single page
    if not pagination_blocks:
        return 1

    # The last pagination entry carries the highest page number
    last_page_label = pagination_blocks[0].findAll('li')[-1].text
    return int(last_page_label)

In [33]:
def scrape_account_ratings(account_to_scrape):

    '''
    
    Given an account name on letterboxd.com, this function returns a dictionary which contains data about all the films
    the user has rated. 
    
    The dictionary returned is nested - each movie rated is an entry in a list and each entry in the list itself contains 
    a multi-level data structure to include cast, crew, the movie's average rating, etc.
    
    Parameters: 
        account_to_scrape (str): Name of an account on letterboxd.com to gather movie ratings from. 
        The name is lower-cased and stripped of surrounding whitespace before use.
        
    Returns:
        movie_ratings (dict): Dictionary with the keys:
            'account' (str): Normalized account name.
            'account_url' (str): URL of the account.
            'account_ratings_url' (str): URL of the account's ratings listing.
            'movies_rated' (list): Movies rated by the user across all pages, in page order, 
            or None when no rated movies were found. 
    
    '''
    
    account_name = account_to_scrape.lower().strip()
    base_url = 'https://letterboxd.com/'
    account_url = base_url + account_name
    account_ratings_url = account_url + '/films/ratings/by/rating/'
    
    movie_ratings = {
        'account': account_name,
        'account_url': account_url,
        'account_ratings_url': account_ratings_url,
    }
    
    pages_to_scrape = get_pages_to_scrape(account_name)
    
    movies_rated_list = []
    
    # Pages are numbered starting at 1
    for page_number in range(1, pages_to_scrape + 1):
    
        ratings_url = account_ratings_url + 'page/' + str(page_number)
        page_ratings = scrape_page_ratings(ratings_url)
        
        # scrape_page_ratings returns None for a page without a poster list; skipping such
        # a page keeps the movies collected from the other pages instead of discarding them
        if page_ratings:
            movies_rated_list.extend(page_ratings)
    
    # None (rather than an empty list) signals that nothing was found
    movie_ratings['movies_rated'] = movies_rated_list if movies_rated_list else None
    
    return movie_ratings

In [113]:
def write_dict_to_json(filename, data, path_to_file = None, encoding = 'utf-8'):
    
    '''
    Writes a dictionary out to a JSON file.
    
    Parameters:
        filename (str): Name of the file, including the .json extension. 
        If the name does not end in .json, the extension is added automatically.
        
        data (dict): Dictionary of data to be written out to a JSON file.
        
        path_to_file (str): Optional parameter; specify a path to the file. If the path does not exist, it will be 
        created, including any missing parent directories. 
        If you do not specify a path, the program will use the current working directory. 
        
        encoding (str): Optional parameter; file encoding. If you do not specify an encoding codec, utf-8 will be used. 
        
    Returns:
        None.
        
    Outputs:
        JSON file at the location specified. Non-ASCII characters are written as-is and the output is indented by 4 spaces.
    
    '''
    
    # Resolve the default before touching the file system (os.path functions reject None)
    if path_to_file is None:
        path_to_file = os.getcwd()
    
    # Creates nested directories as needed and is a no-op when the path already exists
    os.makedirs(path_to_file, exist_ok = True)
    
    # Only a trailing '.json' counts as the extension
    if not filename.lower().endswith('.json'):
        filename = filename + '.json'
        
    full_file_path = os.path.join(path_to_file, filename)
    
    with open(full_file_path, 'w', encoding = encoding) as file:
        json.dump(data, file, ensure_ascii = False, indent = 4)

In [35]:
#aragorn_ratings = scrape_account_ratings('aragorn')

In [114]:
# Requires `aragorn_ratings` to exist in the kernel. The only statement in this notebook that
# creates it is the commented-out scrape_account_ratings('aragorn') call in the previous cell,
# so on a fresh kernel this line raises NameError until that call has been run.
write_dict_to_json('aragorn.json', aragorn_ratings, 'movie_jsons')

In [50]:
#write_dict_to_json('elrond.json', scrape_account_ratings('elrond'), 'movie_jsons')
#write_dict_to_json('frodo.json', scrape_account_ratings('frodo'), 'movie_jsons')
#write_dict_to_json('gandalf.json', scrape_account_ratings('gandalf'), 'movie_jsons')
#write_dict_to_json('gimli.json', scrape_account_ratings('gimli'), 'movie_jsons')
#write_dict_to_json('legolas.json', scrape_account_ratings('legolas'), 'movie_jsons')
#write_dict_to_json('samwise.json', scrape_account_ratings('samwise'), 'movie_jsons')

In [38]:
# Uncomment to create the linneburg_ratings dictionary
# Note: This process takes roughly 15 minutes to complete (~1 minute, 15 seconds per page)
# linneburg_ratings = scrape_account_ratings('joshlinneburg') 

In [39]:
# Uncomment to see what the linneburg_ratings dictionary looks like
# linneburg_ratings