### Imports and Genre List

In [None]:
from gevent import monkey
monkey.patch_socket()
import grequests
import time
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# List of known genres on Letterboxd.com
genre_list = ['action', 'adventure', 'animation', 'comedy', 'crime', 'documentary', 'drama',
              'family', 'fantasy', 'history', 'horror', 'music', 'mystery', 'romance',
              'science-fiction', 'thriller', 'tv-movie', 'war', 'western']

### Data Scraping Functions

In [None]:
# ALL FUNCTIONS NECESSARY FOR SCRAPING PROCESS

# Creates the base url for each genre, used as the starting point of the scraping process
def generate_genre_urls():
    """Return one Letterboxd ajax genre url per entry of genre_list, in list order."""
    return [f'https://letterboxd.com/films/ajax/genre/{genre}' for genre in genre_list]

# Grabs total number of films for each genre to calculate how many requests for each genre must be sent out
def get_film_num_by_genre():
    """Request every base genre url and read the film total out of each page heading.

    Returns the dictionary built by create_genre_dict from the collected totals;
    each total is kept as a string of digits with the thousands separators removed.
    """
    count_pattern = r'are ([\d,]+)'
    genre_links = generate_genre_urls()
    responses = grequests.map(grequests.get(link) for link in genre_links)

    totals = []
    for response in responses:
        soup = BeautifulSoup(response.text, 'lxml')
        heading_text = str(soup.find('h2', class_='ui-block-heading').text)
        # The first number following the word "are" in the heading is taken as the total
        first_total = re.findall(count_pattern, heading_text)[0]
        totals.append(first_total.replace(',', ''))
    return create_genre_dict(totals)

# Creates a dictionary that maps each genre to its total number of films
def create_genre_dict(film_num):
    """Pair each genre in genre_list with the value at the same position in film_num.

    film_num must be indexable and hold at least one entry per genre;
    a shorter sequence raises IndexError.
    """
    return {genre: film_num[position] for position, genre in enumerate(genre_list)}

# Creates list of all pages for each genre (ex. .../action/page/1/ THROUGH .../action/page/235/)
def generate_genre_page_urls(film_num_by_genre, films_per_page=72):
    """Build the list of paginated listing urls for every genre.

    Args:
        film_num_by_genre: dict mapping a genre slug to its total film count
            (an int or a string of digits).
        films_per_page: number of films shown on one listing page.

    Returns:
        dict mapping each genre (in the order of film_num_by_genre) to its page
        urls, numbered from page 1 up to and including the last page.
    """
    final_genre_dict = {}
    for genre, film_num in film_num_by_genre.items():
        # Ceiling division: an exact multiple of films_per_page needs no extra page
        total_pages = -(-int(film_num) // films_per_page)
        url = f'https://letterboxd.com/films/ajax/genre/{genre}/size/small/page/'
        # Page numbers are 1-based, so the range has to include total_pages itself
        final_genre_dict[genre] = [url + str(page) for page in range(1, total_pages + 1)]
    return final_genre_dict

# Goes through every page of every film genre (72 films per page) and grabs the url fragment (eg. the-incredible-hulk) for each film
def get_film_titles():
    """Collect the url fragment (eg. the-incredible-hulk) of every film listed under every genre.

    Requests all listing pages of one genre at a time and prints progress and
    timing per genre. Pages whose request failed are skipped and counted instead
    of aborting the whole crawl.

    Returns:
        list of film url fragments; a film listed under several genres appears
        once per genre.
    """
    film_url_fragments = []
    film_num_by_genre = get_film_num_by_genre()
    genre_url_dict = generate_genre_page_urls(film_num_by_genre)
    for genre in genre_list:
        genre_start = time.perf_counter()
        page_urls = genre_url_dict[genre]
        print(f'Genre: {genre} ({film_num_by_genre[genre]} films, {len(page_urls)} pages)')
        page_reqs = (grequests.get(link) for link in page_urls)
        page_responses = grequests.map(page_reqs, size=75)
        failed_pages = 0
        for r in page_responses:
            # grequests.map puts None in place of a response when the request failed
            if r is None:
                failed_pages += 1
                continue
            film_page = BeautifulSoup(r.text, 'lxml')
            film_url_fragments += re.findall(r'data-film-slug="/film/([\w-]+)/"', str(film_page.find(
                'ul', class_='poster-list -p70 -grid')))
        if failed_pages:
            print(f'Warning: {failed_pages} page request(s) failed for genre {genre}')
        genre_end = (time.perf_counter() - genre_start)
        print(f'Len: {len(film_url_fragments)} {round(genre_end, 2)}s\n')
    return film_url_fragments

# Removes duplicate url fragments
def remove_duplicates(url_fragments):
    """Return the fragments with repeats dropped, keeping the first occurrence of each in order."""
    already_seen = set()
    unique_fragments = []
    for fragment in url_fragments:
        if fragment not in already_seen:
            already_seen.add(fragment)
            unique_fragments.append(fragment)
    return unique_fragments

# Combines letterboxd url with url fragments to form complete, requestable url
def create_full_film_urls(uniq_films):
    """Prefix every url fragment with the Letterboxd film address, preserving order."""
    return ["https://letterboxd.com/film/" + fragment for fragment in uniq_films]

# Tries to grab all pieces of information from each url and returns info in a dictionary (for translating data to Pandas Dataframe later)
def find_info(url_response):
    """Extract the film fields from one parsed film page.

    url_response is the parsed page; it is queried through its find and
    find_all methods. Every field is looked up independently: when a lookup
    fails for any reason its value is None, so one missing field never
    prevents the others from being read.

    Returns a dict with the keys url, name, year, rating, cast, director,
    studios, country, language and genres (for translating the data to a
    Pandas Dataframe later). cast, studios and genres are lists of slugs;
    the remaining fields are single strings.
    """
    def attempt(extract):
        # Runs one field lookup and maps any failure to None
        try:
            return extract()
        except Exception:
            return None

    def slugs(pattern, *find_args, **find_kwargs):
        # All regex matches inside the text of the first element matching the find arguments
        return re.findall(pattern, str(url_response.find(*find_args, **find_kwargs)))

    return {
        "url": attempt(lambda: slugs(
            r'slug="/film/([\w-]+)/"', 'section', class_='poster-list -p230 -single no-hover el col')[0]),
        "name": attempt(lambda: url_response.find('h1', class_='headline-1 js-widont prettify').text),
        "year": attempt(lambda: url_response.find('small', class_='number').text),
        # The rating is searched for in the text of the whole page rather than in one element
        "rating": attempt(lambda: re.findall(r'"ratingValue":(.+?),', str(url_response.find_all()))[0]),
        "cast": attempt(lambda: slugs(r'"/actor/([\w-]+)/"', 'div', class_='cast-list text-sluglist')),
        "director": attempt(lambda: url_response.find('span', class_='prettify').text),
        "studios": attempt(lambda: slugs(r'studio/([\w-]+)/"', 'div', id='tab-details')),
        # Only the first listed country and language are kept
        "country": attempt(lambda: slugs(r'country/([\w-]+)/"', 'div', id='tab-details')[0]),
        "language": attempt(lambda: slugs(r'language/([\w-]+)/"', 'div', id='tab-details')[0]),
        "genres": attempt(lambda: slugs(r'genre/([\w-]+)/"', 'div', class_='text-sluglist capitalize')),
    }

# Requests final list of urls and returns list of dictionaries
def send_requests(uniq_urls):
    """Request every film url and return the extracted info of each page.

    Args:
        uniq_urls: iterable of complete film urls.

    Returns:
        list of dictionaries as produced by find_info, one per page that could
        be fetched. Urls whose request failed are left out and their number is
        printed, so a single dropped connection does not lose the whole batch.
    """
    formatted_url_data = []
    page_reqs = (grequests.get(link) for link in uniq_urls)
    page_responses = grequests.map(page_reqs, size=75)
    failed_requests = 0
    for r in page_responses:
        # grequests.map puts None in place of a response when the request failed
        if r is None:
            failed_requests += 1
            continue
        film_page = BeautifulSoup(r.text, 'lxml')
        formatted_url_data.append(find_info(film_page))
    if failed_requests:
        print(f'Warning: {failed_requests} film request(s) failed and were skipped')
    return formatted_url_data

### Preparing Necessary Info for Full Scrape

In [None]:
# Gets the list of film url fragments and reports how long the crawl took
start = time.perf_counter()

film_urls = get_film_titles()

fin = (time.perf_counter() - start)
print(f'\nTime to scrape and assemble url list: {round(fin, 2)}s')

In [None]:
# Removes all duplicate url fragments and creates the list of final urls to request
start = time.perf_counter()

uniq = remove_duplicates(film_urls)
uniq_url_list = create_full_film_urls(uniq)

fin = (time.perf_counter() - start)
print(f'\nTime to clean list: {round(fin, 2)}s, New List Length: {len(uniq_url_list)}')

### Running the Full Scrape (8-10 Hours)

In [None]:
# Inserts just the first film + header to start the csv file
# This isn't actually necessary since headers are defined in the postgresql table before importing anyway
# NOTE: mode='a' appends, so a films.csv left over from an earlier run is extended, not overwritten
header = uniq_url_list[:1]
kinodf = pd.DataFrame.from_records(send_requests(header))
kinodf.to_csv('films.csv', mode='a', encoding='utf8')


# This block stores the data in a Pandas Dataframe and writes each block of 1000 entries to csv
# In retrospect, moving the data between Dataframe and csv is unnecessary, however, this was initially done to better understand what the scraped data looked like
limit = len(uniq_url_list)
chunk_size = 1000
# Index 0 was written together with the header above, so the chunks start at index 1.
# Deriving every chunk start from a range guarantees each url is requested exactly once,
# including when the list length is an exact multiple of the chunk size (or one above it).
chunk_starts = range(1, limit, chunk_size)
total_chunks = len(chunk_starts)
for chunk_num, x in enumerate(chunk_starts, start=1):
    start = time.perf_counter()
    y = min(x + chunk_size, limit)
    kinodf = pd.DataFrame.from_records(send_requests(uniq_url_list[x:y]))
    kinodf.to_csv('films.csv', mode='a', encoding='utf8', header=False)
    print(f'Processed Chunk [{x}:{y}]')
    fin = (time.perf_counter() - start)
    # Estimate assumes every remaining chunk takes as long as the one just finished
    hours_left = round((fin * (total_chunks - chunk_num)) / 3600, 2)
    print(f'Time to process chunk: {round(fin, 2)}s (Estimated time left: {hours_left} hours)\n')

# Data Cleaning

In [None]:
# Convert all brackets [] to curly brackets {} for SQL array syntax
# This conversion is also rather unnecessary as I discovered arrays are definitely not the best way to store data within columns in SQL

df = pd.read_csv('clean_films.csv')

# The three columns get the same two-step swap: every '[' becomes '{', then every ']' becomes '}'
for column in ('genres', 'cast', 'studios'):
    lclean = df[column].replace(to_replace=r'(\[)', value='{', regex=True)
    rclean = lclean.replace(to_replace=r'(])', value='}', regex=True)
    df[column] = rclean

df.to_csv('cleaner_films.csv', index=False)

In [None]:
from itertools import chain

def chainer(s):
    """Split every string of the Series s on commas and return all pieces as one flat list."""
    pieces = []
    for parts in s.str.split(','):
        pieces.extend(parts)
    return pieces

"""
This is more like pseudo-normalization because primary keys are not actually used in the final database at the moment
The resulting data will look like:

GenreID     genre
0      'action'
0      'adventure'
1      'drama'
2      'documentary'

So that the GenreID matches the main FilmID column in the main table and the tables can be joined in queries
"""

# NORMALIZE GENRES, CAST AND STUDIOS
# Each entry: (csv file to read, column holding the comma-separated values, name of the value column in the result)
normalized_frames = []
for csv_path, source_col, value_col in (('genres.csv', 'genres', 'genre'),
                                        ('cast.csv', 'cast', 'actor'),
                                        ('studios.csv', 'studios', 'studio')):
    df = pd.read_csv(csv_path)
    # Drop the curly brackets so that only the comma-separated values remain
    lclean = df[source_col].replace(to_replace=r'(\{)', value='', regex=True)
    rclean = lclean.replace(to_replace=r'(})', value='', regex=True)
    df[source_col] = rclean
    # Each row's ID is repeated once per comma-separated value found in that row
    lens = df[source_col].str.split(',').map(len)
    normalized_frames.append(pd.DataFrame({'ID': np.repeat(df['ID'], lens),
                                           value_col: chainer(df[source_col])}))

genre_df, cast_df, studios_df = normalized_frames

In [None]:
# WRITE NORMALIZED DFs TO CSV FILES
for frame, csv_path in ((genre_df, 'clean_genres.csv'),
                        (cast_df, 'clean_cast.csv'),
                        (studios_df, 'clean_studios.csv')):
    frame.to_csv(csv_path)

In [None]:
# The above "Normalization" does not remove the single quotes around each entry, which prevents proper querying within the database
# Therefore, each value must be further cleaned and re-written into a final csv for importation
fix_genres, fix_studios, fix_cast = (
    pd.read_csv(csv_path)
    for csv_path in ('clean_genres.csv', 'clean_studios.csv', 'clean_cast.csv')
)

In [None]:
# This block removes the single quotes (and any spaces) around each csv entry due to data recognition issues in SQL
# 'action' -> action

def _strip_quotes_and_spaces(value):
    """Return value with all single quotes and spaces removed.

    Values that have no replace method (such as missing entries read as NaN)
    are returned unchanged.
    """
    try:
        return value.replace('\'', '').replace(' ', '')
    except AttributeError:
        return value


ids = list(fix_genres['ID'])
test = [_strip_quotes_and_spaces(value) for value in fix_genres['genre']]
new_genre_df = pd.DataFrame({'ID': ids, 'genre': test})

ids = list(fix_studios['ID'])
test = [_strip_quotes_and_spaces(value) for value in fix_studios['studio']]
new_studios_df = pd.DataFrame({'ID': ids, 'studio': test})

ids = list(fix_cast['ID'])
test = [_strip_quotes_and_spaces(value) for value in fix_cast['actor']]
new_cast_df = pd.DataFrame({'ID': ids, 'actor': test})

In [None]:
# Write the "fixed" csv files
for frame, csv_path in ((new_genre_df, 'fixed_genres.csv'),
                        (new_studios_df, 'fixed_studios.csv'),
                        (new_cast_df, 'fixed_cast.csv')):
    frame.to_csv(csv_path)