In [2]:
"""
Author: Ra Cohen (ra.q.cohen@gmail.com)
Date: May 10, 2023
Purpose: Scrape the IMDb title and poster URL for each film/TV id in the matched data courtesy of https://github.com/dhruvilgala/tvtropes
"""

import pandas as pd
import json
from bs4 import BeautifulSoup as bs
import requests
import re

In [3]:
# Matched film and TV tables; each carries a `tconst` column whose values are
# later appended to the IMDb title URL.
film_tropes = pd.read_csv('data/film_imdb_match.csv')
tv_tropes = pd.read_csv('data/tv_imdb_match.csv')

# Every distinct id found in either table.
media_ids = pd.concat([film_tropes['tconst'], tv_tropes['tconst']]).unique()

In [4]:
# Prefix of every title page; a title id is appended to build the full URL.
BASE_URL = 'https://www.imdb.com/title/'

# Headers sent with each request, identifying the client as desktop Chrome.
# NOTE(review): presumably needed because the site rejects the default
# `requests` user agent -- confirm.
USER_AGENT = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
}

def get_page_html(path, url=BASE_URL, user_agent=USER_AGENT, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    Args:
        path: Text appended verbatim to `url` (the loop below passes a title id).
        url: Prefix of the address to fetch. Defaults to BASE_URL.
        user_agent: Dict of HTTP headers sent with the request. Defaults to
            USER_AGENT.
        timeout: Seconds to wait for the server, forwarded to `requests.get`.
            Without it a single stalled connection blocks the scrape forever.

    Returns:
        The response body parsed into a BeautifulSoup object.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(url + path, headers=user_agent, timeout=timeout)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    return bs(response.text, 'html.parser')

In [5]:
def get_title(page_html, suffix=' - IMDb'):
    """Return the text of the page's <title> tag without the site suffix.

    Args:
        page_html: Parsed page exposing `find(name)`, e.g. a BeautifulSoup
            document.
        suffix: Trailing text to remove from the title when present. The
            default is 7 characters long, matching the fixed 7-character
            slice this function used previously.

    Returns:
        The title text with `suffix` removed if the text ends with it, the
        unmodified text if it does not, or '' when the page has no <title> tag.
    """
    title_tag = page_html.find('title')
    if title_tag is None:
        # No <title> at all: return a blank instead of raising, so one bad
        # page does not abort a long scrape.
        return ''
    text = title_tag.text
    # Strip only a real suffix; slicing a fixed number of characters would
    # eat part of the title on pages that lack it.
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text

In [6]:
def get_poster(page_html):
    """Return the `src` of the first <img class="ipc-image"> on the page.

    NOTE(review): presumably the first such image on a title page is the
    poster -- confirm against the current page layout.

    Args:
        page_html: Parsed page exposing `find(name, attrs)`, e.g. a
            BeautifulSoup document.

    Returns:
        The image URL, or '' when the page is None, has no matching image,
        or the image has no `src` attribute.
    """
    if page_html is None:
        return ''
    # `find` returns the first match or None, so no index lookup (and no
    # catch-all `except`, which would also swallow KeyboardInterrupt) is needed.
    image_tag = page_html.find('img', {'class': 'ipc-image'})
    if image_tag is None:
        return ''
    return image_tag.get('src', '')

In [None]:
# Gather one record per id and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies every existing row on each iteration.
records = []

for i, id_ in enumerate(media_ids):
    page_html = get_page_html(id_)
    records.append({
        'id': id_,
        'title': get_title(page_html),
        'poster': get_poster(page_html),
    })
    # Progress message every 100 ids (i is the zero-based index just handled).
    if i % 100 == 0:
        print("Completed {} out of {}".format(i, len(media_ids)))

# Passing `columns` keeps the header in place even when there are no records.
df = pd.DataFrame(records, columns=['id', 'title', 'poster'])

file = 'data/imdb_id_poster.csv'
df.to_csv(file, index=False)
