### Scrape IMDb data (via the OMDb API)

In [None]:
import pandas as pd
from urllib.parse import quote
import requests
import json

# Parsed JSON replies from the OMDb API, one entry per successful request.
response_data = []

# NOTE(review): the API key is hard-coded in source; consider loading it from
# the environment before sharing this file.
api_key = 'b8236321'
api_url = 'http://www.omdbapi.com/?t={}&y={}&apikey={}'

oscar_data = pd.read_csv('oscar_data.csv')
titles_and_years = oscar_data[['year', 'entity']]

# De-duplicate the (title, year) pairs, then sort them so the batch window
# below selects the same titles on every run.
title_year_set = sorted({
    (row['entity'], row['year']) for _, row in titles_and_years.iterrows()
})

# Only one window of the sorted titles is requested per run; edit these
# bounds to fetch a different batch.
batch_start, batch_end = 4900, 5900
for title, year in title_year_set[batch_start:batch_end]:
    url = api_url.format(quote(title), year, api_key)
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        response_data.append(json.loads(response.text))
    except (requests.RequestException, ValueError) as e:
        # A failed request or a non-JSON reply skips this title instead of
        # aborting the rest of the batch.
        print('Error fetching {} ({}), skipping...'.format(title, year))
        print(e)

response_data

In [None]:
# One row per collected API reply; columns come from the keys of the reply
# dicts in this batch.
imdb_df = pd.DataFrame(response_data)

# Append this batch to the running CSV without a header row.
# newline='' is required when handing an open file object to to_csv;
# without it, platforms that translate line endings write an extra blank
# line after every row.
# NOTE(review): because no header is written, batches whose replies have
# different key sets may end up with misaligned columns -- confirm.
with open('imdb_data.csv', 'a', newline='') as f:
    imdb_df.to_csv(f, header=False)

### Scrape script data

In [1]:
import csv
import os
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

# Load the Oscar data; the code below reads its 'entity' and 'year' columns.
oscar_data = pd.read_csv(filepath_or_buffer='oscar_data.csv')

def write_script_data(movie_script_dict):
    """Append the scripts in *movie_script_dict* to ``movie_scripts.csv``.

    Each entry that is written becomes one CSV row ``[title, year, script]``.

    Args:
        movie_script_dict: Mapping of movie title to either a
            ``(year, script)`` pair or ``None``. ``None`` entries are
            skipped. Entries that cannot be unpacked into exactly two
            items are reported on stdout and skipped.
    """
    filename = 'movie_scripts.csv'
    # Mode 'a' creates the file when it is missing and appends otherwise, so
    # no separate existence check is needed. UTF-8 is set explicitly so that
    # non-ASCII script text does not depend on the platform default encoding.
    with open(filename, 'a', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        for title, entry in movie_script_dict.items():
            if entry is None:
                continue
            print('Flushing to csv: {}'.format(title))
            try:
                year, script = entry
            except (TypeError, ValueError) as e:
                # Not iterable (TypeError) or not exactly two items
                # (ValueError): report it and move on to the next movie.
                print('Error, skipping writing this movie script...')
                print(e)
                print(title)
                print(entry)
                continue
            writer.writerow([title, year, script])

def get_movie_script(movie_name, year):
    """Fetch a movie's script text from springfieldspringfield.co.uk.

    Searches the site for *movie_name* and uses the first result whose title
    equals *movie_name* exactly and whose year is within one of *year*.

    Args:
        movie_name: Movie title to search for.
        year: Expected release year, compared numerically against the
            four-digit year parsed from each search result.

    Returns:
        The stripped script text, or ``None`` when no search result matches
        or the script page has no script container.

    Raises:
        requests.RequestException: If either HTTP request fails to complete
            (for example a connection error or a timeout).
    """
    # Search to see if movie in Springfield database
    print('Searching db for: {}'.format(movie_name))
    search_url = 'https://www.springfieldspringfield.co.uk/movie_scripts.php'
    # Pass the title through ``params`` so characters such as '&' or '#' are
    # URL-encoded instead of corrupting the query string.
    data = requests.get(
        search_url, params={'search': movie_name}, timeout=30
    ).text
    parser = BeautifulSoup(data, 'html.parser')
    tags = parser.find_all('a', {'class': "script-list-item"})

    script_url = None
    for movie_tag in tags:
        match = re.search(r'(.+) \((\d\d\d\d)\)', movie_tag.text)
        if match is None:
            # Not in "Title (YYYY)" form; keep checking the other results
            # rather than giving up on the whole search.
            continue
        tag_title = match.group(1)
        tag_year = int(match.group(2))
        # Only match if name is exact, and year within +-1
        if tag_title == movie_name and abs(year - tag_year) <= 1:
            script_url = 'https://www.springfieldspringfield.co.uk/{}'.format(movie_tag['href'])
            break
    if not script_url:
        return None

    # Get the script text from another url
    print(' Gathering script...')
    script_html = requests.get(script_url, timeout=30).text
    parser = BeautifulSoup(script_html, 'html.parser')
    script_container = parser.find('div', {'class': "scrolling-script-container"})
    if script_container is None:
        # The page did not contain the expected script element.
        return None
    return script_container.text.strip()

# print(get_movie_script('Trading Places', 1983))

# Unique (title, year) pairs taken from the Oscar data.
movie_set = {(row['entity'], row['year']) for _, row in oscar_data.iterrows()}

# Maps title -> (year, script) for scripts waiting to be written. After a
# flush the titles are kept with a None value, so an already-written title is
# skipped without holding its script text in memory.
movie_script_dict = {}
flush_count = 0
for movie_name, year in movie_set:
    if movie_name in movie_script_dict:
        continue
    try:
        script = get_movie_script(movie_name, year)
    except Exception as e:
        # Best effort: report the failure and carry on with the next movie.
        print(' Error getting html data...')
        print(e)
        continue
    if script is None:
        continue
    movie_script_dict[movie_name] = (year, script)
    flush_count += 1

    # Flush to file when dict gets too large
    if flush_count >= 10:
        write_script_data(movie_script_dict)
        movie_script_dict = dict.fromkeys(movie_script_dict, None)
        flush_count = 0

# Write the final, partial batch; without this, scripts gathered after the
# last full batch of 10 would never reach the file.
if flush_count:
    write_script_data(movie_script_dict)

KeyboardInterrupt: 