In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# JSON file that accumulates one entry per scraped URL.
output_file = 'scraping_results.json'
# Maximum number of worker threads; every scrape_url call launches its own Chrome.
chrome_instances = 12

In [None]:
def _xpath_literal(text):
    """Return ``text`` quoted as an XPath 1.0 string literal.

    XPath 1.0 has no escape character, so a value containing both quote
    kinds has to be assembled from pieces with ``concat()``.
    """
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    pieces = ", \"'\", ".join(f"'{piece}'" for piece in text.split("'"))
    return f"concat({pieces})"


def find_element(driver, text_to_find, wait_time):
    """Find a section by its headline and return the section's container.

    The headline is located by exact text match, scrolled into view, and
    then looked up again after ``wait_time`` seconds so that lazy-loaded
    content has had time to appear.

    Args:
        driver: Selenium WebDriver with the page already loaded.
        text_to_find: Exact text of the headline element.
        wait_time: Seconds to sleep after scrolling the headline into view.

    Returns:
        The ancestor five levels above the headline element.

    Raises:
        Whatever ``driver.find_element`` raises when nothing matches.
    """
    # Quote the text properly so headlines containing ' or " still produce
    # a valid XPath expression.
    xpath = f"//*[text() = {_xpath_literal(text_to_find)}]"

    element = driver.find_element(By.XPATH, xpath)
    driver.execute_script("arguments[0].scrollIntoView();", element)
    time.sleep(wait_time)

    # Look the headline up again after the wait rather than reusing the
    # handle obtained before scrolling.
    element = driver.find_element(By.XPATH, xpath)
    return element.find_element(By.XPATH, './../../../../..')

def scrape_url(url):
    """Scrape the storyline and review sections of a single page.

    A fresh Chrome instance is started for the call and always shut down
    again before returning.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with the keys ``'url'``, ``'storyline'`` and ``'review'``.
        When loading or parsing the page fails, ``'storyline'`` and
        ``'review'`` are empty strings.

    Raises:
        Errors from starting Chrome itself are not caught here.
    """
    chrome_options = Options()
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--lang=en")
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        time.sleep(3)
        driver.implicitly_wait(10)

        storyline = find_element(driver, 'Handlung', 5).text
        review = find_element(driver, 'Rezension', 1).text

        result = {'url': url,
                  'storyline': storyline,
                  'review': review}

    except Exception as exc:
        # Best effort: keep the run going with an empty entry, but report why
        # the page failed. SystemExit/KeyboardInterrupt are no longer swallowed.
        print(f'Scraping {url} failed: {exc!r}')
        result = {
            'url': url,
            'storyline': '',
            'review': ''
        }

    finally:
        driver.quit()

    return result

def save_result(result, path=None):
    """Append ``result`` to the JSON list stored in the results file.

    The file is created with a one-element list when it does not exist yet;
    otherwise the existing list is loaded, extended and written back.

    Args:
        result: JSON-serializable object to append.
        path: File to update. Defaults to the module-level ``output_file``.

    Raises:
        json.JSONDecodeError: If the existing file does not contain valid JSON.
    """
    target = output_file if path is None else path
    try:
        with open(target, 'r+') as file:
            data = json.load(file)
            data.append(result)
            file.seek(0)
            json.dump(data, file, indent=4)
            # Drop leftover bytes in case the new serialization is shorter
            # than the previous file content (e.g. different indentation).
            file.truncate()
    except FileNotFoundError:
        with open(target, 'w') as file:
            json.dump([result], file, indent=4)

In [None]:
len_results = 0
scraped_urls = set()

# Load the output file if it exists and remember which URLs it already
# contains, so they are not scraped a second time.
if os.path.exists(output_file):
    with open(output_file, 'r') as f:
        scraping_results = json.load(f)
    len_results = len(scraping_results)
    scraped_urls = {entry.get('url') for entry in scraping_results}
    print(f'Already scraped {len_results} movies')

# Strip the query string from every URL and drop the ones already scraped.
# Results are saved in completion order, not in input order, so skipping the
# first `len_results` URLs would skip the wrong ones after an interrupted run.
with open('urls.json', 'r') as f:
    movies_old = json.load(f)
all_urls = [movie[0].split('?')[0] for movie in movies_old]
urls = [candidate for candidate in all_urls if candidate not in scraped_urls]
print(f'Starting scraping for {len(urls)} movies')

# Start parallel scraping; results are written from this thread only, one at
# a time, as the workers finish.
with ThreadPoolExecutor(max_workers=chrome_instances) as executor:
    future_to_url = {executor.submit(scrape_url, url): url for url in urls}
    for future in as_completed(future_to_url):
        url = future_to_url[future]
        try:
            result = future.result()
            save_result(result)
        except Exception as exc:
            print(f'URL {url} generated an exception: {exc}')