In [1]:
import pandas as pd
from recommender.crawl_imdb import IMDbDataPydantic, wrapper
import concurrent.futures
import tqdm
import time
import multiprocessing
import threading
from recommender import REPO_PATH
import os
# Switch the working directory to REPO_PATH so the relative "data/..." paths
# used by later cells resolve (REPO_PATH is presumably the repository root —
# confirm in the recommender package).
os.chdir(REPO_PATH)

In [2]:
# Read the ID columns as strings so leading zeros in imdbId (e.g. "0114709")
# are preserved; movieId stays a plain 64-bit integer.
links = pd.read_csv(
    "data/imdb_data.csv",
    dtype=dict(movieId="int64", imdbId="string", tmdbId="string"),
)
links

Unnamed: 0,imdbId,movieId,tmdbId,rating,director,plot_summary,top_cast
0,0114709,1,862,8.3,John Lasseter,A cowboy doll is profoundly threatened and jea...,Tom Hanks | Tim Allen | Don Rickles | Jim Varn...
1,0113497,2,8844,7.1,Joe Johnston,When two kids find and play a magical board ga...,Robin Williams | Kirsten Dunst | Bonnie Hunt |...
2,0113228,3,15602,6.6,Howard Deutch,John and Max resolve to save their beloved bai...,Walter Matthau | Jack Lemmon | Ann-Margret | S...
3,0114885,4,31357,6.0,Forest Whitaker,"Based on Terry McMillan's novel, this film fol...",Whitney Houston | Angela Bassett | Loretta Dev...
4,0113041,5,11862,6.1,Charles Shyer,George Banks must deal not only with his daugh...,Steve Martin | Diane Keaton | Martin Short | K...
...,...,...,...,...,...,...,...
9737,5476944,193581,432131,7.6,Noriyuki Abe,A young lord and his demon butler board a luxu...,Bryn Apprill | Dawn M. Bennett | Justin Briner...
9738,5914996,193583,445030,7.3,Atsuko Ishizuka,"Adaption of the sixth Light Novel of series, i...",Alexandra Bedford | Jessica Boone | Ricardo Co...
9739,6397426,193585,479308,6.2,Bruce Beresford,A woman deals with the toxic water scandal in ...,Marin Ireland | Queen Latifah | Lyndie Greenwo...
9740,8391976,193587,483455,7.3,Takuya Igarashi,The Armed Detective Agency investigates a biza...,Brian Beacock | Ray Chase | Lucien Dodge | Car...


In [3]:
# IMDb IDs of the rows whose director is still missing; these are the titles
# that get (re-)crawled below.
missing_director = links["director"].isna()
wanted_ids = links.loc[missing_director, "imdbId"]
wanted_ids

585     0118114
4931    0070644
4981    0259153
4986    0112130
5011    0080297
         ...   
9573    6840134
9634    6769208
9651    0118460
9690    5189670
9736    5342766
Name: imdbId, Length: 94, dtype: string

In [4]:
def fetch_imdb_data_with_timeout(imdbid, timeout = 6):
    """Run ``wrapper(queue, imdbid)`` in a child process with a time limit.

    The child is expected to put its result on the queue that is passed to it
    (``wrapper`` is imported from ``recommender.crawl_imdb``; presumably it
    puts the fetched ``IMDbDataPydantic`` record — confirm there).

    Args:
        imdbid: IMDb title ID, forwarded unchanged to ``wrapper``.
        timeout: Maximum number of seconds to wait for a result. ``None``
            waits indefinitely.

    Returns:
        Whatever the child put on the queue, or the string
        ``"Timeout reached"`` if nothing arrived in time (the child is then
        terminated), or ``"No result returned"`` if the child exited without
        putting anything on the queue.
    """
    # Local stdlib import: the Empty exception raised by Queue.get lives in
    # the ``queue`` module.
    import queue

    poll_seconds = 0.05

    result_queue = multiprocessing.Queue()
    process = multiprocessing.Process(target=wrapper, args=(result_queue, imdbid))
    process.start()

    deadline = None if timeout is None else time.monotonic() + timeout
    result = None
    status = "pending"

    # Read from the queue *before* joining the child. A child that has put a
    # large item on a multiprocessing.Queue does not exit until that item has
    # been consumed, so joining first can turn a successful fetch into a
    # spurious timeout.
    while status == "pending":
        if deadline is None:
            wait = poll_seconds
        else:
            wait = max(0.0, min(poll_seconds, deadline - time.monotonic()))
        try:
            result = result_queue.get(timeout=wait)
            status = "done"
        except queue.Empty:
            if not process.is_alive():
                # The child is gone; pick up anything it flushed on exit.
                try:
                    result = result_queue.get_nowait()
                    status = "done"
                except queue.Empty:
                    status = "empty"
            elif deadline is not None and time.monotonic() >= deadline:
                status = "timeout"

    if status == "done":
        # The result is already in hand; give the child a moment to exit
        # cleanly before resorting to terminate().
        process.join(1)
    if process.is_alive():
        process.terminate()
    process.join()

    if status == "timeout":
        return "Timeout reached"
    if status == "empty":
        return "No result returned"
    return result

data = fetch_imdb_data_with_timeout('1234567')  # smoke test using the default timeout (6 seconds)

In [5]:
# Fetch every wanted IMDb ID through a small thread pool. Each task calls
# fetch_imdb_data_with_timeout, which does the actual work in its own child
# process, so the threads here mostly wait.
results = {}

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_to_imdb_id = {
        executor.submit(fetch_imdb_data_with_timeout, imdb_id): imdb_id
        for imdb_id in wanted_ids
    }

    for future in tqdm.tqdm(concurrent.futures.as_completed(future_to_imdb_id), total=len(future_to_imdb_id)):
        imdb_id = future_to_imdb_id[future]
        try:
            # Timeouts are reported by the worker as a string return value,
            # so they end up in `results` like any other outcome.
            results[imdb_id] = future.result()
        except concurrent.futures.TimeoutError:
            print(f"Fetching data for IMDb ID {imdb_id} timed out.")
        except Exception as exc:
            # Report the failing ID and keep going, so one broken title does
            # not abort the remaining fetches.
            print(f"Fetching data for IMDb ID {imdb_id} failed: {exc!r}")

  0%|          | 0/94 [00:00<?, ?it/s]2023-12-11 18:33:44.686 | INFO     | recommender.crawl_imdb:fetch_imdb_data:95 - Network error occurred: 404 Client Error: Not Found for url: https://www.imdb.com/title/tt0118114/ for https://www.imdb.com/title/tt0118114/
  1%|          | 1/94 [00:01<02:43,  1.75s/it]2023-12-11 18:33:45.094 | INFO     | recommender.crawl_imdb:fetch_imdb_data:52 - Could not find director for https://www.imdb.com/title/tt0259153/plotsummary/
2023-12-11 18:33:45.103 | INFO     | recommender.crawl_imdb:fetch_imdb_data:52 - Could not find director for https://www.imdb.com/title/tt0070644/plotsummary/
2023-12-11 18:33:45.148 | INFO     | recommender.crawl_imdb:fetch_imdb_data:52 - Could not find director for https://www.imdb.com/title/tt0112130/plotsummary/
  4%|▍         | 4/94 [00:03<00:56,  1.59it/s]2023-12-11 18:33:47.368 | INFO     | recommender.crawl_imdb:fetch_imdb_data:52 - Could not find director for https://www.imdb.com/title/tt0080297/plotsummary/
2023-12-11 1

In [6]:
# Merge the crawled records back into a copy of the table, keyed by IMDb ID.
df = links.set_index("imdbId").copy()

record: IMDbDataPydantic
for imdb_id, record in results.items():
    # Only real records are merged; anything else stored in `results`
    # (e.g. the string outcomes returned on timeout / missing result) is skipped.
    if isinstance(record, IMDbDataPydantic):
        df.loc[imdb_id, "rating"] = record.rating
        df.loc[imdb_id, "director"] = record.director
        df.loc[imdb_id, "plot_summary"] = record.plot_summary
        # The cast list is stored as a single " | "-separated string.
        df.loc[imdb_id, "top_cast"] = " | ".join(record.top_cast)

# Overwrites the same CSV that was loaded at the top of the notebook.
df.to_csv("data/imdb_data.csv")

In [7]:
# Fraction of rows that still have no plot summary after the merge.
df["plot_summary"].isna().mean()

0.000821186614658181