In [1]:
import os

import pandas as pd
import sqlalchemy as db

# The connection URL can be supplied through the IMDB_DB_URL environment variable so
# that real credentials never have to be written into the notebook. The fallback is
# the original hard-coded URL (presumably a local development container -- confirm),
# kept so the notebook still runs when the variable is not set.
connection_str = os.environ.get(
    'IMDB_DB_URL',
    'mysql+pymysql://root:admin@172.17.0.2:3306/imdb',
)
engine = db.create_engine(connection_str)
# `conn` is deliberately left open: later cells reuse it for their queries.
conn = engine.connect()

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from tqdm import tqdm

In [2]:
# Minimum `numVotes` a title must have in `title_ratings` to be selected.
MIN_VOTES = 1900

# The WHERE clause filters on a `title_ratings` column, which discards every row the
# LEFT JOIN would have padded with NULLs, so an INNER JOIN returns the same rows and
# states the intent directly. The threshold is passed as a bound parameter instead of
# being inlined in the SQL text.
query = '''
    SELECT tb.tconst FROM title_basics tb
    INNER JOIN title_ratings tr ON tb.tconst = tr.tconst
    WHERE tr.numVotes >= :min_votes
'''
tconsts = pd.read_sql(
    db.text(query),
    conn,
    params={'min_votes': MIN_VOTES},
)['tconst'].tolist()
tconsts[:10]

['tt0160904',
 'tt0204993',
 'tt0205700',
 'tt0206476',
 'tt0206511',
 'tt0210413',
 'tt0212671',
 'tt0212686',
 'tt0213327',
 'tt0219446']

In [3]:
def scrap_country(tconst, timeout=30):
    """Scrape the country of origin shown on a title's IMDb page.

    Downloads ``https://www.imdb.com/title/<tconst>/``, locates the ``<li>``
    element tagged ``data-testid="title-details-origin"`` and returns the text
    of the first link inside it.

    Args:
        tconst: IMDb title identifier used to build the page URL
            (e.g. ``'tt0160904'``).
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot block the calling thread forever.

    Returns:
        The text of the first ``<a>`` inside the origin ``<li>``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for
            non-success status codes); timeouts also surface as exceptions.
        AttributeError: If the page has no origin ``<li>`` or that element
            contains no link (``find`` returned ``None``).
    """
    url = f'https://www.imdb.com/title/{tconst}/'
    # Send a browser-like User-Agent instead of urllib's default one
    # (presumably IMDb rejects the default -- confirm).
    req = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response even if reading fails.
    with urlopen(req, timeout=timeout) as response:
        webpage = response.read()
    soup = BeautifulSoup(webpage, features='lxml')

    origin_li = soup.find('li', {'data-testid': 'title-details-origin'})
    link = origin_li.find('a')
    return link.text

In [4]:
from multiprocessing.pool import ThreadPool as Pool

In [7]:
# Number of worker threads used to scrape pages concurrently.
pool_size = 10

# Results keyed by tconst; filled in by the worker function as pages are scraped.
countries_of_origin = {}
def worker_1(tconst):
    """Scrape one title and record the outcome in ``countries_of_origin``.

    Stores the value returned by ``scrap_country`` under ``tconst``; if the
    scrape raises any ``Exception``, ``None`` is stored instead so that one
    failing title does not abort the whole run.
    """
    try:
        country = scrap_country(tconst)
    except Exception:
        country = None
    countries_of_origin[tconst] = country

# Run the scraper over every title with a pool of threads. The `with` block tears the
# pool down once the loop finishes, so re-running this cell does not leave idle worker
# threads behind. The loop body is empty on purpose: iterating only drains
# `imap_unordered` so that tqdm can report progress as tasks complete.
with Pool(pool_size) as pool:
    for _ in tqdm(pool.imap_unordered(worker_1, tconsts), total=len(tconsts)):
        pass

100%|██████████| 4076/4076 [20:50<00:00,  3.26it/s]


In [9]:
# One row per scraped title: the tconst and the country recorded for it
# (None where scraping failed). Keys and values come from the same dict, so the
# two columns stay aligned.
df = pd.DataFrame(
    {
        'tconst': list(countries_of_origin),
        'country_of_origin': list(countries_of_origin.values()),
    }
)
# Tab-separated export; the DataFrame index is written as the first column.
df.to_csv('../treated_datasets/countries_of_origin.tsv', sep='\t')