In [10]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
import concurrent.futures
from itertools import repeat
import time as t

# Search movie title on IMDB

In [11]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def open_driver(url="https://www.imdb.com/"):
    """Start a Chrome session, load ``url`` and return ``(driver, wait)``.

    Parameters
    ----------
    url : str, optional
        Page to open once the browser is up. Defaults to the IMDB home page.
        (Previously this argument was overwritten by a hard-coded constant,
        so whatever the caller passed was ignored.)

    Returns
    -------
    tuple
        ``(driver, wait)`` where ``driver`` is the ``webdriver.Chrome``
        instance and ``wait`` is a ``WebDriverWait`` bound to it with a
        20 second timeout.

    Raises
    ------
    Exception
        Whatever ``driver.get`` raises is re-raised after the browser has
        been shut down, so a failed navigation does not leak a Chrome process.
    """
    options = webdriver.ChromeOptions()
    # Value 2 means "block" for Chrome content settings; skipping images keeps
    # page loads light. NOTE(review): the "css" key is passed through as in the
    # original notebook -- whether Chrome honours it has not been verified.
    options.add_experimental_option(
        "prefs",
        {
            "profile.managed_default_content_settings.images": 2,
            "profile.managed_default_content_settings.css": 2,
        },
    )

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
    except Exception:
        # The caller never receives the driver on failure, so it must be
        # released here.
        driver.quit()
        raise

    wait = WebDriverWait(driver, 20)
    return driver, wait

In [12]:
def find_movie_year(movie_name, driver, wait):
    """Search the site for ``movie_name`` and return the year of the first hit.

    The function waits for the element with id ``suggestion-search`` to become
    clickable, types the stripped title into it, submits the form and then
    parses ``driver.page_source`` with BeautifulSoup.

    Parameters
    ----------
    movie_name : str
        Title to search for; surrounding whitespace is removed before typing.
    driver : selenium WebDriver
        Browser session that is already on a page containing the search box.
    wait : WebDriverWait
        Wait helper bound to ``driver``.

    Returns
    -------
    int or float
        The year as ``int``. For a range such as ``"2008–2013"`` the part
        before the en dash is used. ``numpy.nan`` is returned when the page
        has no matching element or its text is not a number.

    Raises
    ------
    Exception
        Whatever ``wait.until`` or the element interactions raise (for
        example a timeout while waiting for the search box) propagates to the
        caller; only parsing problems are mapped to ``numpy.nan``.
    """
    search_box = wait.until(EC.element_to_be_clickable((By.ID, "suggestion-search")))
    search_box.click()
    search_box.send_keys(movie_name.strip())
    search_box.submit()

    # NOTE(review): the page source is read straight after submit(); this
    # relies on the driver having finished loading the results page by the
    # time page_source is served -- confirm, or add an explicit wait.
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Presumably the first span with this class carries the release year of
    # the top search result -- TODO confirm against the live markup.
    year_tag = soup.find("span", class_="ipc-metadata-list-summary-item__li")
    if year_tag is None:
        # No result list on the page: previously this crashed with
        # AttributeError on ``None.text`` and took the whole worker down.
        return np.nan

    # "1999" -> "1999"; "2008–2013" -> "2008". int() tolerates surrounding
    # whitespace, so a single parse covers both shapes.
    first_part = year_tag.text.split("–")[0]
    try:
        return int(first_part)
    except ValueError:
        return np.nan

In [13]:
def get_n_movie_years(movie_names, dict_output=None):
    """Look up the year for each title in one browser session.

    Parameters
    ----------
    movie_names : iterable of str
        Titles to look up. Titles already present as keys in ``dict_output``
        are skipped.
    dict_output : dict, optional
        Mapping ``title -> year`` that is updated in place and returned. When
        omitted a new dict is created for this call. (The former ``{}``
        default was a single dict shared by every call, so results leaked
        from one call into the next.)

    Returns
    -------
    dict
        ``dict_output`` (the object passed in, or the new dict) with one
        entry per processed title.

    Raises
    ------
    Exception
        Anything raised by ``open_driver`` or ``find_movie_year`` propagates;
        the browser is shut down first.
    """
    if dict_output is None:
        dict_output = {}

    movie_names = list(movie_names)
    if not movie_names:
        # Nothing to look up: do not pay for launching a browser.
        return dict_output

    driver, wait = open_driver("https://www.imdb.com/")
    try:
        for movie_name in movie_names:
            if movie_name not in dict_output:
                dict_output[movie_name] = find_movie_year(movie_name, driver, wait)
    finally:
        # quit() ends the whole WebDriver session; close() only closes the
        # current window. Running it in ``finally`` releases the browser even
        # when a lookup raises.
        driver.quit()
    return dict_output

In [14]:
def multiple_windows(movie_list, dict_output=None, workers=5):
    """Look up years for ``movie_list`` using several browser sessions in parallel.

    The titles are split into at most ``workers`` chunks with
    ``np.array_split`` and each non-empty chunk is handed to
    ``get_n_movie_years`` on its own thread.

    Parameters
    ----------
    movie_list : sequence of str
        Titles to look up.
    dict_output : dict, optional
        Mapping ``title -> year`` shared by all workers, updated in place and
        returned. A new dict is created when omitted (the former ``{}``
        default was one dict shared by every call).
    workers : int, optional
        Number of chunks and maximum number of threads. Defaults to 5.

    Returns
    -------
    dict
        ``dict_output`` filled with the results of every worker that finished.
        If a worker raises, a ``RuntimeWarning`` is issued and the results
        collected so far are still returned.
    """
    import warnings

    if dict_output is None:
        dict_output = {}

    # Drop empty chunks (they appear when workers > len(movie_list)) so that
    # no thread is started with nothing to do; tolist() yields plain Python
    # objects instead of numpy scalars.
    chunks = [
        chunk.tolist()
        for chunk in np.array_split(movie_list, workers)
        if len(chunk)
    ]
    if not chunks:
        return dict_output

    # NOTE(review): every worker writes into the same dict. Each write is a
    # single item assignment; no additional locking is done here.
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [
            executor.submit(get_n_movie_years, chunk, dict_output)
            for chunk in chunks
        ]
        for future in concurrent.futures.as_completed(futures):
            # The previous executor.map(...) result was never consumed, so a
            # crashed worker went unnoticed. Report it, but keep the partial
            # results instead of aborting the whole run.
            error = future.exception()
            if error is not None:
                warnings.warn(
                    f"A browser worker failed; its remaining titles were skipped: {error!r}",
                    RuntimeWarning,
                )
    return dict_output

In [15]:
# dict = multiple_windows(pd.read_csv("../data/pre_release.csv")["movie_title"].values, workers=5)

In [16]:
# transform dictionary to dataframe with movie_title and year columns
# df = pd.DataFrame(list(dict.items()), columns=["movie_title", "year"])
# df.to_csv("../data/movie_years.csv", index=False)

In [20]:
# Load the per-title year table and the full dataset from the data folder.
release_df = pd.read_csv("../data/release_years.csv")
full = pd.read_csv("../data/full_dataset.csv")

# Left join on the title: every row of `full` is kept, and titles that have no
# match in `release_df` get NaN in the columns that come from it. The merged
# frame is the cell's displayed result and is not stored in a variable.
pd.merge(full, release_df, on="movie_title", how="left")

Unnamed: 0,movie_title,duration,genres,language,country,content_rating,budget,genre_Drama,genre_Comedy,genre_Romance,...,actor_3_facebook_likes,cast_total_facebook_likes,gross,imdb_score,movie_facebook_likes,num_critic_for_reviews,num_voted_users,num_user_for_reviews,gross_margin,year
0,The Spanish Prisoner,110.0,Drama|Mystery|Thriller,English,USA,PG,10000000.0,True,False,False,...,393.0,1722,10200000.0,7.3,578,97.0,18697,263.0,0.020000,1997.0
1,Oceans,104.0,Documentary|Drama,French,France,G,40000000.0,True,False,False,...,7.0,152,19406406.0,7.8,0,113.0,7630,42.0,-0.514840,2004.0
2,Exorcist: The Beginning,125.0,Horror|Mystery|Thriller,English,USA,R,50000000.0,False,False,False,...,394.0,1970,41814863.0,5.1,893,167.0,28635,308.0,-0.163703,2004.0
3,First Knight,134.0,Action|Adventure|Romance|Thriller,English,USA,PG-13,55000000.0,False,False,True,...,249.0,1695,37600435.0,5.9,0,53.0,55350,180.0,-0.316356,1995.0
4,This Is It,111.0,Documentary|Music,English,USA,PG,60000000.0,False,False,False,...,6.0,679,71844424.0,7.3,0,136.0,33158,223.0,0.197407,2009.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,The Circle,90.0,Drama,Persian,Iran,Not Rated,10000.0,True,False,False,...,0.0,5,673780.0,7.5,697,64.0,4555,26.0,66.378000,2020.0
1042,The Cure,111.0,Crime|Horror|Mystery|Thriller,Japanese,Japan,,1000000.0,False,False,False,...,6.0,115,94596.0,7.4,817,78.0,6318,50.0,-0.905404,2018.0
1043,El Mariachi,81.0,Action|Crime|Drama|Romance|Thriller,Spanish,USA,R,7000.0,True,False,True,...,6.0,147,2040920.0,6.9,0,56.0,52055,130.0,290.560000,1992.0
1044,Newlyweds,95.0,Comedy|Drama,English,USA,Not Rated,9000.0,True,True,False,...,133.0,690,4584.0,6.4,413,14.0,1338,14.0,-0.490667,2003.0
