In [395]:
import requests
from bs4 import BeautifulSoup
import re

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
import time

base_url = 'https://www.imdb.com'
check_lists = ['(TV Series)', '(Short)', '(Video Game)', '(Video short)', '(Video)', '(TV Movie)', 
               '(TV Mini-Series)', '(TV Series short)', '(TV Special)', '(TV Movie documentary)']

def get_cast_page_soup(id):
    url = base_url + f"/title/tt{id}" + '/fullcredits/?ref_=tt_ov_st_sm'
    try:
        response = requests.get(url)
        if not response.status_code // 100 == 2:
            return f"Error: Response is not successful. {response}"
        soup = BeautifulSoup(response.text)
        return soup
    except requests.exceptions.HTTPError as e:
        return f"Error: {e}"

def get_actor_page_soup(url):
    try:
        response = requests.get(url)
        if not response.status_code // 100 == 2:
            return f"Error: Response is not successful. {response}"
        else:
            soup = BeautifulSoup(response.text)
            return soup
    except requests.exceptions.HTTPError as e:
        return f"Error: {e}"

def get_actors_by_movie_soup(cast_page_soup, num_of_actors_limit = None):
    actor_table = cast_page_soup.find('table', class_ ='cast_list')

    actor_lists = []

    for row in actor_table.findAll('tr')[1:]:
        columns = row.findAll('td')
        if len(columns) > 1:
            actor_info = row.findAll('td')[1]
            actor_url = actor_info.find('a').get('href')
            full_actor_url = base_url + f"{actor_url}"
            actor_name = actor_info.find('a').getText().strip('\n')

            actor_lists.append((actor_name, full_actor_url))
    
    if num_of_actors_limit:
        return actor_lists[:num_of_actors_limit]

    return actor_lists

def get_dynamic_movies_list(url):
    driver = webdriver.Chrome('/Users/joyceliu/Downloads/chromedriver')
    site = driver.get(url)
    
    hide_all = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//span[@class='filmo-show-hide-all']")))
    action = ActionChains(driver)
    # double click operation and perform
    action.double_click(hide_all).perform()
    
    
    time.sleep(20)
     
    
    try:
        actor_links_to_click = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//a[text()="Actress" or text()="Actor"]')))
        actor_links_to_click.click()
    
        time.sleep(1)

        movies_table = driver.find_elements(By.XPATH, '//div[contains(@id, "actress") or contains(@id, "actor")]')
        print(movies_table)
        html = ''
    
        for movie in movies_table:
            html += f"<{movie.tag_name} class = 'filmo-row'>{movie.get_attribute('innerHTML')}</{movie.tag_name}>"
    
        soup = BeautifulSoup(html,'html.parser')
        
        return soup
    
    except NoSuchElementException as e:
        return f"Error: {e}"
    

def test(actor_page_soup, num_of_movies_limit = None):
    movie_lists = []

    for row in actor_page_soup.findAll('div', class_='filmo-row'):
        if row.find('b'):
            text = row.find('b').next_sibling
            text = text.replace(r"\n{2,}","\n")
            text = text.strip()
            text_arr = re.findall(r'\(.+?\)|".+?"|\w+' , text)
            links = row.findAll('a')
            movie_name = row.find('a').text
            movie_url = row.find('a').get('href')
            full_movie_url = base_url + movie_url

            if len(links) == 1:
                if len(text_arr) > 1:
                    text_to_check = text_arr[0]
                    if text_to_check not in check_lists:
                        movie_lists.append((movie_name, full_movie_url))
                elif len(text_arr) == 1:
                    text_to_check = text_arr[0]
                    if text_to_check not in check_lists:
                        movie_lists.append((movie_name, full_movie_url))
                else:
                    movie_lists.append((movie_name, full_movie_url))   
    
    return movie_lists

def get_movies_by_actor_soup(actor_page_soup, num_of_movies_limit = None):
    movie_table = actor_page.findAll('div', id=re.compile('^actor-|^actress-'))
    movie_lists = []
    
    for row in movie_table:
        if row.find('b'):
            text = row.find('b').next_sibling
            text = text.replace(r"\n{2,}","\n")
            text = text.strip()
            text_arr = re.findall(r'\(.+?\)|".+?"|\w+' , text)
            links = row.findAll('a')
            movie_name = row.find('a').text
            movie_url = row.find('a').get('href')
            full_movie_url = base_url + movie_url

            if len(links) == 1:
                if len(text_arr) > 1:
                    text_to_check = text_arr[0]
                    if text_to_check not in check_lists:
                        movie_lists.append((movie_name, full_movie_url))
                elif len(text_arr) == 1:
                    text_to_check = text_arr[0]
                    if text_to_check not in check_lists:
                        movie_lists.append((movie_name, full_movie_url))
                else:
                    movie_lists.append((movie_name, full_movie_url))   
    
    return movie_lists


In [308]:
cast_group_soup = get_cast_page_soup('0111161')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [139]:
ret = get_actors_by_movie_soup(cast_group_soup)

In [140]:
ret

[(' Tim Robbins', 'https://www.imdb.com/name/nm0000209/'),
 (' Morgan Freeman', 'https://www.imdb.com/name/nm0000151/'),
 (' Bob Gunton', 'https://www.imdb.com/name/nm0348409/'),
 (' William Sadler', 'https://www.imdb.com/name/nm0006669/'),
 (' Clancy Brown', 'https://www.imdb.com/name/nm0000317/'),
 (' Gil Bellows', 'https://www.imdb.com/name/nm0004743/'),
 (' Mark Rolston', 'https://www.imdb.com/name/nm0001679/'),
 (' James Whitmore', 'https://www.imdb.com/name/nm0926235/'),
 (' Jeffrey DeMunn', 'https://www.imdb.com/name/nm0218810/'),
 (' Larry Brandenburg', 'https://www.imdb.com/name/nm0104594/'),
 (' Neil Giuntoli', 'https://www.imdb.com/name/nm0321358/'),
 (' Brian Libby', 'https://www.imdb.com/name/nm0508742/'),
 (' David Proval', 'https://www.imdb.com/name/nm0698998/'),
 (' Joseph Ragno', 'https://www.imdb.com/name/nm0706554/'),
 (' Jude Ciccolella', 'https://www.imdb.com/name/nm0161980/'),
 (' Paul McCrane', 'https://www.imdb.com/name/nm0005204/'),
 (' Renee Blaine', 'https://

In [389]:
actor_page = get_actor_page_soup('https://www.imdb.com/name/nm3078932/')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [390]:
actor_page

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///name/nm3078932?src=mdot" name="apple-itunes-app"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Lady Gaga - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
</script>
<link href="https://www.imdb.com/name/nm

In [393]:
ret1 = test(actor_page)

In [394]:
ret1

[('A Star Is Born', 'https://www.imdb.com/title/tt1517451/'),
 ("Frank Miller's Sin City: A Dame to Kill For",
  'https://www.imdb.com/title/tt0458481/'),
 ('Muppets Most Wanted', 'https://www.imdb.com/title/tt2281587/'),
 ('Machete Kills', 'https://www.imdb.com/title/tt2002718/'),
 ('Men in Black 3', 'https://www.imdb.com/title/tt1409024/')]