#### Imports

In [34]:
from selenium import webdriver
import pandas as pd 
import time
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import requests

### Part 1 - Browser Automation with Selenium

In [65]:
driver = webdriver.Chrome(executable_path= 'C:\webdrivers\chromedriver.exe')
driver.get('https://imdb.com')

# maximize window
driver.maximize_window()

# dropdown
dropdown = driver.find_element_by_class_name('ipc-icon--arrow-drop-down')
dropdown.click()

# avanced search from dropdown menu
element = driver.find_element_by_link_text('Advanced Search')
element.click()

# click on avanced title search
adv_title = driver.find_element_by_link_text('Advanced Title Search')
adv_title.click() 

# select feature film
feature_film = driver.find_element_by_id('title_type-1')
feature_film.click() 

# select tv movie
tv_movie = driver.find_element_by_id('title_type-2')
tv_movie.click()

# min date
min_date = driver.find_element_by_name('release_date-min')
min_date.click()
min_date.send_keys('1990')

# max date
max_date = driver.find_element_by_name('release_date-max')
max_date.click()
max_date.send_keys('2020')

# rating min
rating_min = driver.find_element_by_name('user_rating-min')
rating_min.click()
dropdown_2 = Select(rating_min)
dropdown_2.select_by_visible_text('1.0')

# rating max
rating_max = driver.find_element_by_name('user_rating-max')
rating_max.click()
dropdown_3 = Select(rating_max)
dropdown_3.select_by_visible_text('10')

# oscar nominated
oscar_nominated = driver.find_element_by_id('groups-7')
oscar_nominated.click()

# color
color = driver.find_element_by_id('colors-1')
color.click()

# language
language = driver.find_element_by_name('languages')
dropdown_4 = Select(language)
dropdown_4.select_by_visible_text('English')

# 250 results
results_count = driver.find_element_by_id('search-count')
dropdown_5 = Select(results_count)
dropdown_5.select_by_index(2)

# submit
submit = driver.find_element_by_xpath('(//button[@type="submit"])[2]')
submit.click()

# current
current_url = driver.current_url 

##############

# get request
response = requests.get(current_url)

# soup object
soup = BeautifulSoup(response.content, 'html.parser')

# result items (starting point)
list_items = soup.find_all('div', {'class':'lister-item'})


# list comprehension
movie_title = [result.find('h3').find('a').get_text() for result in list_items]
year = [result.find('h3').find('span', {'class':'lister-item-year'}).get_text().replace('(', '').replace(')', '') for result in list_items]
duration = [result.find('span', {'class':'runtime'}).get_text() for result in list_items]
genre = [result.find('span', {'class':'genre'}).get_text().strip() for result in list_items]
rating = [result.find('div', {'class':'ratings-imdb-rating'}).get_text().strip() for result in list_items]

# create dataframe
imdb_df = pd.DataFrame({'Movie Title': movie_title, 'Year': year, 'Duration':duration,
                       'Genre': genre, 'Rating':rating})

### Part 2 - Data Extraction with Beautiful Soup

In [36]:
# get request
response = requests.get(current_url)

In [37]:
# status code
response.status_code

200

In [38]:
# soup object
soup = BeautifulSoup(response.content, 'html.parser')

In [39]:
soup


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Feature Film/TV Movie,
Released between 1990-01-01 and 2020-12-31,
User Rating between 1 and 10,
Oscar-Nominated,
Color,
English
(Sorted by Popularity Ascending) - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>


In [40]:
# result items (starting point)
list_items = soup.find_all('div', {'class':'lister-item'})

In [41]:
len(list_items)

250

#### Data we need to extract

- movie title
- year
- duration
- genre
- rating

In [45]:
# movie title
list_items[0].find('h3').find('a').get_text()

'Suicide Squad'

In [50]:
# year
list_items[0].find('h3').find('span', {'class':'lister-item-year'}).get_text().replace('(', '').replace(')', '')

'2016'

In [52]:
# duration
list_items[0].find('span', {'class':'runtime'}).get_text()

'123 min'

In [55]:
# genre
list_items[0].find('span', {'class':'genre'}).get_text().strip()

'Action, Adventure, Fantasy'

In [60]:
# rating
list_items[0].find('div', {'class':'ratings-imdb-rating'}).get_text().strip()

'5.9'

In [62]:
# list comprehension
movie_title = [result.find('h3').find('a').get_text() for result in list_items]
year = [result.find('h3').find('span', {'class':'lister-item-year'}).get_text().replace('(', '').replace(')', '') for result in list_items]
duration = [result.find('span', {'class':'runtime'}).get_text() for result in list_items]
genre = [result.find('span', {'class':'genre'}).get_text().strip() for result in list_items]
rating = [result.find('div', {'class':'ratings-imdb-rating'}).get_text().strip() for result in list_items]


In [63]:
imdb_df = pd.DataFrame({'Movie Title': movie_title, 'Year': year, 'Duration':duration,
                       'Genre': genre, 'Rating':rating})

In [64]:
imdb_df

Unnamed: 0,Movie Title,Year,Duration,Genre,Rating
0,Suicide Squad,2016,123 min,"Action, Adventure, Fantasy",5.9
1,Once Upon a Time In... Hollywood,2019,161 min,"Comedy, Drama",7.6
2,Tenet,2020,150 min,"Action, Sci-Fi, Thriller",7.4
3,A Quiet Place,2018,90 min,"Drama, Horror, Sci-Fi",7.5
4,Avengers: Endgame,2019,181 min,"Action, Adventure, Drama",8.4
...,...,...,...,...,...
245,Die Chroniken von Narnia - Der König von Narnia,2005,143 min,"Adventure, Family, Fantasy",6.9
246,Minority Report,2002,145 min,"Action, Crime, Mystery",7.6
247,Silver Linings,2012,122 min,"Comedy, Drama, Romance",7.7
248,Legenden der Leidenschaft,1994,133 min,"Drama, Romance, War",7.5


In [None]:
# output in excel
imdb_df.to_excel('imdb_multiple_pages.xlsx', index=False)