# IMPORTING REQUIRED LIBRARIES

In [1]:
from selenium import webdriver                          #For opening browser       
import pandas as pd                                     #For creating dataframe
from bs4 import BeautifulSoup                           #For scraping data
import time                                             #For waiting
import numpy as np                                      #For array
from tqdm import tqdm                                   #For progress bar

# OPENING REQUIRED SITE IN BROWSER

In [2]:
driver = webdriver.Firefox()

In [3]:
driver.get('https://www.imdb.com/chart/bottom/')

# GETTING HTML CODE OF OPENED WEBPAGE

In [4]:
html = driver.page_source

# CONVERTING EXTRACTED HTML CODE INTO SOUP TO SCRAP DATA

In [5]:
soup = BeautifulSoup(html,"html.parser")

# EXTRACTING LINKS TO THE INDIVIDUAL WEBPAGES OF MOVIES

In [6]:
link_list = [soup.findAll('li', {'class':'ipc-metadata-list-summary-item sc-59b6048d-0 cuaJSp cli-parent'})[i].findAll('a')[0].get('href') for i in range(100)]

# FUNCTION TO EXTRACT NAME OF THE MOVIE

In [7]:
def get_name(soup):
    basic_info = soup.findAll('div',{'class':'sc-dffc6c81-0 grcyBP'})[0]
    return basic_info.findAll('span')[0].text

# FUNCTION TO EXTRACT RELEASE YEAR OF THE MOVIE

In [8]:
def get_year(soup):
    try:
        basic_info = soup.findAll('div',{'class':'sc-dffc6c81-0 grcyBP'})[0]
        return basic_info.findAll('a')[0].text
    except Exception as e:
        return np.nan

# FUNCTION TO EXTRACT CERTIFICATION RATE OF THE MOVIE

In [9]:
def get_rate(soup):
    try:
        basic_info = soup.findAll('div',{'class':'sc-dffc6c81-0 grcyBP'})[0]
        return basic_info.findAll('a')[1].text
    except Exception as e:
        return np.nan

# FUNCTION TO EXTRACT DURATION OF THE MOVIE

In [10]:
def get_duration(soup):
    try:
        basic_info = soup.findAll('div',{'class':'sc-dffc6c81-0 grcyBP'})[0]
        return basic_info.findAll('li', {'class':'ipc-inline-list__item'})[2].text
    except Exception as e:
        return np.nan

# FUNCTION TO EXTRACT RATING OF THE MOVIE

In [11]:
def get_rating(soup):
    try:
        return soup.findAll('span', {'class':'sc-bde20123-1 cMEQkK'})[0].text
    except Exception as e:
        return np.nan

# FUNCTION TO EXTRACT NUMBER OF USERS OF THE MOVIE

In [12]:
def get_users(soup):
    try:
        return soup.findAll('div', {'class':'sc-bde20123-3 gPVQxL'})[0].text
    except Exception as e:
        return np.nan

# FUNCTION TO EXTRACT DIRECTOR NAME OF THE MOVIE

In [13]:
def get_director(soup):
    try:
        dir_data = soup.findAll('li', {'class':'ipc-metadata-list__item'})[0].findAll('a')
        dir_list = [dir.text for dir in dir_data]
        return dir_list
    except Exception as e:
        return np.nan

# FUNCTION TO EXTRACT WRITER NAME OF THE MOVIE

In [14]:
def get_writer(soup):
    try:
        wrt_data = soup.findAll('li', {'class':'ipc-metadata-list__item'})[1].findAll('a')
        wrt_list = [wrt.text for wrt in wrt_data]
        return wrt_list
    except Exception as e:
        return np.nan

# FUNCTION TO EXTRACT GENRE OF THE MOVIE

In [15]:
def get_genre(soup):
    try:
        genre_data = soup.findAll('div', {'class':'ipc-chip-list__scroller'})[0].findAll('a')
        genre_list = [genre.text for genre in genre_data]
        return genre_list
    except Exception as e:
        return np.nan

# FUNCTION TO EXTRACT STARS OF THE MOVIE

In [16]:
def get_stars(soup):
    try:
        star_data = soup.findAll('div', {'class':'ipc-metadata-list-item__content-container'})[2].findAll('a')
        star_list = [star.text for star in star_data]
        return star_list
    except Exception as e:
        return np.nan

# MAIN LOOP TO ITERATE THROUGH EACH MOVIE'S WEBPAGE AND EXTRACT DATA

In [17]:
name = []
year = []
rate = []
duration = []
rating = []
reviews = []
dir = []
writers = []
genre = []
stars = []
for link in tqdm(link_list):
    driver.get(f'https://www.imdb.com{link}')
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    name.append(get_name(soup))
    year.append(get_year(soup))
    rate.append(get_rate(soup))
    duration.append(get_duration(soup))
    rating.append(get_rating(soup))
    reviews.append(get_users(soup))
    dir.append(get_director(soup))
    writers.append(get_writer(soup))
    genre.append(get_genre(soup))
    stars.append(get_stars(soup))
    

100%|██████████| 100/100 [11:26<00:00,  6.86s/it]


# CREATING DATAFRAME FOR COLLECTED DATASET 

In [18]:
df = pd.DataFrame({
    'rank':np.arange(1,101),
    'name':name,
    'year':year,
    'certification':rate,
    'duration':duration,
    'rating':rating,
    'review_count':reviews,
    'director':dir,
    'writer':writers,
    'genre':genre,
    'stars':stars
})

# CHECKING THE DATASET

In [19]:
df.head()

Unnamed: 0,rank,name,year,certification,duration,rating,review_count,director,writer,genre,stars
0,1,Disaster Movie,2008,PG-13,1h 27m,1.9,94K,"[Jason Friedberg, Aaron Seltzer]","[Jason Friedberg, Aaron Seltzer]","[Comedy, Sci-Fi]","[Carmen Electra, Vanessa Lachey, Nicole Parker]"
1,2,Manos: The Hands of Fate,1966,Not Rated,1h 10m,1.6,37K,[Harold P. Warren],[Harold P. Warren],[Horror],"[Tom Neyman, John Reynolds, Diane Adelson]"
2,3,Birdemic: Shock and Terror,2010,Not Rated,1h 45m,1.7,25K,[James Nguyen],"[Writer, James Nguyen, ]","[Horror, Thriller]","[Alan Bagh, Whitney Moore, Tippi Hedren]"
3,4,Superbabies: Baby Geniuses 2,2004,PG,1h 28m,1.5,32K,[Bob Clark],"[Writers, Robert Grasmere, Francisca Matos, St...","[Comedy, Family, Sci-Fi]","[Jon Voight, Scott Baio, Vanessa Angel]"
4,5,The Hottie & the Nottie,2008,PG-13,1h 31m,2.0,39K,[Tom Putnam],[Heidi Ferrer],"[Comedy, Romance]","[Paris Hilton, Joel David Moore, Christine Lakin]"


In [20]:
df.sample(20)

Unnamed: 0,rank,name,year,certification,duration,rating,review_count,director,writer,genre,stars
62,63,The Master of Disguise,2002,PG,1h 20m,3.3,27K,[Perry Andelin Blake],"[Dana Carvey, Harris Goldberg]","[Adventure, Comedy, Family]","[Dana Carvey, Jennifer Esposito, Harold Gould]"
10,11,Battlefield Earth,2000,PG-13,1h 57m,2.5,82K,[Roger Christian],"[Corey Mandell, J.D. Shapiro, L. Ron Hubbard]","[Action, Adventure, Sci-Fi]","[John Travolta, Forest Whitaker, Barry Pepper]"
21,22,Date Movie,2006,PG-13,1h 23m,2.8,62K,"[Aaron Seltzer, Jason Friedberg]","[Jason Friedberg, Aaron Seltzer]","[Comedy, Romance]","[Alyson Hannigan, Adam Campbell, Fred Willard]"
28,29,Jaws: The Revenge,1987,PG-13,1h 29m,3.0,49K,[Joseph Sargent],"[Writers, Peter Benchley, Michael De Guzman, ]","[Adventure, Horror, Thriller]","[Lorraine Gary, Lance Guest, Mario Van Peebles]"
78,79,The NeverEnding Story III,1994,G,1h 35m,3.2,12K,[Peter MacDonald],"[Jeff Lieberman, Karin Howard, Michael Ende]","[Adventure, Comedy, Family]","[Jason James Richter, Melody Kay, Jack Black]"
2,3,Birdemic: Shock and Terror,2010,Not Rated,1h 45m,1.7,25K,[James Nguyen],"[Writer, James Nguyen, ]","[Horror, Thriller]","[Alan Bagh, Whitney Moore, Tippi Hedren]"
72,73,3 Ninjas: High Noon at Mega Mountain,1998,PG,1h 33m,2.9,12K,[Sean McNamara],"[Sean McNamara, Jeff Phillips]","[Action, Adventure, Comedy]","[Loni Anderson, Hulk Hogan, Jim Varney]"
84,85,Black Christmas,2019,PG-13,1h 32m,3.5,18K,[Sophia Takal],"[Sophia Takal, April Wolfe, Roy Moore]","[Horror, Mystery, Thriller]","[Imogen Poots, Aleyse Shannon, Lily Donoghue]"
55,56,Breach,2020,R,1h 32m,2.9,19K,[John Suits],"[Edward Drake, Corey Large]","[Action, Horror, Sci-Fi]","[Cody Kearsley, Bruce Willis, Rachel Nichols]"
14,15,Race 3,2018,Not Rated,2h 40m,1.9,48K,[Remo D'Souza],"[Writers, Shiraz Ahmed, Kiran Kotrial, Athar N...","[Action, Crime, Thriller]","[Anil Kapoor, Salman Khan, Bobby Deol]"


In [21]:
df.tail()

Unnamed: 0,rank,name,year,certification,duration,rating,review_count,director,writer,genre,stars
95,96,The Love Guru,2008,PG-13,1h 27m,3.8,54K,[Marco Schnabel],"[Mike Myers, Graham Gordy]","[Comedy, Romance, Sport]","[Mike Myers, Jessica Alba, Romany Malco]"
96,97,The Hungover Games,2014,R,1h 25m,3.5,16K,[Josh Stolberg],"[Writers, Kyle Barnett Anderson, David Bernste...","[Adventure, Comedy, Fantasy]","[Ben Begley, Herbert Russell, Ross Nathan]"
97,98,In the Name of the King: A Dungeon Siege Tale,2007,PG-13,2h 7m,3.8,51K,[Uwe Boll],"[Writers, Doug Taylor, Jason Rappaport, Dan St...","[Action, Adventure, Fantasy]","[Jason Statham, Ron Perlman, Ray Liotta]"
98,99,The Avengers,1998,PG-13,1h 29m,3.8,45K,[Jeremiah S. Chechik],"[Sydney Newman, Don MacPherson]","[Action, Adventure, Sci-Fi]","[Ralph Fiennes, Uma Thurman, Sean Connery]"
99,100,The Flintstones in Viva Rock Vegas,2000,PG,1h 30m,3.7,25K,[Brian Levant],"[Writers, William Hanna, Joseph Barbera, Debor...","[Comedy, Family, Romance]","[Mark Addy, Stephen Baldwin, Kristen Johnston]"


# SAVING THE DATASET

In [22]:
df.to_csv('lowest_ranked_movies_data.csv',index= False)