#### Import libraries

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import re

In [2]:
def findNextPageURL(current_html):
    next_link = current_html.find('div', class_ = "desc")
    #print(next_link)
    if not next_link:
        return None
    pn_url = next_link.find_all('a')
    part_url = ''
    for _url in pn_url:
        if re.search(r"Next", _url.text):
            part_url = _url['href']
            
    if part_url == '':
        return None;
        
    response = None
    top1000_url = base_url + part_url
    for attempt in range(10):
        try:
            response = requests.get(top1000_url).text
        except:
            continue
        break
    
    if response:
        next_page_html = bs(response, 'html.parser')
    else:
        return None
    
    return next_page_html

In [3]:
top250_movie_url = "https://www.imdb.com/search/title/?groups=top_250&sort=user_rating"
response = requests.get(top250_movie_url)
top250_movie_html = bs(response.text, 'html.parser')
base_url = 'https://www.imdb.com/'
top250_movies_list = []
i = 1
while True:
    top250_movies_list = top250_movies_list + top250_movie_html.find_all('div', class_ = "lister-item mode-advanced")
    top250_movie_html = findNextPageURL(top250_movie_html)
    if pd.isnull(top250_movie_html):
        break
    print(i)
    i += 1
print(len(top250_movies_list))

1
2
3
4
250


In [162]:
def tile_info(main):
    title_info_dict = {}
    try:
        title_section = main.find('div', class_ = 'TitleBlock__TitleContainer-sc-1nlhx7j-1 jxsVNt')
    
        title_info_dict['Title'] =  title_section.h1.text
        if len(title_section.ul.contents) == 3:
            title_info_dict['Year'] = title_section.ul.select("li:nth-child(1)")[0].a.text
            title_info_dict['Certificate'] = title_section.ul.select("li:nth-child(2)")[0].a.text
            title_info_dict['Time'] = title_section.ul.select("li:nth-child(3)")[0].text
        elif len(title_section.ul.contents) == 2:
            title_info_dict['Year'] = title_section.ul.select("li:nth-child(1)")[0].a.text
            title_info_dict['Time'] = title_section.ul.select("li:nth-child(2)")[0].text
        else:
            title_info_dict['Year'] = title_section.ul.select("li:nth-child(1)")[0].a.text
    except Exception as e: print('title_info function ',  e)
    
    return title_info_dict

def content_info(main):
    content_info_dict = {}
    try:
        content_section = main.find('div', class_ = 'Hero__ContentContainer-kvkd64-10 eaUohq')
        content_info_dict['Genere'] = [item.text for item in content_section.div.div.div.find_all('a')]
        content_info_dict['plot'] = content_section.div.div.p.text
        content_info_dict['Rating'] = content_section.div.select('div:nth-child(2)')[0].div.div.a.div.div.select('div:nth-child(2)')[0].div.span.text
        content_info_dict['Voting'] = content_section.div.select('div:nth-child(2)')[0].div.div.a.div.div.select('div:nth-child(2)')[0].select('div:nth-child(3)')[0].text
    except Exception as e: print('content_info function ', e)
    
    return content_info_dict
    
def cast_info(main):
    cast_info_dict = {}
    try:
        
        star_section = main.find('div', class_ = re.compile("title-cast__grid"))
        star_list = []
        for star in star_section.contents:
            star_list.append(star.contents[1].a.text)
        
        cast_info_dict['Stars'] = star_list
        
        cast_section = main.find('ul', class_ = re.compile("CastMetaDataList"))
        cast_section_itr = cast_section.li

        for i in range(2):
            key = cast_section_itr.contents[0].text
            if key == 'Writer':
                key = 'Writers'
            if key == 'Director':
                key = 'Directors'
            values = cast_section_itr.div.ul.find_all('li')
            v = []
            for value in values:
                v.append(value.a.text)
            cast_info_dict[key] = v
            cast_section_itr = cast_section_itr.find_next_sibling('li')
            
    except Exception as e: print('cast_info function ', e)
    
    return cast_info_dict

def review_info(main):
    review_info_dict = {}
    try:
        regex = re.compile('Hero__WatchContainer')
        list_scores = main.find('div', attrs = {'class':regex}).ul
        for child in list_scores.children:
            try:
                key_value = child.a.span
                key = key_value.select('span.label')[0].text
                value = key_value.select('span.score')[0].text
                review_info_dict[key] = value
            except Exception as e: print('review_info function ', e)
    
    except Exception as e: print(e)
        
    return review_info_dict


def extract_movie_info(url):
    movie_info_dict = {}
    try:
        # Read the URL and find the main body
        for attemp in range(10):
            try:
                r = requests.get(url)
            except Exception as e: print(e)
            break
            
        html_content = r.text
        soup = bs(html_content,'html.parser')
        #print(soup.prettify())
        main = soup.find('div', class_ = re.compile('ipc-page-content-container--full BaseLayout'))
        #print(main)
        
        # Find title section and read tite related info
        title_info_dict = tile_info(main)
        movie_info_dict.update(title_info_dict)
        
        # Find Genre, Plot and Ratings
        content_info_dict = content_info(main)
        movie_info_dict.update(content_info_dict)

        # Find Cast Info
        cast_info_dict = cast_info(main)
        movie_info_dict.update(cast_info_dict)

        # Find Other Scores
        review_info_dict = review_info(main)
        movie_info_dict.update(review_info_dict)

    except Exception as e: print(e)
    return movie_info_dict

In [173]:
print(pd.Series(extract_movie_info(top250_movies_info[220]['URL'])))

Title                                                      La Haine
Year                                                           1995
Certificate                                                      15
Time                                                       1h 38min
Genere                                               [Crime, Drama]
plot              24 hours in the lives of three young men in th...
Rating                                                          8.1
Voting                                                         158K
Stars             [Vincent Cassel, Hubert Koundé, Saïd Taghmaoui...
Directors                                       [Mathieu Kassovitz]
Writers                                         [Mathieu Kassovitz]
User reviews                                                    239
Critic reviews                                                  135
dtype: object


In [None]:
top250_movies_info = []
base_url = 'https://www.imdb.com/'
i = 1
for movie_item in top250_movies_list:
    if i >  1000:
        i+= 1
        continue
    part_url = movie_item.h3.a['href']
    full_url = base_url + part_url
    movie_info = extract_movie_info(full_url)
    movie_info['URL'] = full_url
    top250_movies_info.append(movie_info)
    #print(i, '\n', pd.Series(movie_info))
    print(i, movie_info['Title'])
    i += 1
    

In [187]:
top250_df = pd.DataFrame(top250_movies_info)
top250_df.tail(10)

Unnamed: 0,Title,Year,Certificate,Time,Genere,plot,Rating,Voting,Stars,Directors,Writers,User reviews,Critic reviews,Metascore,URL
240,On the Waterfront,1954,A,1h 48min,"[Crime, Drama, Thriller]",An ex-prize fighter turned longshoreman strugg...,8.1,147K,"[Marlon Brando, Karl Malden, Lee J. Cobb, Rod ...",[Elia Kazan],"[Budd Schulberg, Malcolm Johnson, Robert Siodmak]",361,170,91.0,https://www.imdb.com//title/tt0047296/
241,The Wages of Fear,1953,A,2h 11min,"[Adventure, Drama, Thriller]","In a decrepit South American village, four men...",8.1,57K,"[Yves Montand, Charles Vanel, Peter van Eyck, ...",[Henri-Georges Clouzot],"[Georges Arnaud, Henri-Georges Clouzot, Jérôme...",180,145,85.0,https://www.imdb.com//title/tt0046268/
242,The Third Man,1949,A,1h 44min,"[Film-Noir, Mystery, Thriller]",Pulp novelist Holly Martins travels to shadowy...,8.1,163K,"[Orson Welles, Joseph Cotten, Alida Valli, Tre...",[Carol Reed],"[Graham Greene, Orson Welles, Alexander Korda]",497,261,97.0,https://www.imdb.com//title/tt0041959/
243,Rebecca,1940,A,2h 10min,"[Drama, Mystery, Romance]",A self-conscious woman juggles adjusting to he...,8.1,130K,"[Laurence Olivier, Joan Fontaine, George Sande...",[Alfred Hitchcock],"[Daphne Du Maurier, Robert E. Sherwood, Joan H...",374,212,86.0,https://www.imdb.com//title/tt0032976/
244,Mr. Smith Goes to Washington,1939,U,2h 9min,"[Comedy, Drama]",A naive man is appointed to fill a vacancy in ...,8.1,110K,"[James Stewart, Jean Arthur, Claude Rains, Edw...",[Frank Capra],"[Sidney Buchman, Lewis R. Foster, Myles Connolly]",318,96,73.0,https://www.imdb.com//title/tt0031679/
245,Gone with the Wind,1939,PG,3h 58min,"[Drama, History, Romance]",A manipulative woman and a roguish man conduct...,8.1,299K,"[Clark Gable, Vivien Leigh, Thomas Mitchell, B...","[Victor Fleming, George Cukor, Sam Wood]","[Margaret Mitchell, Sidney Howard, Oliver H.P....",919,203,97.0,https://www.imdb.com//title/tt0031381/
246,It Happened One Night,1934,A,1h 45min,"[Comedy, Romance]",A renegade reporter and a crazy young heiress ...,8.1,97K,"[Clark Gable, Claudette Colbert, Walter Connol...",[Frank Capra],"[Robert Riskin, Samuel Hopkins Adams]",322,123,87.0,https://www.imdb.com//title/tt0025316/
247,The Passion of Joan of Arc,1928,A,1h 50min,"[Biography, Drama, History]","In 1431, Jeanne d'Arc is placed on trial on ch...",8.1,50K,"[Maria Falconetti, Eugene Silvain, André Berle...",[Carl Theodor Dreyer],"[Joseph Delteil, Carl Theodor Dreyer]",210,141,,https://www.imdb.com//title/tt0019254/
248,Sunrise: A Song of Two Humans,1927,A,1h 34min,"[Drama, Romance]",A sophisticated urban woman seduces a farmer i...,8.1,49K,"[George O'Brien, Janet Gaynor, Margaret Living...",[F.W. Murnau],"[Carl Mayer, Hermann Sudermann, Katherine Hill...",270,122,,https://www.imdb.com//title/tt0018455/
249,The General,1926,U,1h 7min,"[Action, Adventure, Comedy]",When Union spies steal an engineer's beloved l...,8.1,85K,"[Buster Keaton, Marion Mack, Glen Cavender, Ji...","[Clyde Bruckman, Buster Keaton]","[Buster Keaton, Clyde Bruckman, Al Boasberg]",311,120,,https://www.imdb.com//title/tt0017925/


#### Save Data

In [176]:
def save_csv(addr, df):
    df.to_csv(addr, index = False)

In [194]:
addr = "C:\\Users\\moham\\3D Objects\My Files\\Code\Python\\Test Codes\\Web Scraping\\IMDB top 1000\\Top250_Movies_Info.csv"
save_csv(addr , top250_df)

#### Load Data

In [184]:
def load_csv(addr):
    return pd.read_csv(addr)

In [246]:
addr = "C:\\Users\\moham\\3D Objects\My Files\\Code\Python\\Test Codes\\Web Scraping\\IMDB top 1000\\Top250_Movies_Info.csv"
top250_df = load_csv(addr)
top250_df.tail(3)

Unnamed: 0,Title,Year,Certificate,Time,Genere,plot,Rating,Voting,Stars,Directors,Writers,User reviews,Critic reviews,Metascore,URL
247,The Passion of Joan of Arc,1928,A,1h 50min,"['Biography', 'Drama', 'History']","In 1431, Jeanne d'Arc is placed on trial on ch...",8.1,50K,"['Maria Falconetti', 'Eugene Silvain', 'André ...",['Carl Theodor Dreyer'],"['Joseph Delteil', 'Carl Theodor Dreyer']",210,141,,https://www.imdb.com//title/tt0019254/
248,Sunrise: A Song of Two Humans,1927,A,1h 34min,"['Drama', 'Romance']",A sophisticated urban woman seduces a farmer i...,8.1,49K,"[""George O'Brien"", 'Janet Gaynor', 'Margaret L...",['F.W. Murnau'],"['Carl Mayer', 'Hermann Sudermann', 'Katherine...",270,122,,https://www.imdb.com//title/tt0018455/
249,The General,1926,U,1h 7min,"['Action', 'Adventure', 'Comedy']",When Union spies steal an engineer's beloved l...,8.1,85K,"['Buster Keaton', 'Marion Mack', 'Glen Cavende...","['Clyde Bruckman', 'Buster Keaton']","['Buster Keaton', 'Clyde Bruckman', 'Al Boasbe...",311,120,,https://www.imdb.com//title/tt0017925/


#### Modify Strings to Numbers

In [240]:
import re
def time2minutes(time):
    result = 0
    hours = re.search(r'^(\d+)(?:h)',time)
    if hours:
        result += int(hours.group(1)) * 60
        
    minutes = re.search(r'(\d+)(?:min)$',time)
    if minutes:
        result += int(minutes.group(1))
    
    return result

In [222]:
import re
unit2num_dict = {"B":1000000000,"M":1000000, "K":1000,"":1}

def unit2numbers(string):
    if not isinstance(string, str):
        return string
    regex = r"(\d+\.?\d*)\s?([MK]?)"
    find_ex = re.search(regex, string)
    if find_ex:
        number = find_ex.group(1)
        unit = find_ex.group(2)
        return float(number) * unit2num_dict[unit]
    else:
        return None

In [248]:
top250_df[pd.isna(top250_df['minutes'])]

Unnamed: 0,Title,Year,Certificate,Time,Genere,plot,Rating,Voting,Stars,Directors,Writers,User reviews,Critic reviews,Metascore,URL,minutes


In [247]:
top250_df['Year'] = top250_df['Year'].astype(np.int)
top250_df['minutes'] = top250_df['Time'].apply(lambda x: time2minutes(x)).astype(int)
top250_df['Rating'] = top250_df['Rating'].astype(np.float)
top250_df['Voting'] = top250_df['Voting'].apply(lambda x: unit2numbers(x)).astype(np.int)
top250_df['User reviews'] = top250_df['User reviews'].apply(lambda x: unit2numbers(x)).astype(np.int)
#top250_df['Critic reviews'] = top250_df['Critic reviews'].apply(lambda x: unit2numbers(x))
top250_df['Metascore'] = top250_df['Metascore'].astype(np.float)
top250_df.head(3)

Unnamed: 0,Title,Year,Certificate,Time,Genere,plot,Rating,Voting,Stars,Directors,Writers,User reviews,Critic reviews,Metascore,URL,minutes
0,The Shawshank Redemption,1994,15,2h 22min,['Drama'],Two imprisoned men bond over a number of years...,9.3,2400000,"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'...",['Frank Darabont'],"['Stephen King', 'Frank Darabont']",9300,184,80.0,https://www.imdb.com//title/tt0111161/,142
1,The Godfather,1972,X,2h 55min,"['Crime', 'Drama']",An organized crime dynasty's aging patriarch t...,9.2,1700000,"['Marlon Brando', 'Al Pacino', 'James Caan', '...",['Francis Ford Coppola'],"['Mario Puzo', 'Francis Ford Coppola']",4500,267,100.0,https://www.imdb.com//title/tt0068646/,175
2,The Dark Knight,2008,12A,2h 32min,"['Action', 'Crime', 'Drama']",When the menace known as the Joker wreaks havo...,9.0,2400000,"['Christian Bale', 'Heath Ledger', 'Aaron Eckh...",['Christopher Nolan'],"['Jonathan Nolan', 'Christopher Nolan', 'David...",7500,434,84.0,https://www.imdb.com//title/tt0468569/,152


#### Clean Data

In [193]:
top250_df['Year'].fillna(top250_df['Year'].mean(), inplace = True)
top250_df['minutes'].fillna(top250_df['minutes'].mean(), inplace = True)
top250_df['Rating'].fillna(top250_df['Rating'].mean(), inplace = True)
top250_df['Voting'].fillna(top250_df['Voting'].mean(), inplace = True)
top250_df['User reviews'].fillna(top250_df['User reviews'].mean(), inplace = True)
top250_df['Critic reviews'].fillna(top250_df['Critic reviews'].mean(), inplace = True)
top250_df['Metascore'].fillna(top250_df['Metascore'].mean(), inplace = True)
top250_df.tail(3)

Unnamed: 0,Title,Year,Certificate,Time,Genere,plot,Rating,Voting,Stars,Directors,Writers,User reviews,Critic reviews,Metascore,URL,minutes
247,The Passion of Joan of Arc,1928,A,1h 50min,"['Biography', 'Drama', 'History']","In 1431, Jeanne d'Arc is placed on trial on ch...",8.1,50000.0,"['Maria Falconetti', 'Eugene Silvain', 'André ...",['Carl Theodor Dreyer'],"['Joseph Delteil', 'Carl Theodor Dreyer']",210,141,82.660465,https://www.imdb.com//title/tt0019254/,110.0
248,Sunrise: A Song of Two Humans,1927,A,1h 34min,"['Drama', 'Romance']",A sophisticated urban woman seduces a farmer i...,8.1,49000.0,"[""George O'Brien"", 'Janet Gaynor', 'Margaret L...",['F.W. Murnau'],"['Carl Mayer', 'Hermann Sudermann', 'Katherine...",270,122,82.660465,https://www.imdb.com//title/tt0018455/,94.0
249,The General,1926,U,1h 7min,"['Action', 'Adventure', 'Comedy']",When Union spies steal an engineer's beloved l...,8.1,85000.0,"['Buster Keaton', 'Marion Mack', 'Glen Cavende...","['Clyde Bruckman', 'Buster Keaton']","['Buster Keaton', 'Clyde Bruckman', 'Al Boasbe...",311,120,82.660465,https://www.imdb.com//title/tt0017925/,67.0


In [183]:
top250_df.sort_values(['Rating','User reviews', 'Metascore'], ascending = False, inplace = True)
top250_df.head(10)

Unnamed: 0,Title,Year,Certificate,Time,Genere,plot,Rating,Voting,Stars,Directors,Writers,User reviews,Critic reviews,Metascore,URL,minutes
0,The Shawshank Redemption,1994,15,2h 22min,[Drama],Two imprisoned men bond over a number of years...,9.3,2400000.0,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",[Frank Darabont],"[Stephen King, Frank Darabont]",9300.0,184,80.0,https://www.imdb.com//title/tt0111161/,142.0
1,The Godfather,1972,X,2h 55min,"[Crime, Drama]",An organized crime dynasty's aging patriarch t...,9.2,1700000.0,"[Marlon Brando, Al Pacino, James Caan, Diane K...",[Francis Ford Coppola],"[Mario Puzo, Francis Ford Coppola]",4500.0,267,100.0,https://www.imdb.com//title/tt0068646/,175.0
2,The Dark Knight,2008,12A,2h 32min,"[Action, Crime, Drama]",When the menace known as the Joker wreaks havo...,9.0,2400000.0,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",[Christopher Nolan],"[Jonathan Nolan, Christopher Nolan, David S. G...",7500.0,434,84.0,https://www.imdb.com//title/tt0468569/,152.0
4,12 Angry Men,1957,U,1h 36min,"[Crime, Drama]",A jury holdout attempts to prevent a miscarria...,9.0,718000.0,"[Henry Fonda, Lee J. Cobb, Martin Balsam, John...",[Sidney Lumet],[Reginald Rose],1800.0,159,96.0,https://www.imdb.com//title/tt0050083/,96.0
3,The Godfather: Part II,1974,X,3h 22min,"[Crime, Drama]",The early life and career of Vito Corleone in ...,9.0,1200000.0,"[Al Pacino, Robert De Niro, Robert Duvall, Dia...",[Francis Ford Coppola],"[Francis Ford Coppola, Mario Puzo]",1100.0,188,90.0,https://www.imdb.com//title/tt0071562/,202.0
5,The Lord of the Rings: The Return of the King,2003,12A,3h 21min,"[Action, Adventure, Drama]",Gandalf and Aragorn lead the World of Men agai...,8.9,1700000.0,"[Elijah Wood, Viggo Mortensen, Ian McKellen, O...",[Peter Jackson],"[J.R.R. Tolkien, Fran Walsh, Philippa Boyens]",3800.0,359,94.0,https://www.imdb.com//title/tt0167260/,201.0
6,Pulp Fiction,1994,18,2h 34min,"[Crime, Drama]","The lives of two mob hitmen, a boxer, a gangst...",8.9,1900000.0,"[John Travolta, Uma Thurman, Samuel L. Jackson...",[Quentin Tarantino],"[Quentin Tarantino, Roger Avary]",3200.0,295,94.0,https://www.imdb.com//title/tt0110912/,154.0
7,Schindler's List,1993,15,3h 15min,"[Biography, Drama, History]","In German-occupied Poland during World War II,...",8.9,1300000.0,"[Liam Neeson, Ralph Fiennes, Ben Kingsley, Car...",[Steven Spielberg],"[Thomas Keneally, Steven Zaillian]",2000.0,179,94.0,https://www.imdb.com//title/tt0108052/,195.0
10,The Lord of the Rings: The Fellowship of the Ring,2001,PG,2h 58min,"[Action, Adventure, Drama]",A meek Hobbit from the Shire and eight compani...,8.8,1700000.0,"[Elijah Wood, Ian McKellen, Orlando Bloom, Sea...",[Peter Jackson],"[J.R.R. Tolkien, Fran Walsh, Philippa Boyens]",5500.0,349,92.0,https://www.imdb.com//title/tt0120737/,178.0
8,Inception,2010,12A,2h 28min,"[Action, Adventure, Sci-Fi]",A thief who steals corporate secrets through t...,8.8,2100000.0,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",[Christopher Nolan],[Christopher Nolan],4300.0,482,74.0,https://www.imdb.com//title/tt1375666/,148.0


In [None]:
f