## Step 1 - Data Collection

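In this step, we scrape IMDb: first the list of genres, then, for each genre, the first pages of its title listing. For every title we collect its name, ID, year, IMDb rating, director, vote count, gross, and the ratings and vote counts broken down by male and female voters.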

In [30]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
import time

In [31]:
baseURL = 'https://www.imdb.com'

# Download the chart page and collect the <li class="subnav_item_main">
# elements; each one carries a genre label and a relative link.
url = "https://www.imdb.com/chart/top/"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find_all('li', class_='subnav_item_main')

# One record per genre: the label with newlines removed and trimmed, plus the
# link made absolute by prefixing baseURL.
listGenre = [
    {
        'Name': item.text.replace('\n', '').strip(),
        'URL': '{0}{1}'.format(baseURL, item.a['href']),
    }
    for item in results
]
df_genre = pd.DataFrame(listGenre)
print("DF genres and url:")
df_genre

DF genres and url:


Unnamed: 0,Name,URL
0,Action,https://www.imdb.com/search/title?genres=actio...
1,Adventure,https://www.imdb.com/search/title?genres=adven...
2,Animation,https://www.imdb.com/search/title?genres=anima...
3,Biography,https://www.imdb.com/search/title?genres=biogr...
4,Comedy,https://www.imdb.com/search/title?genres=comed...
5,Crime,https://www.imdb.com/search/title?genres=crime...
6,Drama,https://www.imdb.com/search/title?genres=drama...
7,Family,https://www.imdb.com/search/title?genres=famil...
8,Fantasy,https://www.imdb.com/search/title?genres=fanta...
9,Film-Noir,https://www.imdb.com/search/title?genres=film_...


In [33]:
## ---------------------------------------------
def GetDataPerGenre(genre, uri):
    """Scrape one listing page and return one record per title found on it.

    Parameters
    ----------
    genre : value stored unchanged under the 'Genre' key of every record.
    uri   : URL of the listing page to download.

    Returns
    -------
    list of dict with the keys 'Name', 'Genre', 'ID', 'Year', 'Imdb Rating',
    'Director', 'Votes', 'Gross', 'malesRating', 'malesCount',
    'femalesRating' and 'femalesCount'. Scraped values are kept as the raw
    text found in the page; 'Gross' is np.nan when the page shows no gross
    figure for a title.

    Notes
    -----
    Sleeps 0.2 s before the request and calls GetRatingDataByID once per
    title, which performs a further request (and sleep) each time.
    A title whose markup lacks any of the other expected elements raises
    (IndexError / AttributeError) rather than being skipped.
    """
    time.sleep(0.2)
    page = requests.get(uri)
    soup = BeautifulSoup(page.content, 'html.parser')
    content = soup.find_all('div', class_='lister-item mode-advanced')
    data = []
    for el in content:
        # Look up the shared containers once instead of once per field.
        header = el('h3', class_='lister-item-header')[0]
        body = el('div', class_='lister-item-content')[0]

        title_link = header.a
        name = title_link.text
        # The ID is the third '/'-separated piece of the title link's href.
        i = title_link['href'].split('/')[2]
        # Strip the parentheses and any capital 'I' characters from the year text.
        year = (header.find('span', class_='lister-item-year').text
                .replace('(', '').replace(')', '').replace('I', '').strip())
        imdb_rating = (el('div', class_='ratings-bar')[0]
                       .find('div', class_='ratings-imdb-rating').strong.text)
        director = body.find('p', class_='').find_all('a')[0].text

        # The name="nv" spans hold the vote count first and, when present,
        # the gross second.
        nv_spans = (body.find('p', class_='sort-num_votes-visible')
                    .find_all('span', {"name": "nv"}))
        votes = nv_spans[0].text
        malesRating, malesCount, femalesRating, femalesCount = GetRatingDataByID(i)
        # Explicit length check instead of a bare `except:` so that unrelated
        # errors (and Ctrl-C) are no longer silently turned into NaN.
        gross = nv_spans[1].text if len(nv_spans) > 1 else np.nan

        data.append({'Name': name, 'Genre': genre, 'ID': i, 'Year': year,
                     'Imdb Rating': imdb_rating, 'Director': director,
                     'Votes': votes, 'Gross': gross,
                     'malesRating': malesRating, 'malesCount': malesCount,
                     'femalesRating': femalesRating, 'femalesCount': femalesCount})
    return data
## ---------------------------------------------
def GetRatingDataByID(id):
    """Download a title's ratings page and return its male/female figures.

    Sleeps 0.5 s, requests the ratings page for the given title ID and picks
    one table from it: the second table when the page has more than two
    tables, otherwise the first. The rows at index 2 and 3 of that table are
    read as the male and female rows respectively (row meaning follows the
    naming used here; the page layout itself is not visible from this code).

    Returns a tuple (malesRating, malesCount, femalesRating, femalesCount) of
    strings: ratings are the raw 'bigcell' text, counts are the 'smallcell'
    text with surrounding whitespace stripped.
    """
    time.sleep(0.5)
    ratings_url = "https://www.imdb.com/title/{0}/ratings/?ref_=tt_ov_rt".format(id)
    response = requests.get(ratings_url)
    parsed = BeautifulSoup(response.content, 'html.parser')

    tables = parsed.find_all('table')
    table_index = 1 if len(tables) > 2 else 0
    rows = tables[table_index].find_all('tr')

    def read_cells(row):
        # Rating text is taken verbatim; the count is whitespace-trimmed.
        rating = row.find('div', class_='bigcell').text
        count = row.find('div', class_='smallcell').text.strip()
        return rating, count

    malesRating, malesCount = read_cells(rows[2])
    femalesRating, femalesCount = read_cells(rows[3])
    return (malesRating, malesCount, femalesRating, femalesCount)
## ---------------------------------------------
def GetNextPage(uri):
    """Return the absolute URL of the page that follows `uri`, or "" if none.

    Sleeps 0.2 s, downloads `uri` and looks for the first <a> element with
    class 'next-page'. When no such link exists (end of paging) an empty
    string is returned; otherwise the link's href is appended to
    "https://www.imdb.com".
    """
    time.sleep(0.2)
    page = requests.get(uri)
    soup = BeautifulSoup(page.content, 'html.parser')
    next_link = soup.find('a', class_='next-page')
    # Identity test: `find` returns None when nothing matches.
    if next_link is None:
        return ""

    return "https://www.imdb.com{0}".format(next_link['href'])
## ---------------------------------------------
def Init(genre, uri, max_pages=5):
    """Collect the records of a genre across consecutive listing pages.

    Parameters
    ----------
    genre     : passed through to GetDataPerGenre for every page.
    uri       : URL of the first listing page.
    max_pages : upper bound on the number of pages scraped, counting the
                first one. The default of 5 matches the previous hard-coded
                limit (first page plus four follow-ups). Pass None to follow
                the paging links until they run out.

    Returns
    -------
    list with the records of all scraped pages, in page order.

    The first page is always scraped. The page limit is checked *before*
    asking for the next link, so no request is spent on a page that would
    be discarded anyway.
    """
    rows = []
    rows.extend(GetDataPerGenre(genre, uri))  # first page
    nextUri = uri
    pages_done = 1
    while max_pages is None or pages_done < max_pages:
        nextUri = GetNextPage(nextUri)
        if nextUri == "":
            # end of paging
            break
        rows.extend(GetDataPerGenre(genre, nextUri))
        pages_done += 1

    return rows
## ---------------------------------------------
# Scrape every genre in turn and pool all records into one flat list.
data = []
for item in listGenre:
    data += Init(item['Name'], item['URL'])

# Report how many records were gathered, then tabulate them.
print(len(data))
df = pd.DataFrame(data)
df

genre: Action
uri https://www.imdb.com/search/title?genres=action&sort=user_rating,desc&title_type=feature&num_votes=25000,
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=action&sort=user_rating,desc&start=51
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=action&sort=user_rating,desc&start=101
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=action&sort=user_rating,desc&start=151
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=action&sort=user_rating,desc&start=201
genre: Adventure
uri https://www.imdb.com/search/title?genres=adventure&sort=user_rating,desc&title_type=feature&num_votes=25000,
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=adventure&sort=user_rating,desc&start=51
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=adventure&sort=user_rating,desc&start=101
uri https://www.

uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=mystery&sort=user_rating,desc&start=151
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=mystery&sort=user_rating,desc&start=201
genre: Romance
uri https://www.imdb.com/search/title?genres=romance&sort=user_rating,desc&title_type=feature&num_votes=25000,
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=romance&sort=user_rating,desc&start=51
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=romance&sort=user_rating,desc&start=101
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=romance&sort=user_rating,desc&start=151
uri https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=romance&sort=user_rating,desc&start=201
genre: Sci-Fi
uri https://www.imdb.com/search/title?genres=sci_fi&sort=user_rating,desc&title_type=feature&num_votes=25000,
uri https://www.imd

Unnamed: 0,Name,Genre,ID,Year,Imdb Rating,Director,Votes,Gross,malesRating,malesCount,femalesRating,femalesCount
0,The Dark Knight,Action,tt0468569,2008,9.0,Christopher Nolan,2573361,$534.86M,9.0,1480015,8.7,265210
1,The Lord of the Rings: The Return of the King,Action,tt0167260,2003,9.0,Peter Jackson,1786873,$377.85M,9.0,1030137,8.9,218645
2,Inception,Action,tt1375666,2010,8.8,Christopher Nolan,2283048,$292.58M,8.8,1263368,8.7,294017
3,The Lord of the Rings: The Two Towers,Action,tt0167261,2002,8.8,Peter Jackson,1613775,$342.55M,8.8,925097,8.7,202157
4,The Lord of the Rings: The Fellowship of the Ring,Action,tt0120737,2001,8.8,Peter Jackson,1807955,$315.54M,8.8,1025720,8.8,229858
...,...,...,...,...,...,...,...,...,...,...,...,...
4686,Bandidas,Western,tt0416496,2006,5.7,Joachim Rønning,35548,,5.6,24259,5.9,5058
4687,The Dark Tower,Western,tt1648190,2017,5.6,Nikolaj Arcel,137688,$50.70M,5.5,85904,5.9,15506
4688,Wild Wild West,Western,tt0120891,1999,4.9,Barry Sonnenfeld,160516,$113.81M,4.9,109780,5.1,16453
4689,The Ridiculous 6,Western,tt2479478,2015,4.8,Frank Coraci,48957,,4.8,31476,5.1,4110


In [34]:
# Persist the scraped table as CSV in the working directory. `index` is left
# at its default, so the row index is written as the first column.
df.to_csv(path_or_buf="IMDb_rating_data.csv")