## Step 1 - Data Collection

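In this step, we collect movie data from IMDb: the list of genres and, for each genre, the listed titles with their rating, director, vote count, gross, and the male/female rating breakdown.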

In [3]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
import time

In [4]:
baseURL = 'https://www.imdb.com'

## Get genre
# The genre links are read from the sub-navigation items of the top chart page.
url = baseURL + "/chart/top/"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find_all('li', class_='subnav_item_main')
# One record per genre: display name plus the absolute URL of its listing page.
# Items without an <a> child are skipped so a stray <li> cannot raise a TypeError.
listGenre = [{'Name':item.text.replace('\n','').strip(),'URL':baseURL+item.a['href']}
             for item in results if item.a is not None]
df_genre = pd.DataFrame(listGenre)

### Table of genres with links

In [5]:
df_genre

Unnamed: 0,Name,URL
0,Action,https://www.imdb.com/search/title?genres=actio...
1,Adventure,https://www.imdb.com/search/title?genres=adven...
2,Animation,https://www.imdb.com/search/title?genres=anima...
3,Biography,https://www.imdb.com/search/title?genres=biogr...
4,Comedy,https://www.imdb.com/search/title?genres=comed...
5,Crime,https://www.imdb.com/search/title?genres=crime...
6,Drama,https://www.imdb.com/search/title?genres=drama...
7,Family,https://www.imdb.com/search/title?genres=famil...
8,Fantasy,https://www.imdb.com/search/title?genres=fanta...
9,Film-Noir,https://www.imdb.com/search/title?genres=film_...


### Retrieving data from the IMDb website by crawling

#### Finding data by tags and retrieving them
#### For each genre we pull the first results page plus up to 4 follow-up pages (at most 5 pages in total)

In [6]:
## ---------------------------------------------
def GetDataPerGenre(genre, uri, timeout=30): #per page
    """Scrape one genre listing page and return one record per listed title.

    Parameters
    ----------
    genre : stored unchanged under the 'Genre' key of every record.
    uri : str
        URL of the listing page to download.
    timeout : number, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the crawl indefinitely.

    Returns
    -------
    list of dict
        One dict per ``div.lister-item.mode-advanced`` element, with the keys
        'Name', 'Genre', 'ID', 'Year', 'Imdb Rating', 'Director', 'Votes',
        'Gross', 'malesRating', 'malesCount', 'femalesRating', 'femalesCount'.
        Scraped values are kept as the raw page text; 'Gross' is ``np.nan``
        when the item has no second "nv" span.

    Raises
    ------
    requests.HTTPError
        If the listing page answers with an HTTP error status.
    """
    time.sleep(0.2)  # brief pause before each request (presumably to go easy on the server)
    page = requests.get(uri, timeout=timeout)
    page.raise_for_status()  # fail loudly instead of parsing an error page into an empty result
    soup = BeautifulSoup(page.content, 'html.parser')
    content = soup.find_all('div', class_='lister-item mode-advanced')
    data = []
    for el in content:
        # Look up each shared sub-tree once instead of re-querying it per field.
        header = el('h3', class_='lister-item-header')[0]
        name = header.a.text
        # Third '/'-separated segment of the title link (presumably the IMDb title id).
        i = header.a['href'].split('/')[2]
        # Drop the parentheses and any capital 'I' characters around the year text.
        year = header.find('span', class_='lister-item-year').text.replace('(','').replace(')','').replace('I','').strip()
        imdb_rating = el('div', class_='ratings-bar')[0].find('div', class_='ratings-imdb-rating').strong.text

        body = el('div', class_='lister-item-content')[0]
        director = body.find('p', class_='').find_all('a')[0].text

        # First "nv" span holds the vote count; the second, when present, holds the gross.
        nvSpans = body.find('p', class_='sort-num_votes-visible').find_all('span', {"name": "nv"})
        votes = nvSpans[0].text

        malesRating, malesCount, femalesRating, femalesCount = GetRatingDataByID(i)

        # Explicit length check replaces the former bare `except:`, which would
        # also have hidden unrelated errors (including KeyboardInterrupt).
        gross = nvSpans[1].text if len(nvSpans) > 1 else np.nan

        data.append({'Name':name,'Genre':genre,'ID':i,'Year':year,'Imdb Rating':imdb_rating,'Director':director,
                     'Votes':votes,'Gross':gross, 'malesRating':malesRating, 'malesCount':malesCount,
                     'femalesRating':femalesRating ,'femalesCount':femalesCount})
    return data
## ---------------------------------------------
def GetRatingDataByID(id, timeout=30):
    """Fetch the male/female rating breakdown from a title's ratings page.

    Parameters
    ----------
    id : str
        Value inserted into the ``/title/<id>/ratings/`` URL.
    timeout : number, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the crawl indefinitely.

    Returns
    -------
    tuple
        ``(malesRating, malesCount, femalesRating, femalesCount)`` as page
        text (counts are whitespace-stripped). If the page has no table, the
        chosen table has fewer than four rows, or one of the expected cells is
        missing, all four values are ``np.nan`` so a single incomplete title
        does not abort the whole crawl.

    Raises
    ------
    requests.HTTPError
        If the ratings page answers with an HTTP error status.
    """
    time.sleep(0.5)  # brief pause before each request (presumably to go easy on the server)
    uri = "https://www.imdb.com/title/{0}/ratings/?ref_=tt_ov_rt".format(id)
    page = requests.get(uri, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    missing = (np.nan, np.nan, np.nan, np.nan)

    tables = soup.find_all('table')  # queried once instead of up to three times
    if not tables:
        return missing
    # With more than two tables on the page the breakdown is taken from the
    # second one, otherwise from the first.
    content = tables[1] if len(tables) > 2 else tables[0]

    rows = content.find_all('tr')
    if len(rows) < 4:
        return missing

    # Third row is read as the males row, fourth as the females row; each
    # contributes a 'bigcell' (rating) and a 'smallcell' (count).
    cells = [row.find('div', class_=cssClass)
             for row in (rows[2], rows[3])
             for cssClass in ('bigcell', 'smallcell')]
    if any(cell is None for cell in cells):
        return missing

    malesRating, malesCount, femalesRating, femalesCount = [cell.text for cell in cells]
    return (malesRating, malesCount.strip(), femalesRating, femalesCount.strip())
## ---------------------------------------------
def GetNextPage(uri, timeout=30):
    """Return the absolute URL of the page that follows ``uri``.

    Downloads ``uri`` and looks for the first ``<a class="next-page">`` link.

    Parameters
    ----------
    uri : str
        URL of the current listing page.
    timeout : number, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the crawl indefinitely.

    Returns
    -------
    str
        ``"https://www.imdb.com"`` followed by the link's ``href``, or the
        empty string when the page has no such link (end of paging).

    Raises
    ------
    requests.HTTPError
        If the page answers with an HTTP error status; previously an error
        page was indistinguishable from the last page.
    """
    time.sleep(0.2)  # brief pause before each request (presumably to go easy on the server)
    page = requests.get(uri, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    href = soup.find('a', class_='next-page')
    ## end of paging
    if href is None:
        return ""

    return "https://www.imdb.com{0}".format(href['href'])
## ---------------------------------------------
def Prepare(genre, uri, maxExtraPages=4):
    """Collect the records of a genre's first page and its follow-up pages.

    Parameters
    ----------
    genre : forwarded to ``GetDataPerGenre`` for every page.
    uri : str
        URL of the genre's first listing page.
    maxExtraPages : int, optional
        Maximum number of pages to follow after the first one. The default of
        4 matches the previously hard-coded limit (up to 5 pages in total).

    Returns
    -------
    list
        Concatenation of the ``GetDataPerGenre`` results of all visited pages,
        in page order.
    """
    temp = []
    temp.extend(GetDataPerGenre(genre, uri)) # first page
    nextUri = uri
    # Bounding the loop up front avoids the extra GetNextPage download that
    # used to happen after the page limit had already been reached.
    for _ in range(maxExtraPages):
        nextUri = GetNextPage(nextUri)
        if nextUri == "":  # end of paging
            break
        temp.extend(GetDataPerGenre(genre, nextUri))

    return temp
## ---------------------------------------------

In [None]:
# Crawl every genre in turn and pool the per-title records into one flat list.
# The list is grown in place so that whatever was gathered so far stays
# available in `data` if the crawl is interrupted part-way through.
data = []
for genreEntry in listGenre:
    data += Prepare(genreEntry['Name'], genreEntry['URL'])

print (len(data))  # total number of scraped records
df = pd.DataFrame(data)
df

### Save data to a CSV file

In [34]:
# Write the scraped table to disk (path is relative to the current working
# directory). With pandas' default settings the row index is written as well,
# as the first, unnamed column.
csvPath = "IMDb_rating_data.csv"
df.to_csv(csvPath)