In [94]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import time
import random
from fake_useragent import UserAgent
from dateutil.parser import parse

SHOWS SCRAPER

In [95]:
def avg_runtime_f(soup): #counting average runtime
    """Return the average episode runtime of a show, in minutes.

    Reads the text of the element that follows the 'Runtime' label and
    interprets it as up to two "<number> <unit>" pairs, e.g. "45 minutes",
    "2 hours" or "1 hour 30 minutes". A unit starting with "h" is counted as
    hours, a unit starting with "s" (seconds) is ignored, anything else
    (including a missing unit) is counted as minutes.

    Args:
        soup: parsed show page; any object exposing a BeautifulSoup-style
            ``find(text=...)``.

    Returns:
        int: runtime in minutes. 0 when the label is missing or the text
        cannot be parsed. 42 when the parsed value exceeds 180 minutes
        (the original author's placeholder for implausibly long runtimes).
    """
    try:
        # Only the first two number/unit pairs are considered, as before.
        tokens = soup.find(text='Runtime').next.text.split()[:4]
        avg_runtime = 0
        for pos in range(0, len(tokens), 2):
            amount = int(tokens[pos])
            unit = tokens[pos + 1].lower() if pos + 1 < len(tokens) else ''
            if unit.startswith('h'):
                # "2 hours" used to be read as 2 minutes; only "1 hour" was handled.
                avg_runtime += amount * 60
            elif not unit.startswith('s'):
                avg_runtime += amount
    except Exception: #if there is no runtime (or it is not parseable)
        return 0
    if avg_runtime > 180:
        avg_runtime = 42 #shows "longer" than 3 hours get the typical show length instead
    return avg_runtime

def origin_f(soup): #finding origin
    """Return the (first) country of origin shown on a show page.

    Looks for the 'Country of origin' label first and falls back to the
    plural 'Countries of origin' label, taking the text of the first link
    inside the element that follows the label.

    Args:
        soup: parsed show page; any object exposing a BeautifulSoup-style
            ``find(text=...)``.

    Returns:
        str: text of the first country link.

    Raises:
        AttributeError: typically, when neither label is present. The
            failure is deliberately not swallowed here; ``scrape_shows``
            skips the show in that case.
    """
    try:
        return soup.find(text='Country of origin').next.find('a').text
    except Exception: #if there is more than one country, select the first
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        return soup.find(text='Countries of origin').next.find('a').text
    
def creators_f(soup): #finding creators
    """Return the names of the show's creators.

    Tries the plural 'Creators' label first (all links in the element that
    follows it), then the singular 'Creator' label.

    Args:
        soup: parsed show page; any object exposing a BeautifulSoup-style
            ``find(text=...)``.

    Returns:
        list[str]: creator names; an empty list when no creator is found.
    """
    try:
        return [link.text for link in soup.find(text='Creators').next.find_all('a')]
    except Exception: #no 'Creators' label: maybe there is only one creator
        pass
    try:
        return [soup.find(text='Creator').next.next.next.next.text]
    except Exception: #if there are no creators
        return []

def count_awards(people, timeout=10): #counting awards for person
    """Fetch each person's IMDb awards page and return one count per person.

    Args:
        people: iterable of site-relative paths (each is appended to
            'https://www.imdb.com'), e.g. the '/name/.../awards' paths built
            by ``creators_a_f`` / ``stars_a_f``.
        timeout: seconds to wait for each HTTP response. Without a timeout
            a stalled connection would block the whole scrape forever.

    Returns:
        list[int]: for each person, the third space-separated token of the
        first element with class 'desc', converted to int; 0 when that
        element is missing or the token is not a number.
        NOTE(review): presumably that text is a summary such as
        "Showing all N wins ..." - confirm against the live page.

    Raises:
        requests.RequestException: network failures and timeouts are not
            swallowed here; the callers treat them as "no awards data".
    """
    awards = []
    for person_path in people:
        response_c = requests.get('https://www.imdb.com' + person_path, timeout=timeout)
        soup_c = BeautifulSoup(response_c.text, "lxml")
        try:
            awards.append(int(soup_c.find(class_='desc').text.split(" ")[2]))
        except Exception: #no summary on the page, or not in the expected shape
            awards.append(0)
    return awards

def creators_a_f(soup): #counting awards for creators
    """Return the award counts of the show's creators.

    Collects the profile link of every creator (plural 'Creators' label
    first, then the singular 'Creator' label), turns each into its awards
    page path by dropping the query string and appending 'awards', and
    passes the paths to ``count_awards``.

    Args:
        soup: parsed show page; any object exposing a BeautifulSoup-style
            ``find(text=...)``.

    Returns:
        list[int]: one count per creator, or an empty list when no creator
        link is found or the awards lookup fails.
    """
    try:
        try:
            award_paths = [link['href'].split('?')[0]+'awards' for link in soup.find(text='Creators').next.find_all('a')]
        except Exception: #if there is only one creator
            award_paths = [soup.find(text='Creator').next.next.find('a')['href'].split('?')[0]+'awards']
        creators_awards = count_awards(award_paths)
    except Exception: #no creators, or the awards pages could not be fetched
        # `except Exception` (not a bare except) so a kernel interrupt is not swallowed.
        creators_awards = []
    return creators_awards

def stars_f(soup): #finding stars
    """Return the names of the show's stars.

    Tries the plural 'Stars' label first (all links in the element that
    follows it), then the singular 'Star' label.

    Args:
        soup: parsed show page; any object exposing a BeautifulSoup-style
            ``find(text=...)``.

    Returns:
        list[str]: star names; an empty list when none is found.
    """
    try:
        return [link.text for link in soup.find(text='Stars').next.find_all('a')]
    except Exception: #no 'Stars' label: maybe there is only one star
        pass
    try:
        # did not find such a case, but just to be safe
        return [soup.find(text='Star').next.next.next.next.text]
    except Exception: #if there are no stars
        return []

def stars_a_f(soup): #counting awards for stars
    """Return the award counts of the show's stars.

    Collects the profile link of every star (plural 'Stars' label first,
    then the singular 'Star' label), turns each into its awards page path
    by dropping the query string and appending 'awards', and passes the
    paths to ``count_awards``.

    Args:
        soup: parsed show page; any object exposing a BeautifulSoup-style
            ``find(text=...)``.

    Returns:
        list[int]: one count per star, or an empty list when no star link
        is found or the awards lookup fails.
    """
    try:
        try:
            award_paths = [link['href'].split('?')[0]+'awards' for link in soup.find(text='Stars').next.find_all('a')]
        except Exception: #if there is only one star
            award_paths = [soup.find(text='Star').next.next.find('a')['href'].split('?')[0]+'awards']
        stars_awards = count_awards(award_paths)
    except Exception: #no stars, or the awards pages could not be fetched
        # `except Exception` (not a bare except) so a kernel interrupt is not swallowed.
        stars_awards = []
    return stars_awards

def company_f(soup): #finding production companies
    """Return the names of the show's production companies.

    Tries the plural 'Production companies' label first (all links in the
    element that follows it), then the singular 'Production company' label
    (text of the element that follows it).

    Args:
        soup: parsed show page; any object exposing a BeautifulSoup-style
            ``find(text=...)``.

    Returns:
        list[str]: company names; an empty list when none is found.
    """
    try:
        return [link.text for link in soup.find(text='Production companies').next.find_all('a')]
    except Exception: #no plural label: maybe there is only one production company
        pass
    try:
        return [soup.find(text='Production company').next.text]
    except Exception: #if there are no production companies
        return []

def _fetch_soup(url, headers, timeout):
    """Download ``url`` with the given headers and return it parsed with lxml."""
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")


def _parse_show(show_url, soup, soup_ep):
    """Build the record of one show from its main page and its season-1 episodes page."""
    # NOTE(review): the "sc-..." class names look auto-generated and will
    # presumably change when IMDb redeploys its front end - confirm before re-running.
    return {
        "link": show_url,
        "imdb_rating": float(soup.find(class_="sc-7ab21ed2-1 jGRxWM").text),
        "title": soup.find(class_="sc-94726ce4-0 cMYixt").next.next.text,
        "s01_episodes": len(soup_ep.find_all('div', class_='info')),
        "avg_runtime": avg_runtime_f(soup),
        "genres": [i.text for i in soup.find_all('li', class_="ipc-inline-list__item ipc-chip__text")],
        # keep only the first three words of the release-date link text before parsing
        "rel_date": parse(' '.join(soup.find(text='Release date').next.find('a').next.text.split()[0:3])),
        "certification": soup.find(class_="sc-8c396aa2-2 itZqyK").next.next.next.next.text,
        "origin": origin_f(soup),
        "company": company_f(soup),
        "creators": creators_f(soup),
        "creators_a": creators_a_f(soup),
        "stars": stars_f(soup),
        "stars_a": stars_a_f(soup)
    }


def scrape_shows(show_url_list, checkpoint_path="./shows.pkl", timeout=30):
    """Scrape every show page in ``show_url_list`` into a list of records.

    For each URL the function waits a random 0.5-2.5 s, downloads the show
    page and its '/episodes?season=1' page with a random User-Agent, builds
    one record and re-pickles all records collected so far as a DataFrame.
    A show whose download or parsing fails is reported with
    'Scraping error: <url>' and skipped.

    Args:
        show_url_list: iterable of show page URLs (no trailing slash).
        checkpoint_path: where the DataFrame checkpoint is pickled after
            every successfully scraped show.
        timeout: seconds to wait for each of the two page downloads.

    Returns:
        list[dict]: one record per successfully scraped show, with keys
        link, imdb_rating, title, s01_episodes, avg_runtime, genres,
        rel_date, certification, origin, company, creators, creators_a,
        stars, stars_a.
    """
    pipeline_list = []
    ua = None

    for show_url in show_url_list:

        time.sleep(.5+2*random.random()) #random intervals before every move

        try:
            if ua is None:
                ua = UserAgent() #built once (on first use) instead of once per show
            user_agent = {'User-agent': ua.random} #random agent for safety

            soup = _fetch_soup(show_url, user_agent, timeout) #page of the show
            # the episodes page now gets the same headers as the show page
            soup_ep = _fetch_soup(show_url + '/episodes?season=1', user_agent, timeout)

            pipeline_list.append(_parse_show(show_url, soup, soup_ep))

            pd.DataFrame(pipeline_list).to_pickle(checkpoint_path) #saving dictionary for safety as df

        except Exception:
            # Not a bare except: KeyboardInterrupt must be able to stop a long scrape.
            print('Scraping error: '+show_url)
    return pipeline_list

LIST OF THE SHOWS SCRAPER

In [96]:
# TV series released between 2019-01-01 and 2021-12-31 with at least 1,000 ratings,
# sorted by release date ascending, 250 results per search page.
imdb_search_base_url = 'https://www.imdb.com/search/title/?title_type=tv_series&release_date=2019-01-01,2021-12-31&num_votes=1000,&view=simple&sort=release_date,asc&count=250'

# First page has no offset; the following pages start at result 251, 501, ..., 1251.
imdb_search_urls = [imdb_search_base_url] + [
    imdb_search_base_url + '&start=' + str(start) + '&ref_=adv_nxt'
    for start in range(251, 1252, 250)
]

show_url_list=[]

for imdb_search_url in imdb_search_urls:

    time.sleep(.5+2*random.random()) #random intervals

    response = requests.get(imdb_search_url, timeout=30)
    response.raise_for_status() #fail loudly instead of silently collecting nothing from a bad page
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    # The first link next to each result index is the show link, e.g. '/title/tt7670568/'.
    hrefs = [i.parent.find('a')['href'] for i in soup.find_all(class_="lister-item-index unbold text-primary")]
    # Keep title links only (a stray link previously produced the bare 'https://www.imdb.com'
    # entry that failed to scrape) and drop the trailing slash without chopping other characters.
    urls = ['https://www.imdb.com' + href.rstrip('/') for href in hrefs if href.startswith('/title/')]

    show_url_list += urls

len(show_url_list)

1385

SCRAPING ALL SHOWS

In [97]:
# Run the full scrape. Slow: every show needs at least two page downloads plus a random pause.
# URLs that could not be scraped are printed as 'Scraping error: <url>'.
shows = scrape_shows(show_url_list)

Scraping error: https://www.imdb.com/title/tt9426194
Scraping error: https://www.imdb.com/title/tt13176816
Scraping error: https://www.imdb.com/title/tt9654080
Scraping error: https://www.imdb.com/title/tt9908860
Scraping error: https://www.imdb.com/title/tt9817218
Scraping error: https://www.imdb.com/title/tt10243628
Scraping error: https://www.imdb.com/title/tt8995604
Scraping error: https://www.imdb.com/title/tt8661868
Scraping error: https://www.imdb.com/title/tt10098248
Scraping error: https://www.imdb.com/title/tt8762206
Scraping error: https://www.imdb.com/title/tt10327294
Scraping error: https://www.imdb.com/title/tt10681780
Scraping error: https://www.imdb.com/title/tt11007186
Scraping error: https://www.imdb.com/title/tt11041046
Scraping error: https://www.imdb.com/title/tt9104072
Scraping error: https://www.imdb.com/title/tt10122474
Scraping error: https://www.imdb.com/title/tt11285856
Scraping error: https://www.imdb.com/title/tt10726424
Scraping error: https://www.imdb.com

In [98]:
len(shows) #not re-scraping just to recover the ~30 shows that failed (1385 URLs collected vs 1355 scraped), to avoid putting extra load on IMDb

1355

In [99]:
# Reload the DataFrame checkpoint that scrape_shows pickles to ./shows.pkl after every scraped show.
# NOTE: only unpickle files you created yourself - loading a pickle can execute arbitrary code.
shows_final = pd.read_pickle("./shows.pkl")

In [100]:
# Preview the first rows of the scraped dataset.
shows_final.head()

Unnamed: 0,link,imdb_rating,title,s01_episodes,avg_runtime,genres,rel_date,certification,origin,company,creators,creators_a,stars,stars_a
0,https://www.imdb.com/title/tt7670568,5.8,The Masked Singer,10,60,"[Game-Show, Music, Reality-TV]",2019-01-02,TV-PG,United States,"[Smart Dog Media, Fox Alternative Entertainmen...",[],[],"[Jenny McCarthy-Wahlberg, Ken Jeong, Nicole Sc...","[3, 5, 1]"
1,https://www.imdb.com/title/tt8001250,6.2,Siempre Bruja,11,40,"[Drama, Fantasy]",2019-01-01,TV-14,Colombia,[Caracol],[],[],"[Sofía Araujo Mejía, Angely Gaviria, Sofía Ara...","[0, 0, 0]"
2,https://www.imdb.com/title/tt8115560,6.6,Tidying Up with Marie Kondo,8,40,[Reality-TV],2019-01-01,TV-PG,United States,"[Netflix, The Jackal Group]",[Marie Kondo],[1],"[Marie Kondo, Charlotte Hervieux, Marie Iida]","[1, 0, 0]"
3,https://www.imdb.com/title/tt8324422,8.1,PEN15,10,30,[Comedy],2019-02-08,TV-MA,United States,"[Awesomeness TV, Odenkirk Provissiero Entertai...","[Anna Konkle, Maya Erskine, Sam Zvibleman]","[1, 1, 4]","[Maya Erskine, Anna Konkle, Melora Walters]","[1, 1, 5]"
4,https://www.imdb.com/title/tt8888322,6.6,Bloom,6,60,"[Drama, Mystery, Sci-Fi]",2019-01-01,,Australia,[Playmaker Media],[Glen Dolman],[5],"[Bryan Brown, Phoebe Tonkin, Genevieve Morris]","[11, 0, 1]"


In [101]:
# Column dtypes and non-null counts of the scraped dataset.
shows_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1355 entries, 0 to 1354
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   link           1355 non-null   object        
 1   imdb_rating    1355 non-null   float64       
 2   title          1355 non-null   object        
 3   s01_episodes   1355 non-null   int64         
 4   avg_runtime    1355 non-null   int64         
 5   genres         1355 non-null   object        
 6   rel_date       1355 non-null   datetime64[ns]
 7   certification  1355 non-null   object        
 8   origin         1355 non-null   object        
 9   company        1355 non-null   object        
 10  creators       1355 non-null   object        
 11  creators_a     1355 non-null   object        
 12  stars          1355 non-null   object        
 13  stars_a        1355 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(10)
memory usage: 148