# Data Collection

Run the cell below to move up to the parent directory (the project root).

In [1]:
%cd ..

C:\Users\ganiy\OneDrive\Documents\IMDB-TV


Uncomment the lines below and run the cell to create the directories if this is your first time running the notebook. If it reports that the directories already exist, proceed.

In [2]:
#%mkdir tv-series-data/
#%mkdir tv-series-data-named/
#%mkdir cumulative-data/

In [3]:
import pandas as pd
import numpy as np
import os
%matplotlib inline
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import re
from requests import get

### IMDb Top 250 TV shows

The goal of this section is to obtain the top 250 highest rated TV shows on IMDb

#### TV Series List

In [4]:
# URL of IMDb's chart of the 250 highest-rated TV shows
url = 'https://www.imdb.com/chart/toptv'
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting an error
# page be parsed into an empty list of shows further down.
response = get(url, timeout=30)
response.raise_for_status()

In [5]:
tv_soup = BeautifulSoup(response.text, 'html.parser')

In [6]:
# collecting info about all the tv shows from the website
containers = tv_soup.find_all('td', class_='titleColumn')

In [7]:
containers_rating = tv_soup.find_all('td', class_ = "ratingColumn imdbRating")

In [8]:
# For every rating cell, keep the first three characters of the <strong>
# tag's title attribute; they are converted to float in a later cell.
rating_list = [cell.strong["title"][:3] for cell in containers_rating]

In [9]:
# Collect the encoded title (IMDb id) of every show: it is the third
# "/"-separated piece of the href on each title cell's link.
tv_list = [cell.a['href'].split("/")[2] for cell in containers]

In [10]:
# Build one row per show with its title, total votes, description, release
# year, link, encoded title (IMDb id) and a comma-separated genre string.
# NOTE(review): the CSS class names below look auto-generated and will likely
# stop matching when IMDb changes its markup; `.find(...)` then returns None
# and the attribute access on it raises AttributeError.
def _release_year(page_title):
    """Return the first plausible release year in a page <title>, as a string.

    A four-digit run inside parentheses is preferred, so that a show whose own
    name contains four digits is not mistaken for its year (this assumes the
    year is shown in parentheses after the name - TODO confirm against a live
    page). Without such a match, the first four-digit run anywhere is used,
    which is what the original code did.

    Raises:
        ValueError: if the title contains no four-digit run at all.
    """
    in_parentheses = re.search(r'\([^)]*?([0-9]{4})', page_title)
    if in_parentheses:
        return in_parentheses.group(1)
    anywhere = re.search(r'[0-9]{4}', page_title)
    if anywhere is None:
        raise ValueError("no release year found in page title: " + repr(page_title))
    return anywhere.group(0)


comprehensive_list = []
for tv in tv_list:
    link = 'https://www.imdb.com/title/' + tv
    # Separate names keep the chart page's `response` / `tv_soup` intact, so
    # the cells that read them can still be re-run after this loop.
    show_response = get(link + "/")
    show_soup = BeautifulSoup(show_response.text, 'html.parser')
    year = _release_year(show_soup.find('title').string)
    rating_count = show_soup.find("div", class_ ="sc-7ab21ed2-3 dPVcnq").string
    tv_title = show_soup.find("h1", {"data-testid": "hero-title-block__title"}).string
    description = show_soup.find("span", {"data-testid": "plot-xl"}).string
    encoded_title = tv
    genre_links = show_soup.find_all('a', class_ = "GenresAndPlot__GenreChip-sc-cum89p-3 LKJMs ipc-chip ipc-chip--on-baseAlt")
    # Empty string when no genre chip is found.
    genre = ", ".join(g.text for g in genre_links).strip(", ")
    comprehensive_list.append([tv_title, rating_count, description, year, link, encoded_title, genre])

In [11]:
### converting the comprehensive list into a data frame
tv_best = pd.DataFrame(comprehensive_list, columns = ["title","total_votes", "show_desc", "year", "link", "encoded_title", "genre"])

In [12]:
tv_best["rating"] = rating_list

In [13]:
tv_best["rating"] = tv_best["rating"].astype(float)

In [14]:
tv_best = tv_best[["title","rating","total_votes", "show_desc", "year", "link", "genre","encoded_title"]]

In [15]:
# Expand the magnitude suffixes in total_votes: "K" becomes "000" and
# "M" becomes "000000" (decimal points are dealt with in later cells).
tv_best["total_votes"] = (
    tv_best["total_votes"]
    .str.replace("K", "000")
    .str.replace("M", "000000")
)

In [16]:
# reseting index of data frame
tv_best = tv_best.reset_index(drop = True)

In [17]:
# A value such as "1.7M" reads "1.7000000" after the suffix expansion, which is
# one zero too many once the decimal point is removed. Drop the last character
# of every value that contains a "." (vectorised, so it does not depend on the
# frame having a 0..n-1 index; missing values are left alone).
has_decimal = tv_best["total_votes"].str.contains(".", regex=False, na=False)
tv_best.loc[has_decimal, "total_votes"] = tv_best.loc[has_decimal, "total_votes"].str[:-1]

In [18]:
# Remove the decimal point so total votes can be converted into an integer.
# regex=False makes "." a literal character on every pandas version (as a
# regular expression it would match any character).
tv_best["total_votes"] = tv_best["total_votes"].str.replace(".", "", regex=False)

  


In [19]:
# Set the final numeric dtypes: rating becomes the smallest float type that
# holds its values, while total_votes and year become integers.
tv_best["rating"] = pd.to_numeric(tv_best["rating"], downcast="float")
tv_best["total_votes"] = tv_best["total_votes"].astype(int)
tv_best["year"] = tv_best["year"].astype(int)

In [20]:
tv_best

Unnamed: 0,title,rating,total_votes,show_desc,year,link,genre,encoded_title
0,Planet Earth II,9.4,119000,David Attenborough returns with a new wildlife...,2016,https://www.imdb.com/title/tt5491994,,tt5491994
1,Breaking Bad,9.4,1700000,A high school chemistry teacher diagnosed with...,2008,https://www.imdb.com/title/tt0903747,,tt0903747
2,Planet Earth,9.4,186000,"Emmy Award-winning, 11 episodes, five years in...",2006,https://www.imdb.com/title/tt0795176,,tt0795176
3,Band of Brothers,9.4,428000,The story of Easy Company of the U.S. Army 101...,2001,https://www.imdb.com/title/tt0185906,,tt0185906
4,Chernobyl,9.3,675000,"In April 1986, an explosion at the Chernobyl n...",2019,https://www.imdb.com/title/tt7366338,,tt7366338
...,...,...,...,...,...,...,...,...
245,"Love, Death & Robots",8.4,134000,A collection of animated short stories that sp...,2019,https://www.imdb.com/title/tt9561862,,tt9561862
246,Foyle's War,8.4,15000,"As WWII rages, DCS Foyle fights his own war on...",2002,https://www.imdb.com/title/tt0310455,,tt0310455
247,Jesus of Nazareth,8.4,22000,Beginning before the Nativity and extending th...,1977,https://www.imdb.com/title/tt0075520,,tt0075520
248,Clannad: After Story,8.4,11000,,2008,https://www.imdb.com/title/tt1298820,,tt1298820


In [21]:
encoded_desc = tv_best[["encoded_title", "show_desc"]]

In [22]:
tv_best.to_csv("cumulative-data/IMDb_top_250.csv", index = False)

### Collecting episode data

The following functions and code were used to collect data from the TV episodes

In [23]:
def _first_text(soup, tag, css_class, default):
    """Return the text of the first `tag` element with class `css_class`, or `default` if none exists."""
    node = soup.find(tag, class_ = css_class)
    return default if node is None else node.text


def _special_episode_rows(listing_url, tv_title, link, season, episode):
    """Scrape every episode listed on one search-results page.

    Each episode's own page is fetched to read its title, rating, vote count,
    air date and description. `episode` is the number of the last episode
    recorded so far; numbering continues from it.

    Returns:
        (rows, episode): the scraped rows and the updated episode counter.
    """
    listing_soup = BeautifulSoup(get(listing_url).text, 'html.parser')
    rows = []
    for header in listing_soup.find_all("span", class_ = "lister-item-header"):
        title_links = re.findall(r'/title/tt[0-9]*/', str(header))
        # Index 1 is presumably the episode's link, with index 0 being the
        # parent series - TODO confirm against the page markup.
        episode_id = str(title_links[1]).split("/")[2]
        url_n = 'https://www.imdb.com/title/' + episode_id + "/"
        episode_soup = BeautifulSoup(get(url_n).text, 'html.parser')
        title = episode_soup.find("h1", {"data-testid": "hero-title-block__title"}).text
        episode += 1
        # Missing fields fall back to "0" (rating, votes) or "" (date, description).
        rating = _first_text(episode_soup, "span", "sc-7ab21ed2-1 jGRxWM", "0")
        total_votes = _first_text(episode_soup, "div", "sc-7ab21ed2-3 dPVcnq", "0")
        airdate = _first_text(episode_soup, "li", "ipc-inline-list__item", "")
        desc = _first_text(episode_soup, "span", "sc-16ede01-2 gXUyNh", "")
        rows.append([tv_title, season, episode, title, airdate, rating, total_votes, desc, url_n, link])
    return rows, episode


def tv_special(link):
    """Collect episode data for a show that is not split into several seasons.

    Such shows are harder to scrape season by season, so the episodes are read
    from IMDb's search listing sorted by release date (ascending). Every
    episode is recorded as season 1 and numbered in listing order.

    Args:
        link: the show's encoded title (IMDb id), e.g. "tt5491994".

    Returns:
        A list of rows of the form [tv_title, season, episode, title, airdate,
        rating, total_votes, desc, episode_url, link]. All scraped values are
        strings, still in raw page format.
    """
    # The show's own page provides the title and the total number of episodes.
    url = 'https://www.imdb.com/title/' + link + "/"
    tv_soup = BeautifulSoup(get(url).text, 'html.parser')
    tv_title = tv_soup.find("h1", {"data-testid": "hero-title-block__title"}).string
    total_episodes = int(tv_soup.find("span", class_ = "ipc-title__subtext").text)

    season = 1
    search_url = "https://www.imdb.com/search/title/?series=" + link + "&view=simple&sort=release_date,asc"
    comprehend, episode = _special_episode_rows(search_url, tv_title, link, season, 0)

    # Later pages start at items 51, 101, ... The comparison is inclusive so a
    # page whose only entry is the final episode (item 51, 101, ...) is fetched.
    n = 51
    while n <= total_episodes:
        page_url = search_url + "&start=" + str(n) + "&ref_=adv_nxt"
        rows, episode = _special_episode_rows(page_url, tv_title, link, season, episode)
        comprehend.extend(rows)
        n += 50
    return (comprehend)

In [24]:
def episode_recorder(episode):
    """Extract one episode's details from its entry on a season page.

    Used for shows that have multiple seasons.

    Args:
        episode: parsed element for one episode; it must provide
            `.meta['content']`, `.a['title']` and `.find(tag, class_=...)`.

    Returns:
        [episode_number, title, airdate, rating, total_votes, desc]. The air
        date is reordered from "day month year" to "month day, year" with any
        "." removed. When the air date has fewer than three space-separated
        parts, airdate is "" and rating and total_votes are both "0".
    """
    def _span_text_or_zero(css_class):
        # Text of the first matching <span>, or "0" when it is absent.
        node = episode.find('span', class_=css_class)
        return "0" if node is None else node.text

    episode_number = int(episode.meta['content'])
    title = episode.a['title']

    date_parts = episode.find('div', class_='airdate').text.strip().split(" ")
    if len(date_parts) < 3:
        # Incomplete air date: no date, rating or votes are recorded.
        airdate, rating, total_votes = "", "0", "0"
    else:
        day, month, year = date_parts[0], date_parts[1], date_parts[2]
        airdate = (month + " " + day + ", " + year).replace(".", "")
        rating = _span_text_or_zero('ipl-rating-star__rating')
        total_votes = _span_text_or_zero('ipl-rating-star__total-votes')

    desc = episode.find('div', class_='item_description').text.strip()
    return [episode_number, title, airdate, rating, total_votes, desc]

In [25]:
def tv_output(link):
    """Collect the episode rows of one show.

    If the show's page has a season selector labelled "<N> seasons", each
    season page is scraped with `episode_recorder`. Otherwise (no selector, or
    any other label) the show is handed to `tv_special`.

    Args:
        link: the show's encoded title (IMDb id), e.g. "tt0903747".

    Returns:
        A list of rows of the form [tv_title, season, episode_number, title,
        airdate, rating, total_votes, desc, episode_url, link].
    """
    tv_episodes = []
    url = 'https://www.imdb.com/title/' + link + "/"
    response = get(url)
    tv_soup = BeautifulSoup(response.text, 'html.parser')
    tv_title = tv_soup.find("h1", {"data-testid": "hero-title-block__title"}).string

    season_select = tv_soup.find("select", class_ = "ipc-simple-select__input")
    label = "" if season_select is None else season_select.get("aria-label", "")
    # Read the whole number. Taking only the first character of the label
    # treated "12 seasons" as a show without a season list.
    season_count = re.fullmatch(r"([0-9]+) seasons", label)
    if season_count is None:
        tv_episodes.extend(tv_special(link))
        return(tv_episodes)

    for sn in range(1, int(season_count.group(1)) + 1):
        response = get('https://www.imdb.com/title/' + link + '/episodes?season=' + str(sn))
        page_html = BeautifulSoup(response.text, 'html.parser')
        for episodes in page_html.find_all('div', class_ = 'info'):
            url_n1 = "https://www.imdb.com" + episodes.find('strong').a["href"]
            episode_data = [tv_title, sn]
            episode_data.extend(episode_recorder(episodes))
            episode_data.append(url_n1)
            episode_data.append(link)
            tv_episodes.append(episode_data)
    return(tv_episodes)

### The mini dataset

The mini dataset contains the episodes of the first five TV shows, to show what the final dataset will look like.

In [26]:
# Smoke test of the scraping functions: gather the episodes of the first five
# shows only, then attach each show's description via its encoded title.
l = []
for tv in tv_list[:5]:
    l += tv_output(tv)
df = pd.DataFrame(
    l,
    columns = ['tv', 'season', 'episode_number', 'title', 'airdate', 'rating',
               'total_votes', 'episode_desc', 'episode_url', 'encoded_title'],
).merge(encoded_desc, on = "encoded_title", how = "inner")

In [27]:
# Strip the "Episode aired " / "Episode airs " prefixes from the air dates.
# regex=False is passed throughout so every pattern is a literal string on
# every pandas version ("(" and ")" are not valid regular expressions alone).
for prefix in ("Episode aired ", "Episode airs "):
    df["airdate"] = df["airdate"].str.replace(prefix, "", regex=False)
# Expand the "K" suffix to "000", then drop thousands separators and the
# parentheses that surround some vote counts.
df["total_votes"] = df["total_votes"].str.replace("K", "000", regex=False)
for symbol in (",", "(", ")"):
    df["total_votes"] = df["total_votes"].str.replace(symbol, "", regex=False)

  """
  


In [28]:
# A value that still contains "." came from a decimal "K" figure and now has
# one zero too many; drop its last character. Vectorised, so it does not
# depend on the frame having a 0..n-1 index; missing values are left alone.
has_decimal = df["total_votes"].str.contains(".", regex=False, na=False)
df.loc[has_decimal, "total_votes"] = df.loc[has_decimal, "total_votes"].str[:-1]

In [29]:
# Remove the decimal point so total_votes can be cast to int; regex=False
# keeps "." a literal character on every pandas version.
df["total_votes"] = df["total_votes"].str.replace(".", "", regex=False)

  """Entry point for launching an IPython kernel.


In [30]:
# Set the numeric dtypes: season, episode_number and total_votes become
# integers, and rating becomes the smallest float type that holds its values.
df["season"] = df["season"].astype(int)
df["episode_number"] = df["episode_number"].astype(int)
df["rating"] = pd.to_numeric(df["rating"], downcast="float")
df["total_votes"] = df["total_votes"].astype(int)

In [31]:
df

Unnamed: 0,tv,season,episode_number,title,airdate,rating,total_votes,episode_desc,episode_url,encoded_title,show_desc
0,Planet Earth II,1,1,Islands,"Feb 18, 2017",9.3,4600,Wildlife documentary series with David Attenbo...,https://www.imdb.com/title/tt6142646/,tt5491994,David Attenborough returns with a new wildlife...
1,Planet Earth II,1,2,Mountains,"Feb 25, 2017",8.9,3500,The wildlife documentary series with David Att...,https://www.imdb.com/title/tt6209126/,tt5491994,David Attenborough returns with a new wildlife...
2,Planet Earth II,1,3,Jungles,"Mar 4, 2017",8.7,3100,Jungles provide the richest habitats on the pl...,https://www.imdb.com/title/tt6209130/,tt5491994,David Attenborough returns with a new wildlife...
3,Planet Earth II,1,4,Deserts,"Mar 11, 2017",8.6,2800,The world's deserts force animals to come up w...,https://www.imdb.com/title/tt6209134/,tt5491994,David Attenborough returns with a new wildlife...
4,Planet Earth II,1,5,Grasslands,"Mar 18, 2017",8.5,2600,Grasslands cover one quarter of all land and s...,https://www.imdb.com/title/tt6209140/,tt5491994,David Attenborough returns with a new wildlife...
...,...,...,...,...,...,...,...,...,...,...,...
89,Chernobyl,1,1,1:23:45,"May 6, 2019",9.4,51000,Plant workers and firefighters put their lives...,https://www.imdb.com/title/tt8162428/,tt7366338,"In April 1986, an explosion at the Chernobyl n..."
90,Chernobyl,1,2,Please Remain Calm,"May 13, 2019",9.6,49000,"With untold millions at risk, Ulana makes a de...",https://www.imdb.com/title/tt8482972/,tt7366338,"In April 1986, an explosion at the Chernobyl n..."
91,Chernobyl,1,3,"Open Wide, O Earth","May 20, 2019",9.5,46000,Valery creates a detailed plan to decontaminat...,https://www.imdb.com/title/tt9166672/,tt7366338,"In April 1986, an explosion at the Chernobyl n..."
92,Chernobyl,1,4,The Happiness of All Mankind,"May 27, 2019",9.4,43000,Valery and Boris attempt to find solutions to ...,https://www.imdb.com/title/tt9166678/,tt7366338,"In April 1986, an explosion at the Chernobyl n..."


### Storing the data sets

The code below stores each TV show's episode data in its own CSV file.

In [32]:
name_list = []
count = 1

In [33]:
def _clean_episode_frame(frame):
    """Return a copy of a scraped episode frame with tidy text and numeric dtypes.

    Air-date prefixes are stripped, vote counts such as "(3,512)" or "4.6K"
    are turned into integers, and season, episode_number and rating are cast
    to numbers. Every pattern is passed with regex=False so it is treated as a
    literal string on every pandas version.
    """
    frame = frame.copy()
    airdate = frame["airdate"]
    for prefix in ("Episode aired ", "Episode airs "):
        airdate = airdate.str.replace(prefix, "", regex=False)
    frame["airdate"] = airdate

    votes = frame["total_votes"].str.replace("K", "000", regex=False)
    for symbol in (",", "(", ")"):
        votes = votes.str.replace(symbol, "", regex=False)
    # "4.6K" is "4.6000" at this point: one zero too many once "." is removed.
    has_decimal = votes.str.contains(".", regex=False, na=False)
    votes = votes.where(~has_decimal, votes.str[:-1])
    frame["total_votes"] = votes.str.replace(".", "", regex=False).astype(int)

    frame["season"] = frame["season"].astype(int)
    frame["episode_number"] = frame["episode_number"].astype(int)
    frame["rating"] = pd.to_numeric(frame["rating"], downcast="float")
    return frame


def _file_slug(show_title):
    """Turn a show title into a lower-case, hyphen-separated file name stem."""
    slug = show_title.replace(" ", "-").replace(":", "").strip("-").lower()
    return slug.translate(str.maketrans("", "", ";!./',?"))


for tv in tv_list[:5]: # to obtain all the datasets, replace tv_list[:5] with tv_list
    new_list = tv_output(tv)
    df = pd.DataFrame(new_list, columns = ['tv', 'season', 'episode_number', 'title', 'airdate', 'rating', 'total_votes', 'episode_desc', 'episode_url', 'encoded_title'])
    df = df.merge(encoded_desc, on = "encoded_title", how = "inner")
    df = _clean_episode_frame(df)
    # Episodes without a rating were recorded with "0"; leave them out.
    df = df[df["rating"] > 0]
    df.to_csv("tv-series-data/" + tv + ".csv", index = False)
    if not new_list:
        # Nothing was scraped, so there is no show title to name a file after.
        continue
    # The show title is the first field of every scraped row; reading it from
    # there still works when the rating filter has removed every episode.
    name = _file_slug(new_list[0][0])
    if name in name_list:
        # Keep raising the suffix until the name is unused, so that a third
        # show with the same slug no longer overwrites the second one's file.
        base_name, suffix = name, count
        name = base_name + str(suffix)
        while name in name_list:
            suffix += 1
            name = base_name + str(suffix)
    name_list.append(name)
    df.to_csv("tv-series-data-named/" + name + ".csv", index = False)

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  


The code below combines all the tv shows' datasets into one huge dataset

In [34]:
# Read every per-show CSV that exists and combine them with a single concat.
# Concatenating inside the loop copies the growing frame on every iteration.
show_frames = []
for tv in tv_list:
    show_path = "tv-series-data/" + tv + ".csv"
    if os.path.isfile(show_path):
        show_frames.append(pd.read_csv(show_path))
if show_frames:
    # Reversed so the row order matches what this cell has always written:
    # shows later in tv_list come first.
    df = pd.concat(show_frames[::-1], axis = 0, ignore_index = True)
else:
    # No per-show files yet: write a header-only dataset.
    df = pd.DataFrame(columns = ['tv', 'season', 'episode_number', 'title', 'airdate', 'rating', 'total_votes', 'episode_desc', 'episode_url', 'encoded_title', 'show_desc'])
df.to_csv("cumulative-data/tv_dataset.csv", index = False)