# Data Collection

Run the cell below to move up to the parent directory.

In [None]:
%cd ..

Uncomment the lines below and run the cell to create the directories if this is your first time running the notebook. If it reports that the directories already exist, proceed to the next cell.

In [None]:
#%mkdir tv-series-data/
#%mkdir tv-series-data-named/
#%mkdir cumulative-data/

In [None]:
import pandas as pd
import numpy as np
import os
%matplotlib inline
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import re
from requests import get

### IMDb Top 250 TV shows

The goal of this section is to obtain the top 250 highest rated TV shows on IMDb

#### TV Series List

In [None]:
# IMDb chart page listing the 250 highest rated TV shows
url = "https://www.imdb.com/chart/toptv"
# download the chart page; the body is parsed in the next cell
response = requests.get(url)

In [None]:
# parse the downloaded chart HTML with Python's built-in parser
tv_soup = BeautifulSoup(response.text, features='html.parser')

In [None]:
# every <td class="titleColumn"> cell of the chart (one per listed show)
containers = tv_soup.find_all('td', attrs={'class': 'titleColumn'})

In [None]:
# every <td class="ratingColumn imdbRating"> cell of the chart
containers_rating = tv_soup.find_all('td', attrs={'class': "ratingColumn imdbRating"})

In [None]:
# the rating is the first three characters of each <strong title="..."> attribute
rating_list = [cell.strong["title"][:3] for cell in containers_rating]

In [None]:
# encoded title of every show: the third "/"-separated piece of the link's href
tv_list = [cell.a['href'].split("/")[2] for cell in containers]

In [None]:
# For every show on the chart, download its title page and record
# title, total_votes, description, release year, its link and its encoded title.
comprehensive_list = []
for tv in tv_list:
    link = 'https://www.imdb.com/title/' + tv
    # Dedicated names so the chart-level `response` / `tv_soup` are not overwritten,
    # and the page is parsed only once per show.
    show_response = get(link + "/")
    show_soup = BeautifulSoup(show_response.text, 'html.parser')
    title_verbose = show_soup.find('title').string
    # NOTE(review): this takes the first four-digit run found in the <title> text;
    # a show whose name itself contains four digits would be picked up instead -- confirm.
    releaseYear = re.findall(r'[0-9][0-9][0-9][0-9]', title_verbose)
    rating_count = show_soup.find("div", class_ ="AggregateRatingButton__TotalRatingAmount-sc-1ll29m0-3 jkCVKJ").string
    tv_title = show_soup.find("h1", {"data-testid": "hero-title-block__title"}).string
    description = show_soup.find("span", {"data-testid": "plot-xl"}).string
    encoded_title = tv
    comprehensive_list.append([tv_title, rating_count, description, releaseYear[0], link, encoded_title])

In [None]:
# turn the scraped rows into a data frame, one column per scraped field
tv_best = pd.DataFrame(
    comprehensive_list,
    columns=["title", "total_votes", "description", "year", "link", "encoded_title"],
)

In [None]:
# attach the ratings scraped from the chart page as a new column
tv_best = tv_best.assign(rating=rating_list)

In [None]:
# ratings were scraped as text; store them as floats
tv_best = tv_best.astype({"rating": float})

In [None]:
# put the rating column right after the title
tv_best = tv_best.loc[
    :,
    ["title", "rating", "total_votes", "description", "year", "link", "encoded_title"],
]

In [None]:
# expand the magnitude suffixes of total votes: K -> 000, then M -> 000000
tv_best["total_votes"] = (
    tv_best["total_votes"]
    .str.replace("K", "000")
    .str.replace("M", "000000")
)

In [None]:
# resetting the index of the data frame: rows are relabelled 0..n-1 and the
# previous index is discarded (drop = True) instead of being kept as a column
tv_best = tv_best.reset_index(drop = True)

In [None]:
# A value such as "1.2K" has become "1.2000" after the suffix expansion, which is one
# zero too many once the decimal point is removed. Drop the last character of every
# value that contains a "."; the mask is literal (regex=False) and treats missing
# values as "no decimal point" instead of failing on them.
has_decimal = tv_best["total_votes"].str.contains(".", regex=False, na=False)
tv_best.loc[has_decimal, "total_votes"] = tv_best.loc[has_decimal, "total_votes"].str[:-1]

In [None]:
# replacing . with an empty string so total votes can be converted into integer;
# regex=False makes "." a literal dot on every pandas version (as a regular
# expression it would match any character)
tv_best["total_votes"] = tv_best["total_votes"].str.replace(".", "", regex=False)

In [None]:
# cast the numeric columns: rating to the smallest float type that fits,
# total_votes and year to integers
tv_best["rating"] = pd.to_numeric(tv_best["rating"], downcast="float")
tv_best = tv_best.astype({"total_votes": int, "year": int})

In [None]:
# last expression of the cell, so the notebook renders the finished data frame
tv_best

In [None]:
# save the top-250 table as CSV without the row index
tv_best.to_csv("cumulative-data/IMDb_top_250.csv", index = False)

### Collecting episode data

The following functions and code were used to collect data from the TV episodes

In [None]:
def _tv_special_episode_ids(listing_url):
    """Return the encoded titles of the episodes listed on one search-results page.

    Every ``lister-item-header`` span is scanned for ``/title/tt.../`` links and the
    second match is used, exactly as before (presumably the first match is the
    series itself -- confirm against the page markup).
    """
    listing_soup = BeautifulSoup(get(listing_url).text, 'html.parser')
    episode_ids = []
    for header in listing_soup.find_all("span", class_ = "lister-item-header"):
        links = re.findall(r'/title/tt[0-9]*/', str(header))
        episode_ids.append(str(links[1]).split("/")[2])
    return episode_ids


def _tv_special_episode_details(episode_id):
    """Scrape one episode page and return [title, airdate, rating, total_votes, desc].

    Rating and vote count fall back to "0", air date and description to "" when
    the corresponding element is not on the page.
    """
    episode_soup = BeautifulSoup(get('https://www.imdb.com/title/' + episode_id + "/").text, 'html.parser')
    title = episode_soup.find("h1", {"data-testid": "hero-title-block__title"}).text

    rating_tag = episode_soup.find("span", class_ = "AggregateRatingButton__RatingScore-sc-1ll29m0-1 iTLWoV")
    rating = "0" if rating_tag is None else rating_tag.text

    votes_tag = episode_soup.find("div", class_ ="AggregateRatingButton__TotalRatingAmount-sc-1ll29m0-3 jkCVKJ")
    total_votes = "0" if votes_tag is None else votes_tag.text

    airdate_tag = episode_soup.find("li", class_ ="ipc-inline-list__item")
    airdate = "" if airdate_tag is None else airdate_tag.text

    # Same fallback as the fields above, so an episode without a plot text
    # no longer aborts the whole scrape with an AttributeError.
    desc_tag = episode_soup.find("span", class_ = "GenresAndPlot__TextContainerBreakpointXL-sc-cum89p-2 eqbKRZ")
    desc = "" if desc_tag is None else desc_tag.text
    return [title, airdate, rating, total_votes, desc]


def tv_special(link):
    """Collect episode data for a show via IMDb's release-date-ordered episode search.

    Used for shows whose title page does not offer a multi-season selector, so
    the episodes are read from the search listing (ascending release date) and
    every episode is recorded under season 1 with a running episode number.

    Parameters:
        link: encoded IMDb title of the show (the ``tt...`` part of its URL).

    Returns:
        A list of rows
        ``[tv_title, season, episode, title, airdate, rating, total_votes, desc, link]``.
    """
    # show page: title and total number of episodes
    tv_soup = BeautifulSoup(get('https://www.imdb.com/title/' + link + "/").text, 'html.parser')
    tv_title = tv_soup.find("h1", {"data-testid": "hero-title-block__title"}).string
    total_episodes = int(tv_soup.find("span", class_ = "ipc-title__subtext").text)

    # The first listing page has no start parameter; later pages start at 51, 101, ...
    # The upper bound is inclusive so that a show with exactly 51, 101, ... episodes
    # still gets its last page (the former `n < total_episodes` test skipped it).
    first_page = "https://www.imdb.com/search/title/?series=" + link + "&view=simple&sort=release_date,asc"
    listing_urls = [first_page]
    for start in range(51, total_episodes + 1, 50):
        listing_urls.append(first_page + "&start=" + str(start) + "&ref_=adv_nxt")

    comprehend = []
    season = 1
    episode = 0
    for listing_url in listing_urls:
        for episode_id in _tv_special_episode_ids(listing_url):
            episode += 1
            title, airdate, rating, total_votes, desc = _tv_special_episode_details(episode_id)
            comprehend.append([tv_title, season, episode, title, airdate, rating, total_votes, desc, link])
    return comprehend

In [None]:
def episode_recorder(episode):
    """Extract the fields of one episode entry from a season page.

    Parameters:
        episode: the parsed element of a single episode (must offer ``.meta``,
            ``.a`` and ``.find`` the way the season-page entries do).

    Returns:
        ``[episode_number, title, airdate, rating, total_votes, desc]``. When the
        air date has fewer than three space-separated parts, the air date is ""
        and rating / total_votes are "0".
    """
    episode_number = int(episode.meta['content'])
    title = episode.a['title']

    date_parts = episode.find('div', class_='airdate').text.strip().split(" ")
    if len(date_parts) < 3:
        # incomplete air date: nothing reliable to record for this episode
        airdate, rating, total_votes = "", "0", "0"
    else:
        # "20 Jan. 2008" -> "Jan 20, 2008"
        first, second, third = date_parts[0], date_parts[1], date_parts[2]
        airdate = " ".join([second, first + ",", third]).replace(".", "")

        rating_tag = episode.find('span', class_='ipl-rating-star__rating')
        rating = "0" if rating_tag is None else rating_tag.text

        votes_tag = episode.find('span', class_='ipl-rating-star__total-votes')
        total_votes = "0" if votes_tag is None else votes_tag.text

    desc = episode.find('div', class_='item_description').text.strip()
    return [episode_number, title, airdate, rating, total_votes, desc]

In [None]:
def tv_output(link):
    """Collect the episode rows of one show.

    The show's title page is checked for a season selector whose ``aria-label``
    reads "<number> seasons". If it is present, each season page is scraped
    with ``episode_recorder``; otherwise the show is handed to ``tv_special``.

    Parameters:
        link: encoded IMDb title of the show (the ``tt...`` part of its URL).

    Returns:
        A list of rows ``[tv_title, season, episode_number, title, airdate,
        rating, total_votes, desc, link]``.
    """
    url = 'https://www.imdb.com/title/' + link + "/"
    response = get(url)
    tv_soup = BeautifulSoup(response.text, 'html.parser')
    tv_title = tv_soup.find("h1", {"data-testid": "hero-title-block__title"}).string

    # Look the selector up once and read the whole leading number of its label.
    # Taking only the first character (as before) pushed shows with ten or more
    # seasons into the single-season path.
    season_select = tv_soup.find("select", class_ = "ipc-simple-select__input")
    label_parts = season_select["aria-label"].split() if season_select is not None else []
    has_season_pages = (
        len(label_parts) == 2
        and label_parts[0].isdigit()
        and label_parts[1] == "seasons"
    )
    if not has_season_pages:
        return list(tv_special(link))

    tv_episodes = []
    for sn in range(1, int(label_parts[0]) + 1):
        response = get('https://www.imdb.com/title/' + link + '/episodes?season=' + str(sn))
        page_html = BeautifulSoup(response.text, 'html.parser')
        for episode in page_html.find_all('div', class_ = 'info'):
            # row layout: show title, season, the recorder's fields, encoded title
            tv_episodes.append([tv_title, sn] + episode_recorder(episode) + [link])
    return tv_episodes

### The mini dataset

The mini dataset extracts information about the first five TV shows to showcase what the final dataset will look like.

In [None]:
# Trial run: gather the episode rows of the first five shows into one frame
l = [row for tv in tv_list[:5] for row in tv_output(tv)]
df = pd.DataFrame(
    l,
    columns=[
        'tv', 'season', 'episode_number', 'title', 'airdate',
        'rating', 'total_votes', 'desc', 'encoded_title',
    ],
)

In [None]:
# strip the "Episode aired " prefix from the air dates
df["airdate"] = df["airdate"].str.replace("Episode aired ", "", regex=False)
# expand the K suffix, then remove thousands separators and the surrounding brackets;
# regex=False keeps "(" and ")" literal on every pandas version
votes = df["total_votes"].str.replace("K", "000", regex=False)
for symbol in (",", "(", ")"):
    votes = votes.str.replace(symbol, "", regex=False)
df["total_votes"] = votes

In [None]:
# "1.2K" has become "1.2000": drop the last character of every value containing "."
# so that removing the dot afterwards yields the right magnitude. The literal mask
# does not depend on a 0..n-1 index and treats missing values as "no decimal point".
has_decimal = df["total_votes"].str.contains(".", regex=False, na=False)
df.loc[has_decimal, "total_votes"] = df.loc[has_decimal, "total_votes"].str[:-1]

In [None]:
# remove the decimal point; regex=False so "." is a literal dot on every pandas version
df["total_votes"] = df["total_votes"].str.replace(".", "", regex=False)

In [None]:
# cast the numeric columns: rating to the smallest float type that fits,
# season, episode_number and total_votes to integers
df["rating"] = pd.to_numeric(df["rating"], downcast="float")
df = df.astype({"season": int, "episode_number": int, "total_votes": int})

In [None]:
# last expression of the cell, so the notebook renders the mini dataset
df

### Storing the data sets

The code below stores the episode data of each TV show in its own CSV file.

In [None]:
# file names already used for the named CSV files, and the suffix
# appended to a name that is already taken
name_list, count = [], 1

In [None]:
def _clean_episode_frame(frame):
    """Normalise the scraped text columns of an episode frame and cast the numeric ones."""
    frame["airdate"] = frame["airdate"].str.replace("Episode aired ", "", regex=False)
    votes = frame["total_votes"].str.replace("K", "000", regex=False)
    for symbol in (",", "(", ")"):
        votes = votes.str.replace(symbol, "", regex=False)
    # "1.2K" -> "1.2000": drop one trailing zero before removing the decimal point
    has_decimal = votes.str.contains(".", regex=False, na=False)
    votes = votes.where(~has_decimal, votes.str[:-1])
    frame["total_votes"] = votes.str.replace(".", "", regex=False)
    frame["season"] = frame["season"].astype(int)
    frame["episode_number"] = frame["episode_number"].astype(int)
    frame["rating"] = pd.to_numeric(frame["rating"], downcast="float")
    frame["total_votes"] = frame["total_votes"].astype(int)
    return frame


def _file_name_from_title(title):
    """Turn a show title into a lower-case, dash-separated file name without punctuation."""
    slug = "-".join(title.split(" ")).replace(":", "").strip("-").lower()
    return slug.translate(str.maketrans("", "", ";!./',?"))


# make sure both output directories exist before anything is written
os.makedirs("tv-series-data", exist_ok=True)
os.makedirs("tv-series-data-named", exist_ok=True)

for tv in tv_list[:5]: # to obtain all the datasets, replace tv_list[:5] with tv_list
    new_list = tv_output(tv)
    df = pd.DataFrame(new_list, columns = ['tv', 'season', 'episode_number', 'title', 'airdate', 'rating', 'total_votes', 'desc', 'encoded_title'])
    df = _clean_episode_frame(df)
    df.to_csv("tv-series-data/" + tv + ".csv", index = False)
    if df.empty:
        # no episode rows means no show title to build a file name from
        continue
    name = _file_name_from_title(df["tv"].iloc[0])
    if name in name_list:
        # First clash gets the suffix `count`, as before; further clashes keep
        # counting up so an already written file is never overwritten.
        base, suffix = name, count
        name = base + str(suffix)
        while name in name_list:
            suffix += 1
            name = base + str(suffix)
    name_list.append(name)
    df.to_csv("tv-series-data-named/" + name + ".csv", index = False)

The code below combines all the TV series datasets into one large dataset.

In [None]:
# Read every per-show CSV that exists and combine them with a single concat,
# instead of re-concatenating the growing frame once per file.
frames = []
for tv in tv_list:
    path = "tv-series-data/" + tv + ".csv"
    if os.path.isfile(path):
        frames.append(pd.read_csv(path))

if frames:
    # reversed so the row order matches the earlier behaviour, where each
    # newly read file was placed in front of the rows collected so far
    df = pd.concat(frames[::-1], axis = 0, ignore_index = True)
else:
    # nothing on disk yet: still write a CSV carrying only the header
    df = pd.DataFrame(columns = ['tv', 'season', 'episode_number', 'title', 'airdate', 'rating', 'total_votes', 'desc', 'encoded_title'])
df.to_csv("cumulative-data/tv_dataset.csv", index = False)