### This code scrapes Flixable's Disney+ page of shows and movies (which contains links to individual titles' data); then it uses the links to get data on each title in Disney+'s catalog.

The output data contains the following variables:
* An unnamed first column containing the row index.
* **title**: name of the title.
* **imdb**: IMDb score of the title.
* **year**: year of release of the title. In the case of shows, it's the release of the first season of the show.
* **content**: MPAA or TV rating, if one is available.
* **runtime**: for movies, the runtime in minutes and seconds. For shows, the number of seasons (a season need only have at least one episode released to count on both Flixable and Disney+).
* **added**: date the title was added to Disney+. In the case of shows, it appears to be the date the most recent season was added (though at the time of writing, this doesn't hold true for *Loki*, for reasons I'm unsure of).
* 44 genre columns (named with a `G` prefix, e.g. `GComedy`), each holding True or False depending on whether the title is classified under that genre on Flixable. At the time of writing (10/19/2023), Flixable lists only three of a title's Disney+ genres: the first three in alphabetical order. It also doesn't feature every genre that exists on Disney+ (Horror, for example).
* **link**: link to the Flixable page for the title.

Required packages:
* selenium
* pandas
* bs4
* requests
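* re (part of the Python standard library; no installation needed)
* time (part of the Python standard library; no installation needed)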

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from selenium import webdriver
from selenium.webdriver import Keys, ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from time import sleep

In [2]:
# Open an Edge session, load Flixable's Disney+ catalogue, and keep scrolling
# to the bottom until the page stops growing.  The final HTML is stored in
# `exp` for the parsing cells that follow.
url = "https://flixable.com/disney-plus/?s="
driver = webdriver.Edge()

# try/finally guarantees the browser is shut down even when loading or
# scrolling raises, so a failed run does not leave an orphaned Edge process.
try:
    driver.get(url)
    driver.maximize_window()

    # This scroll method is adapted from an answer provided by Ratmir Asanov on Stack Overflow
    # Source: https://stackoverflow.com/a/48851166

    # Height of the document before the first scroll.
    last_height = driver.execute_script("return document.body.scrollHeight")
    # True when the previous pass already saw no growth and triggered a refresh.
    refresh = False

    while True:
        # Jump to the bottom of the page (presumably this triggers the site
        # to load the next batch of cards).
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Give the page time to load.
        sleep(5)

        # Measure again and compare with the height seen on the previous pass.
        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:
            if refresh:
                # No growth on two consecutive passes (with a refresh in
                # between): treat the page as complete and capture it.
                exp = driver.page_source
                break
            # First pass without growth: refresh once and try again before
            # concluding that the end has been reached.
            driver.refresh()
            refresh = True
        else:
            refresh = False
        last_height = new_height
finally:
    # closes the driver
    driver.quit()

In [3]:
# Parse the fully scrolled page and keep only the movie/show cards.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns about it), so the same HTML could
# be parsed differently on different machines.
cards = BeautifulSoup(exp, "html.parser").find_all("div", class_="card-body")

In [4]:
# One empty list per general (non-genre) column of the output DataFrame.
# The generator yields a fresh list on each step, so every name is bound to
# its own independent list.
link, title, imdb, year, content, runtime, added = ([] for _ in range(7))

In [5]:
# Walk the catalogue cards and record, for each one, the absolute URL of the
# title's Flixable page and its IMDb score.  Titles are not taken from the
# cards because many of them are truncated there and end in ellipses.
for card in cards:
    # The first anchor in the card holds the site-relative link to the title page.
    href = card.find_all("a")[0]["href"]
    link.append("https://flixable.com" + href)

    # The second span's first three characters are read as the score;
    # any failure to obtain or parse it is recorded as a missing value.
    try:
        score = float(card.find_all("span")[1].get_text()[0:3])
    except Exception:
        score = None
    imdb.append(score)

# number of titles found; used to size the genre flag lists
dfsize = len(link)

In [6]:
# One flag list per genre column, each holding `dfsize` False values (one per
# title).  The generator builds a separate list for every name, so setting a
# flag in one genre never affects another.
(
    GAcAd, GAnimals, GAnimation, GAnime, GAnthology,
    GBio, GBuddy,
    GComedy, GComing, GConcert, GCrime,
    GDance, GDisaster, GDocumentary, GDocuseries, GDrama,
    GFamily, GFantasy, GFNoir,
    GGame,
    GHistorical,
    GKids,
    GLifestyle,
    GMedical, GMusic, GMusical, GMystery,
    GParody, GPolice, GProcedural,
    GReality, GRomance, GRomCom,
    GSciFi, GSoap, GSports, GSpy, GSuperhero, GSurvival,
    GTalk, GThriller, GTravel,
    GVariety,
    GWestern,
) = ([False] * dfsize for _ in range(44))

In [7]:
# a function used in the next block of code;
# this function takes the "genres" variable created for each title and advances to the next genre
def success(current, curnum, genre_tags=None):
    """Advance to the genre that follows position ``curnum``.

    Args:
        current: Text of the genre that was just matched.  It is accepted for
            the benefit of existing callers but is not used.
        curnum: Index of the genre that was just matched.
        genre_tags: Sequence of objects exposing ``get_text()``.  When omitted,
            the module-level ``genres`` list (set by the scraping loop for the
            title being processed) is used, which is the original behaviour.

    Returns:
        A ``(text, index)`` tuple for the next genre.  When ``index`` is past
        the end of the sequence, ``text`` is the placeholder ``"ZNull"``.
    """
    if genre_tags is None:
        # Backward-compatible default: fall back to the global that the
        # scraping loop assigns for each title.
        genre_tags = genres
    newnum = curnum + 1
    try:
        newcur = genre_tags[newnum].get_text()
    # this function is proofed against Flixable supporting more than 3 genres by returning a fake genre when out of range
    # unfortunately, Flixable presently only shows 3 genres, when Disney+ is capable of assigning titles more genres
    except IndexError:
        newcur = "ZNull"
    return newcur, newnum

In [8]:
# Maps each genre label, spelled exactly as the original checks expected it
# from Flixable, to the flag list (one bool per title) that becomes the
# matching output column.  A lookup table replaces the former first-letter
# if-ladder, which only worked when the genres arrived in alphabetical order,
# stopped at the first label it did not know, and set GMusical instead of
# GMystery for "Mystery".
genre_flags = {
    "Action-Adventure": GAcAd,
    "Animals & Nature": GAnimals,
    "Animation": GAnimation,
    "Anime": GAnime,
    "Anthology": GAnthology,
    "Biographical": GBio,
    "Buddy": GBuddy,
    "Comedy": GComedy,
    "Coming of Age": GComing,
    "Concert Movie": GConcert,
    "Crime": GCrime,
    "Dance": GDance,
    "Disaster": GDisaster,
    "Documentary": GDocumentary,
    "Docuseries": GDocuseries,
    "Drama": GDrama,
    "Family": GFamily,
    "Fantasy": GFantasy,
    "Film Noir": GFNoir,
    "Game Show / Competition": GGame,
    "Historical": GHistorical,
    "Kids": GKids,
    "Lifestyle": GLifestyle,
    "Medical": GMedical,
    "Music": GMusic,
    "Musical": GMusical,
    "Mystery": GMystery,
    "Parody": GParody,
    "Police / Cop": GPolice,
    "Procedural": GProcedural,
    "Reality": GReality,
    "Romance": GRomance,
    "Romantic Comedy": GRomCom,
    "Science Fiction": GSciFi,
    "Soap Opera / Melodrama": GSoap,
    "Sports": GSports,
    "Spy/Espionage": GSpy,
    "Superhero": GSuperhero,
    "Survival": GSurvival,
    "Talk Show": GTalk,
    "Thriller": GThriller,
    "Travel": GTravel,
    "Variety": GVariety,
    "Western": GWestern,
}

# Compiled once instead of on every iteration.  Note that the trailing "/*"
# is a regex quantifier (zero or more slashes), not a glob wildcard.
genre_href = re.compile(r"/disney-plus/genre/*")


def _text_or_none(tags, position):
    """Return the text of ``tags[position]``, or None when that tag is absent."""
    try:
        return tags[position].get_text()
    except IndexError:
        return None


for i, page_url in enumerate(link):
    # scraping a single Flixable page

    # A timeout keeps one stalled connection from hanging the whole run.
    r = requests.get(page_url, timeout=30)
    # Parser named explicitly so results do not depend on what is installed.
    soup = BeautifulSoup(r.text, "html.parser")
    title.append(soup.find_all("h1")[0].get_text())
    info = soup.find_all("div", class_="col-lg-8")[0]  # this contains much of our desired metadata

    # The year / rating / runtime spans are read by position.
    # NOTE(review): if a page omits the rating span, the runtime text may land
    # in `content` and `runtime` becomes None -- confirm against the page markup.
    details = info.find_all("span", class_="mr-2")
    year.append(details[0].get_text())
    content.append(_text_or_none(details, 1))
    # Previously an unguarded [2] lookup; a missing span now yields None
    # instead of aborting the scrape part-way through.
    runtime.append(_text_or_none(details, 2))

    try:
        added.append(info.find_all("p", class_="mb-2")[0].get_text().strip().replace("Added to Disney+:", "").replace("New Season Added:", ""))
    except IndexError:
        added.append(None)

    # this is a small list containing the genres listed on Flixable
    genres = info.find_all("a", href=genre_href)

    # Flag every listed genre that has a column.  A title with no genres, or
    # with a label that has no column, simply sets nothing.
    for genre_tag in genres:
        flags = genre_flags.get(genre_tag.get_text())
        if flags is not None:
            flags[i] = True

In [9]:
# Assemble the output table.  Keyword order fixes the column order: general
# metadata first, then the 44 genre flags, and the Flixable link last.
movies = pd.DataFrame(
    dict(
        title=title,
        imdb=imdb,
        year=year,
        content=content,
        runtime=runtime,
        added=added,
        GAcAd=GAcAd,
        GAnimals=GAnimals,
        GAnimation=GAnimation,
        GAnime=GAnime,
        GAnthology=GAnthology,
        GBio=GBio,
        GBuddy=GBuddy,
        GComedy=GComedy,
        GComing=GComing,
        GConcert=GConcert,
        GCrime=GCrime,
        GDance=GDance,
        GDisaster=GDisaster,
        GDocumentary=GDocumentary,
        GDocuseries=GDocuseries,
        GDrama=GDrama,
        GFamily=GFamily,
        GFantasy=GFantasy,
        GFNoir=GFNoir,
        GGame=GGame,
        GHistorical=GHistorical,
        GKids=GKids,
        GLifestyle=GLifestyle,
        GMedical=GMedical,
        GMusic=GMusic,
        GMusical=GMusical,
        GMystery=GMystery,
        GParody=GParody,
        GPolice=GPolice,
        GProcedural=GProcedural,
        GReality=GReality,
        GRomance=GRomance,
        GRomCom=GRomCom,
        GSciFi=GSciFi,
        GSoap=GSoap,
        GSports=GSports,
        GSpy=GSpy,
        GSuperhero=GSuperhero,
        GSurvival=GSurvival,
        GTalk=GTalk,
        GThriller=GThriller,
        GTravel=GTravel,
        GVariety=GVariety,
        GWestern=GWestern,
        link=link,
    )
)

In [10]:
# Write the table to disk.  The row index is written as well (pandas' default,
# spelled out here), which produces the unnamed first column of the CSV.
movies.to_csv("flixable.csv", index=True)