In [1]:
# Lib to fetch the HTML of a URL address
from urllib.request import urlopen
# Lib to search for and extract specific data from HTML or XML files
from bs4 import BeautifulSoup
# Lib to work with dataframes
import pandas as pd
# Lib to open a browser and thus fully open an infinite scrolling page
from selenium import webdriver
# Lib to use keyboard commands
from selenium.webdriver.common.keys import Keys
# lib to search for page elements
from selenium.webdriver.common.by import By
# Lib to delay the execution
import time
# Lib for regular expressions (used to find partial-string matches inside a string)
import re

In [2]:
# Series mapping each streaming service to the title text that tells
# scrolling_page() to stop: scrolling ends once the last 'card-title'
# element on the page shows this text.
end_points = pd.Series(
    data=[
        'To and From New York',
        'The Twilight Zone',
        'Imagination Movers',
        'Zapped',
        'Zoombies',
    ],
    index=[
        'netflix',
        'hulu',
        'disney-plus',
        'hbo-max',
        'amazon-prime-video',
    ],
)
end_points

In [3]:
# Rating labels recognised on a title page: a metadata span whose text is
# exactly one of these values is stored as the title's parental guideline.
parental_guidelines = [
    '7+', '13+', '18+', '16+',
    'ALL', 'ALL_AGES', 'AGES_18_',
    'G', 'NC-17', 'NR', 'NOT RATED',
    'PG', 'PG-13', 'R',
    'TV-NR', 'TV-PG', 'TV-14', 'TV-G',
    'TV-Y', 'TV-Y7-FV', 'TV-Y7', 'TV-MA',
    'UNRATED',
]

In [4]:
# Substrings that identify a metadata span as the duration field:
# a span containing either of them is stored as the title's duration.
duration_check = [
    'min',
    'Season',
]

In [5]:
# Function to fully scroll an infinite-scrolling page
def scrolling_page(streaming, pause=0.5, check_every=20, max_checks=None):
    """Open a flixable.com listing in Chrome, scroll until it is fully loaded, and parse it.

    The page is scrolled by repeatedly sending END then PAGE_UP to the body
    element. After every ``check_every`` scrolls, the text of the last
    'card-title' element is compared with ``end_points[streaming]``; scrolling
    stops when they match.

    Parameters
    ----------
    streaming : str
        Key of ``end_points``. 'netflix' is loaded from the site root, any
        other value is appended to the base URL as a path.
    pause : float, optional
        Seconds to wait after the initial load and after each END key press.
    check_every : int, optional
        Number of scrolls between two end-of-page checks.
    max_checks : int or None, optional
        Maximum number of end-of-page checks. ``None`` (the default) keeps
        scrolling until the end title is found. When the limit is reached,
        the page loaded so far is returned.

    Returns
    -------
    BeautifulSoup
        The browser's page source parsed with 'html.parser'.

    Raises
    ------
    KeyError
        If ``streaming`` is not a key of ``end_points``.
    """
    # Look the end title up first so an unknown service fails before a browser is launched.
    end_title = end_points[streaming]

    base = 'https://flixable.com/'
    # condition to deal with the different urls
    ad = '' if streaming == 'netflix' else streaming

    # open the browser
    driver = webdriver.Chrome()
    try:
        driver.get(base + ad)

        # delay to load the page
        time.sleep(pause)

        # get the body of the page
        element = driver.find_element(By.TAG_NAME, "body")

        scrolls = 0   # scrolls since the last end-of-page check
        checks = 0    # end-of-page checks performed so far

        # loop to fully open the page
        while True:
            element.send_keys(Keys.END)
            time.sleep(pause)
            # command to go up one time to avoid loading error
            element.send_keys(Keys.PAGE_UP)

            scrolls += 1
            if scrolls < check_every:
                continue
            scrolls = 0
            checks += 1

            # break condition check; an empty result means no card has loaded yet
            titles = driver.find_elements(By.CLASS_NAME, 'card-title')
            if titles and titles[-1].text == end_title:
                print('Page fully open')
                break
            if max_checks is not None and checks >= max_checks:
                print('Stopped scrolling before the end title was reached')
                break

        # the page source must be read while the browser is still open
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # always close the browser, including when scrolling raised
        driver.quit()

In [6]:
# Function to create a list of title links
def links(page=None):
    """Return the href of every <a> element whose href contains '/title'.

    Parameters
    ----------
    page : object with a ``find_all`` method, optional
        Parsed page to search. When omitted, the notebook-level ``soup``
        variable is used, as in the original zero-argument call. Passing the
        page explicitly avoids picking up a ``soup`` that another cell has
        since rebound to a different page.

    Returns
    -------
    list of str
        Matching href values in document order; duplicates are kept.
    """
    if page is None:
        # Backward-compatible default: fall back to the global parsed page.
        page = soup
    return [
        anchor["href"]
        for anchor in page.find_all("a", href=True)
        if '/title' in anchor["href"]
    ]

In [None]:
# Set this variable to the streaming service you want to scrape.
# It must be one of the keys of `end_points`:
# 'netflix', 'hulu', 'disney-plus', 'hbo-max' or 'amazon-prime-video'.
streaming = 'netflix'

In [1]:
# Helper that turns one parsed title page into a dict of fields
def _parse_title_page(page, streaming):
    """Extract the fields of one title page.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed HTML of a single title page.
    streaming : str
        Service being scraped; it decides the genre link prefix
        ('/genre/' for 'netflix', '/<streaming>/genre/' otherwise).

    Returns
    -------
    dict or None
        Keys, in order: title, release_year, parental_guidelines, duration,
        genre, date_added, average_rating, description. Fields that cannot be
        found keep their placeholder (0 for release_year, ' ' otherwise).
        ``None`` is returned when the page has no <h1 class="title">.
    """
    heading = page.find('h1', {'class': 'title'})
    if heading is None:
        return None

    card = {
        'title': heading.get_text(),
        'release_year': 0,
        'parental_guidelines': ' ',
        'duration': ' ',
    }

    # Year, parental guideline and duration all come from the 'mr-2' spans:
    # the first span is taken as the year, the others are recognised by content.
    for position, span in enumerate(page.find_all('span', {'class': 'mr-2'})):
        text = span.get_text()
        if position == 0:
            card['release_year'] = text
        if text in parental_guidelines:
            card['parental_guidelines'] = text
        if any(marker in text for marker in duration_check):
            card['duration'] = text

    # Genre slugs are read from the rendered anchors, then each slug is
    # resolved back to its anchor to get the displayed genre name.
    genre_slugs = re.findall(r'genre/(.*?)"', str(page.find_all('a', href=True)))
    if genre_slugs:
        prefix = '/genre/' if streaming == 'netflix' else '/' + streaming + '/genre/'
        card['genre'] = []
        for slug in genre_slugs:
            anchor = page.find('a', href=prefix + slug)
            if anchor is None:
                print('genre error')
                continue
            card['genre'].append(anchor.get_text())
    else:
        card['genre'] = ' '

    # date_added: the text after the first ':' of the first <p class="mb-2">
    try:
        card['date_added'] = page.find('p', {'class': 'mb-2'}).get_text().strip().split(':')[1]
    except (AttributeError, IndexError):
        card['date_added'] = ' '

    # average_rating: the part before '/' in the sixth child of the first <h6>
    try:
        card['average_rating'] = page.h6.contents[5].get_text().split('/')[0]
    except (AttributeError, IndexError):
        card['average_rating'] = ' '

    # description: the first <p class="card-description">
    blurb = page.find('p', {'class': 'card-description'})
    card['description'] = blurb.get_text().strip() if blurb is not None else ' '

    return card


# Running the scrolling function with the streaming chosen
soup = scrolling_page(streaming)

# Running the links function
list_links = links()

# Create the list variable to place dataset of each title
cards = []

# Create a list for the ids of possible errors
# (indices of list_links that could not be downloaded or parsed)
errors = []

# Create the string base for the links
base = 'https://flixable.com'

# Loop to open each link; only every second entry of list_links is visited
for i in range(0, len(list_links), 2):

    url = base + list_links[i]
    try:
        # The context manager closes the connection; the timeout stops one
        # stalled request from hanging the whole scrape.
        with urlopen(url, timeout=30) as response:
            html = response.read().decode('utf-8')
    except Exception:
        # Any download or decoding failure is recorded and skipped.
        # KeyboardInterrupt is not caught, so the loop can still be stopped by hand.
        errors.append(i)
        continue

    # Parse into a separate name so `soup` keeps referring to the listing page
    title_soup = BeautifulSoup(html, 'html.parser')
    card = _parse_title_page(title_soup, streaming)
    if card is None:
        # Page without a title heading: record it instead of aborting the scrape
        errors.append(i)
        continue

    # Join the link dataset to the list
    cards.append(card)

    # Completion percentage indicator
    print(round(i/len(list_links) * 100, 2), '%| indice:', i, '/', len(list_links))

In [13]:
# Create dataframe and save as a .csv file.
# Guarded by the scraped service, so running every cell of the notebook cannot
# write another service's `cards` under the Disney+ file name.
if streaming == 'disney-plus':
    df_disney_plus = pd.DataFrame(cards)
    df_disney_plus.to_csv('df_disney_plus.csv', sep=';', index = False, encoding = 'utf-8-sig')
else:
    print('Skipped df_disney_plus.csv: streaming is', repr(streaming))

In [25]:
# Create dataframe and save as a .csv file.
# Guarded by the scraped service, so running every cell of the notebook cannot
# write another service's `cards` under the Hulu file name.
if streaming == 'hulu':
    df_hulu = pd.DataFrame(cards)
    df_hulu.to_csv('df_hulu.csv', sep=';', index = False, encoding = 'utf-8-sig')
else:
    print('Skipped df_hulu.csv: streaming is', repr(streaming))

In [31]:
# Create dataframe and save as a .csv file.
# Guarded by the scraped service, so running every cell of the notebook cannot
# write another service's `cards` under the HBO Max file name.
if streaming == 'hbo-max':
    df_hbo_max = pd.DataFrame(cards)
    df_hbo_max.to_csv('df_hbo_max.csv', sep=';', index = False, encoding = 'utf-8-sig')
else:
    print('Skipped df_hbo_max.csv: streaming is', repr(streaming))

In [36]:
# Create dataframe and save as a .csv file.
# Guarded by the scraped service, so running every cell of the notebook cannot
# write another service's `cards` under the Prime Video file name.
if streaming == 'amazon-prime-video':
    df_prime = pd.DataFrame(cards)
    df_prime.to_csv('df_prime.csv', sep=';', index = False, encoding = 'utf-8-sig')
else:
    print('Skipped df_prime.csv: streaming is', repr(streaming))

In [45]:
# Create dataframe and save as a .csv file.
# Guarded by the scraped service, so running every cell of the notebook cannot
# write another service's `cards` under the Netflix file name.
if streaming == 'netflix':
    df_netflix = pd.DataFrame(cards)
    df_netflix.to_csv('df_netflix.csv', sep=';', index = False, encoding = 'utf-8-sig')
else:
    print('Skipped df_netflix.csv: streaming is', repr(streaming))