In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
class Scraper:
    """Drive a headless Chrome session to collect show cards from a listing page.

    The instance owns a Selenium WebDriver (``self.driver``); call
    ``quitDriver`` when finished so the browser process is released.
    """

    def __init__(self, url):
        """Start headless Chrome and remember the page to visit.

        Args:
            url: Address loaded later by ``access_website``.
        """
        self.chrome_options = webdriver.ChromeOptions()
        for flag in (
            '--no-sandbox',
            '--window-size=1420,1080',
            '--headless',
            '--disable-gpu',
            '--disable-dev-shm-usage',
        ):
            self.chrome_options.add_argument(flag)
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.url = url

    def access_website(self):
        """Navigate the browser to ``self.url``."""
        self.driver.get(self.url)

    def click_on_by_text(self, text):
        """Click the first link whose visible text equals ``text``."""
        # find_element(By.…) replaces the find_element_by_* helpers, which
        # newer Selenium releases no longer provide.
        self.driver.find_element(By.LINK_TEXT, text).click()

    def click_on_by_xpath(self, xpath):
        """Click the first element matched by the XPath expression ``xpath``."""
        self.driver.find_element(By.XPATH, xpath).click()

    def accept_cookies(self):
        """Click the button whose text is exactly 'Accept'."""
        self.driver.find_element(By.XPATH, "//button[text()='Accept']").click()

    @staticmethod
    def _parse_card(text, category):
        """Turn one card's text into a result row, or ``None`` if it does not fit.

        The expected layout is ``<name>\\n<country>•<rating>(<votes> ...``; any
        card missing one of those separators is rejected.
        """
        if not text:
            return None
        lines = text.split('\n')
        if len(lines) < 2:
            return None
        details = lines[1].split('•')
        if len(details) < 2:
            return None
        rating_and_votes = details[1].split('(')
        if len(rating_and_votes) < 2:
            return None
        return {
            'Type': category,
            'Nom': lines[0],
            'Pays': details[0],
            'Note': rating_and_votes[0],
            'Nb_votes': rating_and_votes[1].split(' ')[0],
        }

    def getShow(self, results, category):
        """Append one row per parsable card on the current page to ``results``.

        Only the first element with class ``explore-results`` is inspected;
        its ``col-inline`` children are treated as cards.

        Args:
            results: List that receives the row dicts (mutated in place).
            category: Value stored under the 'Type' key of every row.

        Returns:
            The same ``results`` list. It is returned unchanged when the page
            has no ``explore-results`` element.
        """
        containers = self.driver.find_elements(By.CLASS_NAME, "explore-results")
        if not containers:
            # Nothing rendered on this page: avoid indexing an empty list.
            return results
        for card in containers[0].find_elements(By.CLASS_NAME, 'col-inline'):
            try:
                card_text = card.text
            except Exception:
                # Best-effort scrape: a card that can no longer be read
                # (e.g. the DOM changed underneath us) is skipped.
                continue
            row = self._parse_card(card_text, category)
            if row is not None:
                results.append(row)
        return results

    def getAllShows(self, pages, category, timeout=20):
        """Scrape up to ``pages`` consecutive result pages.

        After each page the 'Suivant →' link is clicked. If the page cannot be
        scraped or the link does not become clickable within ``timeout``
        seconds, pagination stops and the rows gathered so far are returned,
        rather than re-reading the same page and producing duplicate rows.

        Args:
            pages: Maximum number of pages to read.
            category: Value stored under the 'Type' key of every row.
            timeout: Seconds to wait for the next-page link (default 20).

        Returns:
            List of row dicts with keys Type, Nom, Pays, Note, Nb_votes.
        """
        results = []
        for _ in range(pages):
            try:
                self.getShow(results, category)
                next_link = WebDriverWait(self.driver, timeout).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[text()='Suivant →']"))
                )
                next_link.click()
            except Exception:
                # No further page (or the browser is unusable): stop here and
                # keep the partial results.
                break
        return results

    def quitDriver(self):
        """Close the browser and end the WebDriver session."""
        self.driver.quit()

In [3]:
# Start the headless browser; the target is Viki's home page with the French locale.
chrome = Scraper('https://www.viki.com/?locale=fr')

In [4]:
# Load the URL given to the constructor.
chrome.access_website()

In [5]:
# Click the 'Accept' button (the cookie prompt, going by the method name).
chrome.accept_cookies()

In [6]:
# Follow the 'Explorer' link.
chrome.click_on_by_text('Explorer')

In [7]:
# Follow the 'Tous les spectacles' link.
chrome.click_on_by_text('Tous les spectacles')

In [8]:
# Click the 'Tous les types' control — presumably opens the type filter; confirm on the live page.
chrome.click_on_by_xpath("//span[text()='Tous les types']")

In [9]:
# Choose the 'Télévision' entry.
chrome.click_on_by_xpath("//div[text()='Télévision']")

In [10]:
# Scrape up to 50 result pages, tagging every row with Type='Série'.
series = chrome.getAllShows(50,'Série')

In [11]:
# Number of rows collected for series.
len(series)

597

In [12]:
# Click the control that now reads 'Télévision' — presumably re-opens the type filter; confirm on the live page.
chrome.click_on_by_xpath("//span[text()='Télévision']")

In [13]:
# Choose the 'Films' entry.
chrome.click_on_by_xpath("//div[text()='Films']")

In [14]:
# Scrape up to 3 result pages, tagging every row with Type='Film'.
movies = chrome.getAllShows(3,'Film')

In [15]:
# Number of rows collected for films.
len(movies)

49

In [16]:
# Scraping is finished: close the browser session.
chrome.quitDriver()

In [17]:
# One combined list of row dicts, series first, then films.
shows = series + movies

In [18]:
# NOTE(review): notebook imports are conventionally grouped in the first cell.
import pandas as pd

In [19]:
# Build the table from the list of row dicts (one column per dict key).
df_shows = pd.DataFrame(shows)

In [20]:
# Preview the first rows.
df_shows.head()

Unnamed: 0,Type,Nom,Pays,Note,Nb_votes
0,Série,The Penthouse,Corée,9.2,24788
1,Série,À L'ANTENNE Tu es mon héros,Chine Continentale,9.5,4595
2,Série,À L'ANTENNE The Sweet Blood,Corée,8.9,1916
3,Série,True Beauty,Corée,9.6,143241
4,Série,À L'ANTENNE The Penthouse 2 (Le Penthouse 2),Corée,9.5,13030


In [21]:
# Preview the last rows of the combined table.
df_shows.tail()

Unnamed: 0,Type,Nom,Pays,Note,Nb_votes
641,Film,Green Snake,Chine Continentale,7.0,1059
642,Film,Give Seven Days,Chine Continentale,8.5,1196
643,Film,"Mon patron, mon héros",Corée,8.3,855
644,Film,La trompette de la falaise,Japon,8.7,1433
645,Film,Science et sensibilité,Chine Continentale,8.8,912


In [22]:
# Save the table to viki_shows.csv; index=False leaves the row index out of the file.
df_shows.to_csv('viki_shows.csv',index=False)