In [155]:
import requests
import bs4
import pandas as pd
from tqdm import tqdm

def find_element(element, block, class_type, name):
    """Return the cleaned text of the first matching child of ``element``.

    The child is looked up with ``element.find(block, {class_type: name})``.
    Newlines and every pair of consecutive spaces are removed from its text.

    Parameters
    ----------
    element : object exposing ``find`` (e.g. a BeautifulSoup tag)
    block : tag name to search for (``'div'``, ``'h3'``, ...)
    class_type : attribute name used as the filter key (``'class'`` here)
    name : attribute value to match

    Returns
    -------
    str
        The cleaned text, or ``''`` when any step of the lookup raises
        ``AttributeError`` (for instance when ``find`` returns ``None``).
    """
    try:
        node = element.find(block, {class_type: name})
        text = node.getText()
        # Strip layout noise: line breaks first, then runs of double spaces.
        for noise in ('\n', '  '):
            text = text.replace(noise, '')
    except AttributeError:
        return ''
    return text

# Text stored when a programme has no detail link or no synopsis paragraph.
_NO_DESCRIPTION = 'Aucune description'

# Output column -> (tag, CSS class) of the card child holding that value.
# 'new', 'live' and 'rebroadcast' are broadcast-status badges, not durations.
_CARD_FIELDS = (
    ('heure', 'div', 'mainBroadcastCard-startingHour'),
    ('titre', 'h3', 'mainBroadcastCard-title'),
    ('sous_titre', 'div', 'mainBroadcastCard-subtitle'),
    ('type', 'div', 'mainBroadcastCard-type'),
    ('duree', 'span', 'mainBroadcastCard-durationContent'),
    ('new', 'div', 'mainBroadcastCard-new'),
    ('live', 'div', 'mainBroadcastCard-live'),
    ('rebroadcast', 'div', 'mainBroadcastCard-rebroadcast'),
)


def _description_url(card):
    """Return the href of the last non-empty link in the card title, or None."""
    title = card.find('h3', {'class': 'mainBroadcastCard-title'})
    if title is None:
        return None
    desc_url = None
    for link in title.find_all('a', href=True):
        if link.text:
            desc_url = link['href']
    return desc_url


def _fetch_description(desc_url, timeout):
    """Download a programme detail page and return its synopsis text."""
    if not desc_url:
        return _NO_DESCRIPTION
    response = requests.get(desc_url, timeout=timeout)
    soup_desc = bs4.BeautifulSoup(response.text, 'html.parser')
    # attrs must be a dict; the previous code passed a set literal here.
    synopsis = soup_desc.find('p', {'class': 'synopsis-twoPart resume'})
    if synopsis is None:
        return _NO_DESCRIPTION
    return synopsis.getText()


def get_content(soup, slots_per_channel=2, timeout=10):
    """Build the evening schedule table from a parsed listing page.

    Every ``div.mainBroadcastCard-infos`` card becomes one row. For each card
    the linked detail page is downloaded to read the synopsis, so this function
    performs one HTTP request per card that has a title link.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed listing page.
    slots_per_channel : int, default 2
        Number of consecutive cards that belong to each channel header
        (the page is expected to show this many evening slots per channel).
    timeout : float, default 10
        Timeout in seconds for each detail-page request.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``chaines`` with columns ``heure``, ``titre``,
        ``sous_titre``, ``type``, ``duree``, ``description`` and ``diffusion``
        (the concatenation of the new/live/rebroadcast badges).

    Raises
    ------
    ValueError
        If the number of cards does not equal
        ``slots_per_channel * number of channel headers``.
    requests.RequestException
        If a detail page cannot be downloaded.
    """
    records = []
    for card in tqdm(soup.find_all('div', {'class': 'mainBroadcastCard-infos'})):
        info = {
            column: find_element(card, tag, 'class', css_class)
            for column, tag, css_class in _CARD_FIELDS
        }
        # The URL is resolved per card, so a card without a link can never
        # inherit the description of the previous card.
        info['description'] = _fetch_description(_description_url(card), timeout)
        records.append(info)

    ### Find and clean channels
    chaines = []
    for header in soup.find_all('h2', {'class': 'homeGrid-cardsChannelName'}):
        full_txt = header.getText().replace('\n', '').replace('  ', '')
        # Screen-reader-only prefix inside the header; '' when absent.
        sr_only = find_element(header, 'span', 'class', 'sr-only')
        channel = full_txt.replace(sr_only, '') if sr_only else full_txt
        chaines.extend([channel] * slots_per_channel)

    if len(chaines) != len(records):
        raise ValueError(
            'Found %d broadcast cards but %d channel slots (%d per channel); '
            'the page layout may have changed.'
            % (len(records), len(chaines), slots_per_channel)
        )

    ### create and clean dataframe
    # Explicit columns keep the frame well-formed even when no card was found.
    columns = [column for column, _, _ in _CARD_FIELDS] + ['description']
    df = pd.DataFrame(records, columns=columns)

    # append chaines
    df['chaines'] = chaines
    df = df.set_index('chaines')

    # merge broadcast columns
    df['diffusion'] = df['new'] + df['live'] + df['rebroadcast']
    return df.drop(columns=['new', 'live', 'rebroadcast'])

    
# Download and parse the listing home page, then build the schedule table.
url = 'https://www.programme-tv.net/'
page = requests.get(url, timeout=10)  # never hang forever on a stalled connection
page.raise_for_status()  # fail loudly instead of parsing an HTTP error page
soup = bs4.BeautifulSoup(page.text,'html.parser')

df = get_content(soup)

100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [00:13<00:00,  3.33it/s]


Unnamed: 0_level_0,heure,titre,sous_titre,type,duree,description,diffusion
chaines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TF1,21h05,The Resident,Ambiance cabaret,Série TV,50min,\nUn imitateur de Dolly Parton perd connaissan...,Inédit
TF1,22h50,Chicago Med,Sang pour cent,Série TV,45min,\nLes médecins prennent en charge un policier ...,Inédit
France 2,21h05,L'école de la vie,Zoé,Série TV,53min,\nVivant comme une honte la précarité de sa fa...,Inédit
France 2,22h55,6 à la maison,,Culture Infos,1h,"\nArtistes, humoristes ou encore intellectuels...",Direct
France 3,21h05,La carte aux trésors,L'Allier,Autre,2h05min,"\nLes candidats, Bérénice et Kévin, vont devoi...",
France 3,23h15,"Mireille Darc, la femme libre",,Culture Infos,1h55min,"\nLe 28 août 2017, Mireille Darc disparaissait...",
Canal+,21h05,The Big Ugly,,Cinéma,1h44min,\nEscortés par leurs compagnes Fiona et Jackie...,Inédit
Canal+,22h49,Escape from Pretoria,,Cinéma,1h43min,"\nMilitants anti-apartheid blancs, Tim Jenkins...",Rediffusion
France 5,20h50,La grande librairie,,Culture Infos,1h30min,"\nÉmission phare de la littérature, ""La Grande...",Direct
France 5,22h25,C ce soir,,Autre,1h05min,"\nChaque soir, Karim Rissouli, aux côtés Laure...",


In [156]:
df.loc['TF1']

Unnamed: 0_level_0,heure,titre,sous_titre,type,duree,description,diffusion
chaines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TF1,21h05,The Resident,Ambiance cabaret,Série TV,50min,\nUn imitateur de Dolly Parton perd connaissan...,Inédit
TF1,22h50,Chicago Med,Sang pour cent,Série TV,45min,\nLes médecins prennent en charge un policier ...,Inédit
