# Project Overview
Data will be scraped from www.ted.com, the official TED website.

TED website is a popular online repository of audiovisual recordings of speakers from diverse disciplines from all over the world who are generally given a maximum of 18 minutes to present their ideas.

The data to be scraped includes the speaker name, talk title, when the talk was published, talk duration, talk URL, number of views per talk, number of likes per talk, TED event, and the speaker occupation.

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import logging
logging.getLogger('WDM').setLevel(logging.NOTSET)

In [2]:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.ted.com/talks?page=1")
time.sleep(3)
html_source = driver.page_source
soup = BeautifulSoup(html_source,'lxml')

# Speaker Name

In [3]:
page = 1
names = []
while page != 159:
    driver.get(f"https://www.ted.com/talks?page={page}")
    driver.implicitly_wait(3)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    
    for h4 in soup.find_all("h4", class_="h12 talk-link__speaker"):
        names.append(h4.get_text(strip=True))
    page = page + 1

In [4]:
names[:5]

['Henri Picciotto',
 'Andrew Ng',
 'Melodie Yashar',
 'Nicholas Leeper',
 'Hei Man Chan']

In [5]:
len(names)

5687

# Talk Title

In [6]:
page = 1
pst = []
while page != 159:
    driver.get(f"https://www.ted.com/talks?page={page}")
    driver.implicitly_wait(3)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    
    for a in soup.find_all('a', class_ = 'ga-link', attrs = {'data-ga-context': 'talks'}):
        pst.append(a.get_text(strip=True))
        
    page= page + 1

titles = []   
for i in pst[1::2]: # This for loop removes the time string appended to the titles
    titles.append(i)    

In [7]:
titles[:6]

['Can you solve the alien pyramid riddle?',
 'How AI could empower any business',
 'How to build for human life on Mars',
 'Is there a link between cancer and heart disease?',
 'Is it possible to lose weight fast?',
 "Is the pandemic actually over? It's complicated"]

In [8]:
len(titles)

5687

# Month/Year Posted

In [9]:
page = 1
posted = []
while page != 159:
    driver.get(f"https://www.ted.com/talks?page={page}")
    driver.implicitly_wait(3)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    
    for span in soup.find_all("span", class_ = 'meta__val'):
        posted.append(span.get_text(strip=True))
        
    page= page + 1

In [10]:
posted[:6]

['Sep 2022', 'Sep 2022', 'Sep 2022', 'Sep 2022', 'Sep 2022', 'Sep 2022']

In [11]:
len(posted)

5687

# Talk Duration

In [12]:
page = 1
duration = []
while page != 159:
    driver.get(f"https://www.ted.com/talks?page={page}")
    driver.implicitly_wait(3)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    
    for span in soup.find_all("span", class_ = 'thumb__duration'):
        duration.append(span.get_text(strip=True))
        
    page= page + 1 

In [13]:
duration[:8]

['4:54', '11:13', '10:37', '15:16', '4:37', '28:25', '17:05', '5:23']

In [14]:
len(duration)

5687

# URLs

In [15]:
page = 1
links = []
while page != 159:
    driver.get(f"https://www.ted.com/talks?page={page}")
    driver.implicitly_wait(3)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    
    for a_href in soup.find_all('a', class_ = 'ga-link', attrs = {'data-ga-context': 'talks'}):
            links.append('https://www.ted.com' + a_href["href"]) # Using first part of link and joining it with the href part
        
    page= page + 1
    
# Result had duplicate links, so this for loop fixed that.    
links2 = []
for link in links:
    if link not in links2:
        links2.append(link) # To deal with duplicate links

In [16]:
links2[:6]

['https://www.ted.com/talks/henri_picciotto_can_you_solve_the_alien_pyramid_riddle',
 'https://www.ted.com/talks/andrew_ng_how_ai_could_empower_any_business',
 'https://www.ted.com/talks/melodie_yashar_how_to_build_for_human_life_on_mars',
 'https://www.ted.com/talks/nicholas_leeper_is_there_a_link_between_cancer_and_heart_disease',
 'https://www.ted.com/talks/hei_man_chan_is_it_possible_to_lose_weight_fast',
 'https://www.ted.com/talks/anthony_fauci_is_the_pandemic_actually_over_it_s_complicated']

In [17]:
len(links2)

5687

# Number of Views / Event Name

In [18]:
# Views are interlinked to individual web pages, so URLs will be used to extract data from those specific pages.


views_no = []   
for url in links2:
    driver.get(url)
    driver.implicitly_wait(3)
    soup2 = BeautifulSoup(driver.page_source,"lxml")
    
    for d in soup2.find_all("div", class_ = "text-sm w-full truncate text-gray-900", attrs = {'data-testid': 'talk-meta'}):
        views_no.append(d.get_text(strip=True))
        

In [19]:
views_no[:7]

['Henri Picciotto • TED-Ed',
 '200,685 views | Andrew Ng • TED2022',
 '292,542 views | Melodie Yashar • TED2022',
 '431,971 views | Nicholas Leeper • TEDxVienna',
 '380,671 views | Hei Man Chan • TED-Ed',
 '445,255 views | Anthony Fauci • TED Membership',
 '414,705 views | Majora Carter • TED2022']

In [20]:
len(views_no)

5687

In [21]:
# Iterating through the views_no list to extract the views and the event name

views = [item.split(" views")[0] for item in views_no]

event_name = [item.split("• ")[-1] for item in views_no]

In [23]:
len(views)

5687

In [24]:
len(event_name)

5687

# Number of Likes

In [25]:
# Likes are interlinked to individual web pages, so URLs will be used to extract data from those specific pages.


likes_no = []   
for url in links2:
    driver.get(url)
    driver.implicitly_wait(3)
    soup2 = BeautifulSoup(driver.page_source,"lxml")
    
    for i in soup2.find_all("div", class_ = "transition-opacity duration-300 inline-flex items-center opacity-100"):
        spans = i.find_all("span")
        
        for span in spans:
            n = span.text
            if n != '':
                likes_no.append(n)
                
likes_no2 = [] # This removes duplicate likes. The above for loops duplicated the likes for each speaker.
for i in likes_no[ :  : 2]:
    likes_no2.append(i)  

In [26]:
likes_no2[:6]

['\xa0(35)', '\xa0(7.1K)', '\xa0(9.1K)', '\xa0(13K)', '\xa0(11K)', '\xa0(13K)']

In [27]:
# Editing likes_no and removing all the unwanted characters 
f = []
for i in likes_no2:
    f.append(i.strip(')'))
    likes = []
for i in f:
    likes.append(i.strip("\xa0("))        

In [28]:
len(likes)

5684

In [29]:
likes[:8]

['35', '7.1K', '9.1K', '13K', '11K', '13K', '12K', '4.7K']

# Creating The Dataframe

In [30]:
ted_dict = { 
    
    'Speaker': names,
    'Talk_title': titles,
    'When_posted': posted,
    'Talk_duration': duration,
    'Views_number': views,
    'Event_name': event_name,
    'Link': links2
}    

In [31]:
ted_df1 = pd.DataFrame(ted_dict)
ted_df1

Unnamed: 0,Speaker,Talk_title,When_posted,Talk_duration,Views_number,Event_name,Link
0,Henri Picciotto,Can you solve the alien pyramid riddle?,Sep 2022,4:54,Henri Picciotto • TED-Ed,TED-Ed,https://www.ted.com/talks/henri_picciotto_can_...
1,Andrew Ng,How AI could empower any business,Sep 2022,11:13,200685,TED2022,https://www.ted.com/talks/andrew_ng_how_ai_cou...
2,Melodie Yashar,How to build for human life on Mars,Sep 2022,10:37,292542,TED2022,https://www.ted.com/talks/melodie_yashar_how_t...
3,Nicholas Leeper,Is there a link between cancer and heart disease?,Sep 2022,15:16,431971,TEDxVienna,https://www.ted.com/talks/nicholas_leeper_is_t...
4,Hei Man Chan,Is it possible to lose weight fast?,Sep 2022,4:37,380671,TED-Ed,https://www.ted.com/talks/hei_man_chan_is_it_p...
...,...,...,...,...,...,...,...
5682,Hans Rosling,The best stats you've ever seen,Jun 2006,19:50,15420594,TED2006,https://www.ted.com/talks/hans_rosling_the_bes...
5683,Sir Ken Robinson,Do schools kill creativity?,Jun 2006,19:24,73839997,TED2006,https://www.ted.com/talks/sir_ken_robinson_do_...
5684,Majora Carter,Greening the ghetto,Jun 2006,18:36,3002749,TED2006,https://www.ted.com/talks/majora_carter_greeni...
5685,David Pogue,Simplicity sells,Jun 2006,21:26,2011584,TED2006,https://www.ted.com/talks/david_pogue_simplici...


# Speaker Occupations

# Names_two

In [32]:
page = 1
names2 = [] 
while page != 120:
    driver.get(f"https://www.ted.com/speakers?page={page}")
    driver.implicitly_wait(3)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
     
    for i in soup.find_all("h4",  class_= "h7 m5"):
        names2.append(i.get_text(separator=" ").strip()) #Separator used because there was no space between names.
           
    page = page + 1           

In [33]:
names2[:5]

['Jennifer Aaker',
 'Sandra Aamodt',
 'Morra Aarons-Mele',
 'Trevor Aaronson',
 'Chris Abani']

In [34]:
len(names2)

3562

# Occupation

In [35]:
page = 1
occupation = []
while page != 120:
    driver.get(f"https://www.ted.com/speakers?page={page}")
    driver.implicitly_wait(3)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    
    for i in soup.find_all("p", class_="p4"):
        occupation.append(i.get_text(strip=True))
    page = page + 1   

In [36]:
occupation[:5]

['Behavioral scientist',
 'Neuroscientist and science writer',
 'Anxious overachiever',
 'Journalist',
 'Novelist, poet']

In [37]:
len(occupation)

3562

# Occupations Dataframe

In [38]:
ted_dict2 = { 
    
    'Speaker': names2,
    'Occupation': occupation
     
}

In [39]:
ted_df2 = pd.DataFrame(ted_dict2)
ted_df2

Unnamed: 0,Speaker,Occupation
0,Jennifer Aaker,Behavioral scientist
1,Sandra Aamodt,Neuroscientist and science writer
2,Morra Aarons-Mele,Anxious overachiever
3,Trevor Aaronson,Journalist
4,Chris Abani,"Novelist, poet"
...,...,...
3557,Ethan Zuckerman,"Blogger, digital visionary"
3558,Marlene Zuk,Evolutionary biologist
3559,Mikhail Zygar,"Journalist, writer, filmmaker"
3560,Karin Öberg,Space chemist
