# Step 1. Import the Library

In [26]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Step 2. Access the HTML content from the webpage by assigning the URL and creating a soup object

In [45]:
# Downloading cna data
headers = {'Accept-Language': 'en-US,en;q=0.8'}
url = 'https://ed.ted.com/lessons?direction=desc&sort=featured-position&user_by_click=student'
response = requests.get(url,headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Step 3. Extract the File

In [50]:
lesson = soup.select('h2.text-gray-700')

print(lesson[0])
print(" ")
print("how many lessons are available? ", len(soup.select('h2.text-gray-700')))

<h2 class="my-1.5 font-bold text-lg text-gray-700">
<a class="text-gray-700 hover:text-gray-700" data-turbo-frame="_top" href="/lessons/should-we-get-rid-of-standardized-testing-arlo-kempf">Should we get rid of standardized testing?</a>
</h2>
 
how many lessons are available?  24


In [51]:
text = soup.select('h2 a')
print(text)

[<a class="text-gray-700 hover:text-gray-700" data-turbo-frame="_top" href="/lessons/should-we-get-rid-of-standardized-testing-arlo-kempf">Should we get rid of standardized testing?</a>, <a class="text-gray-700 hover:text-gray-700" data-turbo-frame="_top" href="/lessons/how-schools-can-nurture-every-student-s-genius-trish-millines-dziko">How schools can nurture every student's genius - Trish Millines Dziko</a>, <a class="text-gray-700 hover:text-gray-700" data-turbo-frame="_top" href="/best_of_web/ltryN5j7">Why perfect grades don't matter</a>, <a class="text-gray-700 hover:text-gray-700" data-turbo-frame="_top" href="/lessons/why-are-there-so-many-types-of-apples-theresa-doud">Why are there so many types of apples?</a>, <a class="text-gray-700 hover:text-gray-700" data-turbo-frame="_top" href="/best_of_web/wsuJRkkK">Why we should draw more (and photograph less)</a>, <a class="text-gray-700 hover:text-gray-700" data-turbo-frame="_top" href="/lessons/why-should-you-read-flannery-o-connor

In [53]:
lesson_links = []

for l in soup.select('h2 a'):
  lesson_links.append("https://ed.ted.com/" + l.attrs.get('href'))

print(lesson_links[0])

https://ed.ted.com//lessons/should-we-get-rid-of-standardized-testing-arlo-kempf


# Step 4. Looping the process and storing the data

In [79]:
# create lesson title and lesson link list
lesson_title = []
lesson_link = []

for a in soup.select('h2 a'):
  # extract the lesson title text and the lesson link
  lesson_title.append(a.get_text())
  lesson_link.append("https://ed.ted.com/" + a.attrs.get('href'))

print(lesson_title)
print(lesson_link)

['Should we get rid of standardized testing?', "How schools can nurture every student's genius - Trish Millines Dziko", "Why perfect grades don't matter", 'Why are there so many types of apples?', 'Why we should draw more (and photograph less)', 'Why should you read Flannery O’Connor?', 'What is dyslexia?', '3 rules to spark learning - Ramsey Musallam', 'How can we support the emotional well-being of teachers? - Sydney Jensen', 'Myths you learned in health class', 'The dark history of IQ tests', '5 tips to improve your critical thinking', 'What’s the smartest age?', "A Parkland teacher's homework for us all - Diane Wolk-Rogers", 'How (and why) to read William Faulkner', 'How well do masks work?', 'How playing sports benefits your body ... and your brain', "The benefits of a good night's sleep", 'Am I Really A Visual Learner?', 'How to teach kids to talk about taboo topics - Liz Kleinrock', 'How to design a library that makes kids want to read - Michael Bierut', 'The benefits of good po

In [80]:
# create lesson category list
lesson_category = []

for c in soup.select('a.text-secondary-700'):
  # extract the lesson category text
  lesson_category.append(c.get_text())

print(lesson_category)

['Education Policy', 'Teaching & Education', 'Teaching & Education', 'Science & Technology', 'Teaching & Education', 'Literature & Language', 'Medical Conditions', 'Teaching & Education', 'Teaching & Education', 'Health', 'History', 'Thinking & Learning', 'Thinking & Learning', 'Public Health', 'Literature & Language', 'Public Health', 'Physical Fitness', 'Health', 'Thinking & Learning', 'Teaching & Education', 'Design, Engineering & Technology', 'Health', 'Health', 'Numbers & Operations']


In [81]:
# create lesson duration list
lesson_duration = []

for d in soup.select('p.sr-only'):
  # extract the lesson duration text, replace "Lesson duration " string with "", and clean the duration by removing extra spaces and newline characters
  duration = d.get_text()
  duration_only = duration.replace("Lesson duration ", "")
  cleaned_duration = ' '.join(duration_only.split())

  lesson_duration.append(cleaned_duration)

print(lesson_duration)

['05:41', '15:41', '04:58', '04:28', '02:53', '04:12', '04:35', '06:30', '11:32', '05:03', '06:10', '04:30', '04:53', '15:47', '04:41', '08:21', '03:47', '05:45', '02:38', '12:02', '12:27', '04:27', '02:43', '04:30']


In [82]:
# create lesson image link list
lesson_image_link = []

for i in soup.select('img.absolute'):
  # extract the lesson image link
  lesson_image_link.append(i.attrs.get('src'))

print(lesson_image_link)

['https://i.ytimg.com/vi/YtE0OsRWeYI/0.jpg', 'https://i.ytimg.com/vi/SUDMIA23_5s/0.jpg', 'https://i.ytimg.com/vi/KShfEMy8UZQ/0.jpg', 'https://i.ytimg.com/vi/mQePz62zkqA/0.jpg', 'https://i.ytimg.com/vi/k1eHm0PNnjo/0.jpg', 'https://i.ytimg.com/vi/9QVsGWsk7TU/0.jpg', 'https://i.ytimg.com/vi/zafiGBrFkRM/0.jpg', 'https://i.ytimg.com/vi/YsYHqfk0X2A/0.jpg', 'https://i.ytimg.com/vi/OfCLTQhW9GQ/0.jpg', 'https://i.ytimg.com/vi/pkzunP1s6cY/0.jpg', 'https://i.ytimg.com/vi/W2bKaw2AJxs/0.jpg', 'https://i.ytimg.com/vi/dItUGF8GdTw/0.jpg', 'https://i.ytimg.com/vi/sbCvQbBi2G8/0.jpg', 'https://i.ytimg.com/vi/kuT7zWZEwl0/0.jpg', 'https://i.ytimg.com/vi/wYQWh8VGL3M/0.jpg', 'https://i.ytimg.com/vi/0Tp0zB904Mc/0.jpg', 'https://i.ytimg.com/vi/hmFQqjMF_f0/0.jpg', 'https://i.ytimg.com/vi/gedoSfZvBgE/0.jpg', 'https://i.ytimg.com/vi/V-S_53HmEUA/0.jpg', 'https://i.ytimg.com/vi/G9-urSR19SI/0.jpg', 'https://i.ytimg.com/vi/YsA_JTeHJ6A/0.jpg', 'https://i.ytimg.com/vi/OyK0oE5rwFY/0.jpg', 'https://i.ytimg.com/vi/KJRzgl0

In [83]:
# create lesson viewers list
lesson_viewers = []

for v in soup.select('p.text-gray-700'):
  # extract the lesson view text, replace " Views" with "", and clean the lesson view by removing newline charaacters
  viewers = v.get_text()
  viewers_only = viewers.replace(" Views", "")
  cleaned_viewers = ' '.join(viewers_only.split())

  lesson_viewers.append(cleaned_viewers)

print(lesson_view)

['1,138,424', '68,558', '4,331,796', '1,024,319', '366,667', '365,126', '6,329,977', '375,073', '144,495', '516,267', '2,134,707', '9,714,219', '1,180,797', '66,820', '166,494', '2,142,401', '2,038,430', '6,201,353', '294,802', '136,983', '449,132', '13,338,526', '1,823,375', '2,298,571']


In [84]:
# create lesson description list
lesson_description = []

# loop through the 'lesson_link' link, where each link redirects to a lesson
for link in lesson_link:
  # re-send the HTTP Get requests and parse the HTML content inside the article
  response_link = requests.get(link, headers=headers)
  soup_link = BeautifulSoup(response_link.text, "html.parser")

  description = soup_link.select('.lessonDescription')

  # check if description is found on the page
  if description:
    # extract the description text and clean the source by removing extra spaces and newline characters
    description_text = description[0].get_text()
    cleaned_description = ' '.join(description_text.split())

    lesson_description.append(cleaned_description)

print(lesson_description)

['Although standardized testing is a particularly hot topic in education right now, this approach to measurement has been in use for two millennia. And while the results of standardized testing can help us understand some things, they can also be misleading if used incorrectly. So what do these tests actually measure? And are they worthwhile? Arlo Kempf investigates.', 'Forget home economics and standardized tests, education visionary Trish Millines Dziko has a much more engaging and fulfilling way for students to develop real-world skills. Get schooled by Dziko as she shares how project-based learning can transform public education and unlock genius for the next generation of critical thinkers, problem solvers, ideators and leaders.', 'Most American students strive for a 4.0 GPA and the highest test scores, but research shows that this quest for perfection actually discourages creativity and reduces academic risk-taking. In this episode of “School Myths” by The Atlantic, Alice Roth in

In [85]:
# create video lesson link list
video_lesson_link = []

# loop through the 'lesson_link' link, where each link redirects to a lesson
for link in lesson_link:
  # re-send the HTTP Get requests and parse the HTML content inside the article
  response_link = requests.get(link, headers=headers)
  soup_link = BeautifulSoup(response_link.text, "html.parser")

  # extract the lesson video link
  video_link = soup_link.select('#playerContainer')
  video_lesson_link.append("https://youtu.be/" + video_link[0].attrs.get('data-video-id'))

print(video_lesson_link)

['https://youtu.be/YtE0OsRWeYI', 'https://youtu.be/SUDMIA23_5s', 'https://youtu.be/KShfEMy8UZQ', 'https://youtu.be/mQePz62zkqA', 'https://youtu.be/k1eHm0PNnjo', 'https://youtu.be/9QVsGWsk7TU', 'https://youtu.be/zafiGBrFkRM', 'https://youtu.be/YsYHqfk0X2A', 'https://youtu.be/OfCLTQhW9GQ', 'https://youtu.be/pkzunP1s6cY', 'https://youtu.be/W2bKaw2AJxs', 'https://youtu.be/dItUGF8GdTw', 'https://youtu.be/sbCvQbBi2G8', 'https://youtu.be/kuT7zWZEwl0', 'https://youtu.be/wYQWh8VGL3M', 'https://youtu.be/0Tp0zB904Mc', 'https://youtu.be/hmFQqjMF_f0', 'https://youtu.be/gedoSfZvBgE', 'https://youtu.be/V-S_53HmEUA', 'https://youtu.be/G9-urSR19SI', 'https://youtu.be/YsA_JTeHJ6A', 'https://youtu.be/OyK0oE5rwFY', 'https://youtu.be/KJRzgl0FuMA', 'https://youtu.be/eVm063xmnow']


# Step 5. Save as DataFrame and store it as CSV for further analysis

In [86]:
print(len(lesson_title))
print(len(lesson_link))
print(len(lesson_category))
print(len(lesson_duration))
print(len(lesson_image_link))
print(len(lesson_viewers))
print(len(lesson_description))
print(len(video_lesson_link))

24
24
24
24
24
24
24
24


In [87]:
df = pd.DataFrame(
    {'Title': lesson_title,
     'Category' : lesson_category,
     'Duration': lesson_duration,
     'Viewers': lesson_viewers,
     'Description': lesson_description,
     'Video Link' : video_lesson_link,
     'Image Link' : lesson_image_link,
     'Lesson Link': lesson_link
     }
)

print(df.head())

df.to_csv('tededlesson.csv', index=False)

                                               Title              Category  \
0         Should we get rid of standardized testing?      Education Policy   
1  How schools can nurture every student's genius...  Teaching & Education   
2                    Why perfect grades don't matter  Teaching & Education   
3             Why are there so many types of apples?  Science & Technology   
4      Why we should draw more (and photograph less)  Teaching & Education   

  Duration    Viewers                                        Description  \
0    05:41  1,138,424  Although standardized testing is a particularl...   
1    15:41     68,558  Forget home economics and standardized tests, ...   
2    04:58  4,331,796  Most American students strive for a 4.0 GPA an...   
3    04:28  1,024,319  Have you ever walked into a grocery store and ...   
4    02:53    366,667  A photograph is a much better representation o...   

                     Video Link                                Image Link 