In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [424]:
baseurl = 'https://www.coursera.org'

In [425]:
header = {
    'User-Agents': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36'
}

# Scrapping Links

In [426]:
r = requests.get('https://www.coursera.org/certificates/data-science')
soup = BeautifulSoup(r.content, 'lxml')

In [427]:
course_list = soup.find_all('div',class_= 'cds-63 css-qsaw8 cds-65 cds-grid-item cds-134')

In [428]:
course_link = []

In [429]:
for i in course_list:
    for link in i.find_all('a',href = True):
        course_link.append(baseurl + link['href'])

In [430]:
course_link

['https://www.coursera.org/professional-certificates/google-data-analytics',
 'https://www.coursera.org/professional-certificates/ibm-data-science',
 'https://www.coursera.org/professional-certificates/ibm-data-analyst',
 'https://www.coursera.org/professional-certificates/ibm-data-engineer',
 'https://www.coursera.org/professional-certificates/meta-database-engineer',
 'https://www.coursera.org/professional-certificates/ibm-data-analyst-r-excel',
 'https://www.coursera.org/professional-certificates/data-warehouse-engineering',
 'https://www.coursera.org/professional-certificates/tensorflow-in-practice',
 'https://www.coursera.org/professional-certificates/azure-data-scientist',
 'https://www.coursera.org/professional-certificates/microsoft-azure-dp-203-data-engineering',
 'https://www.coursera.org/professional-certificates/ibm-machine-learning',
 'https://www.coursera.org/professional-certificates/sas-advanced-programmer',
 'https://www.coursera.org/professional-certificates/certified

# **Scrapping Course Name , Ratings , Financial Aid , No. of Students Enrolled**

In [415]:
testlink = 'https://www.coursera.org/professional-certificates/google-data-analytics'

In [416]:
r = requests.get(testlink , headers = header)
soup = BeautifulSoup(r.content, 'lxml')

# Name

In [421]:
name = soup.find('h1',class_ = 'cds-33 css-1shw822 cds-35').text.strip()
name

'IBM Data Engineering Professional Certificate'

# UpperGrid - Ratings ,  Duration , Langugage , Level 

In [360]:
grid = soup.find('div',class_ = 'cds-71 css-0 cds-72')
upper_grid = grid.find_all('p',class_ = 'cds-33 css-bku0rr cds-35')
upper_grid = np.array(upper_grid).reshape(-1)
upper_grid

array(['4.8/5', '6 Months', 'English', 'Beginner Level',
       '100% Self-Paced'], dtype='<U15')

# Ratings

In [361]:
ratings = upper_grid[0]
ratings

'4.8/5'

# Duration

In [362]:
duration = upper_grid[1]
duration

'6 Months'

# Language

In [363]:
lang = upper_grid[2]
lang

'English'

# Level

In [364]:
lvl = upper_grid[3]
lvl

'Beginner Level'

# LowerGrid - Total Ratings , Hours Per Week , Subtitles , Experience Required

In [365]:
l_grid = soup.find('div',class_ = 'cds-71 css-0 cds-72')
lower_grid = l_grid.find_all('p',class_ = 'cds-33 css-14d8ngk cds-35')
lower_grid = np.array(lower_grid).reshape(-1)
lower_grid

array([<span>77,398 ratings <br/>1,114,824 already enrolled</span>,
       'Under 10 hours of study a week', 'Subtitles: English',
       'No prior experience required.', 'Learn on your own time'],
      dtype=object)

# Total Ratings

In [366]:
tot_rat = np.array(lower_grid[0])[0].split(' ')[0]
# tot_rat = tot_rat.split(' ')[0]
tot_rat

'77,398'

# Enrolled

In [367]:
enroll = np.array(lower_grid[0])[2]
enroll = enroll.split(' ')[0]
enroll

'1,114,824'

# Hours Per Week

In [368]:
hpw = lower_grid[1].split(' ')[1]
hpw

'10'

# Subtitles

In [371]:
sub = lower_grid[2].split(' ')[1:]
sub

['English']

# Prior Experience

In [343]:
p_ex = lower_grid[3].split(' ')[0]
p_ex

'No'

# DataFrame

In [431]:
lst = []

In [432]:
for i,link in enumerate(course_link):
    if i == 4:
        break
    else:
        r = requests.get(link , headers = header)
        soup = BeautifulSoup(r.content, 'lxml')
        try:
            name = soup.find('h1',class_ = 'cds-33 css-1shw822 cds-35').text.strip()
            grid = soup.find('div',class_ = 'cds-71 css-0 cds-72')
            upper_grid = grid.find_all('p',class_ = 'cds-33 css-bku0rr cds-35')
            upper_grid = np.array(upper_grid).reshape(-1)    
            l_grid = soup.find('div',class_ = 'cds-71 css-0 cds-72')
            lower_grid = l_grid.find_all('p',class_ = 'cds-33 css-14d8ngk cds-35')
            lower_grid = np.array(lower_grid).reshape(-1)
            ratings = upper_grid[0]
            duration = upper_grid[1]
            lang = upper_grid[2]       
            lvl = upper_grid[3]
            tot_rat = np.array(lower_grid[0])[0].split(' ')[0]
            enroll = np.array(lower_grid[0])[2].split(' ')[0]
            hpw = lower_grid[1].split(' ')[1]
            sub = lower_grid[2].split(' ')[1:]
            p_ex = lower_grid[3].split(' ')[0]
        except:
            name = 'NA'
            ratings = 'NA'
            duration = 'NA'
            lang = 'NA'
            lvl = 'NA'
            tot_rat = 'NA'
            enroll = 'NA'
            hpw = 'NA'
            sub = 'NA'
            p_ex = 'NA'
        course = {
                  'Name':name,
                  'Ratings': ratings,
                  'Duration': duration,
                  'Language': lang, 
                  'Level': lvl,
                  'Total Ratings': tot_rat,
                  'Enrolled': enroll,
                  'Hours Per Week': hpw, 
                  'Subtitles': sub,
                  'Prerequisite': p_ex
        }
        print(course)

{'Name': 'NA', 'Ratings': 'NA', 'Duration': 'NA', 'Language': 'NA', 'Level': 'NA', 'Total Ratings': 'NA', 'Enrolled': 'NA', 'Hours Per Week': 'NA', 'Subtitles': 'NA', 'Prerequisite': 'NA'}
{'Name': 'IBM Data Science Professional Certificate', 'Ratings': '4.6/5', 'Duration': '11 Months', 'Language': 'English', 'Level': 'Beginner Level', 'Total Ratings': '57,528', 'Enrolled': '127,465', 'Hours Per Week': '4', 'Subtitles': ['English,', 'Arabic,', 'French,', 'Portuguese', '(European),', 'Italian,', 'Vietnamese,', 'German,', 'Russian,', 'Turkish,', 'Spanish,', 'Persian,', 'Korean'], 'Prerequisite': 'No'}
{'Name': 'NA', 'Ratings': 'NA', 'Duration': 'NA', 'Language': 'NA', 'Level': 'NA', 'Total Ratings': 'NA', 'Enrolled': 'NA', 'Hours Per Week': 'NA', 'Subtitles': 'NA', 'Prerequisite': 'NA'}
{'Name': 'IBM Data Engineering Professional Certificate', 'Ratings': '4.6/5', 'Duration': '15 Months', 'Language': 'English', 'Level': 'Beginner Level', 'Total Ratings': '1,978', 'Enrolled': '25,900', 'Ho

In [433]:
for i,link in enumerate(course_link):
    if i == 4:
        break
    else:
        r = requests.get(link , headers = header)
        soup = BeautifulSoup(r.content, 'lxml')
        name = soup.find('h1',class_ = 'cds-33 css-1shw822 cds-35').text.strip()
        try:
            grid = soup.find('div',class_ = 'cds-71 css-0 cds-72')
            upper_grid = grid.find_all('p',class_ = 'cds-33 css-bku0rr cds-35')
            upper_grid = np.array(upper_grid).reshape(-1)    
            l_grid = soup.find('div',class_ = 'cds-71 css-0 cds-72')
            lower_grid = l_grid.find_all('p',class_ = 'cds-33 css-14d8ngk cds-35')
            lower_grid = np.array(lower_grid).reshape(-1)
            ratings = upper_grid[0]
            duration = upper_grid[1]
            lang = upper_grid[2]       
            lvl = upper_grid[3]
            tot_rat = np.array(lower_grid[0])[0].split(' ')[0]
            enroll = np.array(lower_grid[0])[2].split(' ')[0]
            hpw = lower_grid[1].split(' ')[1]
            sub = lower_grid[2].split(' ')[1:]
            p_ex = lower_grid[3].split(' ')[0]
        except:
#             name = 'NA'
            ratings = 'NA'
            duration = 'NA'
            lang = 'NA'
            lvl = 'NA'
            tot_rat = 'NA'
            enroll = 'NA'
            hpw = 'NA'
            sub = 'NA'
            p_ex = 'NA'
        course = {
                  'Name':name,
                  'Ratings': ratings,
                  'Duration': duration,
                  'Language': lang, 
                  'Level': lvl,
                  'Total Ratings': tot_rat,
                  'Enrolled': enroll,
                  'Hours Per Week': hpw, 
                  'Subtitles': sub,
                  'Prerequisite': p_ex
        }
        print(course)

{'Name': 'Google Data Analytics Professional Certificate', 'Ratings': '4.8/5', 'Duration': '6 Months', 'Language': 'English', 'Level': 'Beginner Level', 'Total Ratings': '77,398', 'Enrolled': '1,114,824', 'Hours Per Week': '10', 'Subtitles': ['English'], 'Prerequisite': 'No'}
{'Name': 'IBM Data Science Professional Certificate', 'Ratings': '4.6/5', 'Duration': '11 Months', 'Language': 'English', 'Level': 'Beginner Level', 'Total Ratings': '57,528', 'Enrolled': '127,465', 'Hours Per Week': '4', 'Subtitles': ['English,', 'Arabic,', 'French,', 'Portuguese', '(European),', 'Italian,', 'Vietnamese,', 'German,', 'Russian,', 'Turkish,', 'Spanish,', 'Persian,', 'Korean'], 'Prerequisite': 'No'}
{'Name': 'IBM Data Analyst Professional Certificate', 'Ratings': '4.6/5', 'Duration': '11 Months', 'Language': 'English', 'Level': 'Beginner Level', 'Total Ratings': '9,551', 'Enrolled': '85,586', 'Hours Per Week': '3', 'Subtitles': ['English,', 'Arabic,', 'French,', 'Portuguese', '(European),', 'Italian

AttributeError: 'NoneType' object has no attribute 'text'

In [345]:
for link in course_link:
    r = requests.get(link , headers = header)
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        grid = soup.find('div',class_ = 'cds-71 css-0 cds-72')
        upper_grid = grid.find_all('p',class_ = 'cds-33 css-bku0rr cds-35')
        upper_grid = np.array(upper_grid).reshape(-1)    
        l_grid = soup.find('div',class_ = 'cds-71 css-0 cds-72')
        lower_grid = l_grid.find_all('p',class_ = 'cds-33 css-14d8ngk cds-35')
        lower_grid = np.array(lower_grid).reshape(-1)
    except:
        grid = 'NA'
        l_grid = 'NA'
    try:
        name = soup.find('h1',class_ = 'cds-33 css-1shw822 cds-35').text.strip()
    except:
        name = 'NA'
    try:
        ratings = upper_grid[0]
    except:
        ratings = 'NA'
    try:
        duration = upper_grid[1]
    except:
        duration = 'NA'
    try: 
        lang = upper_grid[2]       
    except:
        lang = 'NA'
    try:
        lvl = upper_grid[3]
    except:
        lvl = 'NA'
    try:
        tot_rat = np.array(lower_grid[0])[0].split(' ')[0]
    except: 
        tot_rat = 'NA'
    try: 
        enroll = np.array(lower_grid[0])[2].split(' ')[0]
    except:
        enroll = 'NA'
    try:
        hpw = lower_grid[1].split(' ')[1]
    except:
        hpw = 'NA'
    try:
        sub = lower_grid[2].split(' ')[1]
    except:
        sub = 'NA'
    try:  
        p_ex = lower_grid[3].split(' ')[0]
    except:
        p_ex = 'NA'
# #     try:
# #         duration = upper_grid[1]
# #     except: 
# #         duration = 'NA'
# #     try
#     lang = upper_grid[2]
#     lvl = upper_grid[3]
#     tot_rat = np.array(lower_grid[0])[0].split(' ')[0]
#     enroll = np.array(lower_grid[0])[2].split(' ')[0]
#     hpw = lower_grid[1].split(' ')[1]
#     sub = lower_grid[2].split(' ')[1]
#     p_ex = lower_grid[3].split(' ')[0]

    course = {
              'Name':name,
              'Ratings': ratings,
              'Duration': duration,
              'Language': lang, 
              'Level': lvl,
              'Total Ratings': tot_rat,
              'Enrolled': enroll,
              'Hours Per Week': hpw, 
              'Subtitles': sub,
              'Prerequisite': p_ex
    }
    print(course)

{'Name': 'NA', 'Ratings': '4.8/5', 'Duration': '6 Months', 'Language': 'English', 'Level': 'Beginner Level', 'Total Ratings': '77,398', 'Enrolled': '1,114,824', 'Hours Per Week': '10', 'Subtitles': 'English', 'Prerequisite': 'No'}
{'Name': 'IBM Data Science Professional Certificate', 'Ratings': '4.6/5', 'Duration': '11 Months', 'Language': 'English', 'Level': 'Beginner Level', 'Total Ratings': '57,528', 'Enrolled': '127,465', 'Hours Per Week': '4', 'Subtitles': 'English,', 'Prerequisite': 'No'}
{'Name': 'NA', 'Ratings': '4.6/5', 'Duration': '11 Months', 'Language': 'English', 'Level': 'Beginner Level', 'Total Ratings': '57,528', 'Enrolled': '127,465', 'Hours Per Week': '4', 'Subtitles': 'English,', 'Prerequisite': 'No'}
{'Name': 'NA', 'Ratings': '4.6/5', 'Duration': '11 Months', 'Language': 'English', 'Level': 'Beginner Level', 'Total Ratings': '57,528', 'Enrolled': '127,465', 'Hours Per Week': '4', 'Subtitles': 'English,', 'Prerequisite': 'No'}
{'Name': 'NA', 'Ratings': '4.6/5', 'Dura

In [348]:
for a in course_link:
    print(a)

https://www.coursera.org/professional-certificates/google-data-analytics
https://www.coursera.org/professional-certificates/ibm-data-science
https://www.coursera.org/professional-certificates/ibm-data-analyst
https://www.coursera.org/professional-certificates/ibm-data-engineer
https://www.coursera.org/professional-certificates/meta-database-engineer
https://www.coursera.org/professional-certificates/ibm-data-analyst-r-excel
https://www.coursera.org/professional-certificates/data-warehouse-engineering
https://www.coursera.org/professional-certificates/tensorflow-in-practice
https://www.coursera.org/professional-certificates/azure-data-scientist
https://www.coursera.org/professional-certificates/microsoft-azure-dp-203-data-engineering
https://www.coursera.org/professional-certificates/ibm-machine-learning
https://www.coursera.org/professional-certificates/sas-advanced-programmer
https://www.coursera.org/professional-certificates/certified-data-science-practitioner
https://www.coursera.or