In [8]:
import datetime
import re
import time
from pathlib import Path

import lxml
import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser

In [2]:
# utility functions 

# gets alt tags for images that are otherwise given in table as NaN, for levels and country
def getAlt(x):
    """Copy each image's ``alt`` text into that image's ``string`` attribute.

    Visits every element returned by ``x.find_all('img')``. Images whose
    ``alt`` lookup is falsy (missing or empty) are left untouched.
    Works in place and returns nothing.
    """
    for image in x.find_all('img'):
        alt_text = image.get("alt")
        if not alt_text:
            continue
        image.string = alt_text
    
def getLinkData(x):
    """Run ``getAlt`` on every ``a`` element found under ``x``.

    All work happens in place through ``getAlt``; nothing is returned.
    """
    anchors = x.find_all('a')
    for anchor in anchors:
        getAlt(anchor)
        
# gets the table data from the raw scraping, organizing lists of data for the icons and links
def getTableData(row):
    """Convert one scraped table row into plain Python data.

    Args:
        row: a parsed row element exposing ``get_attribute_list``, ``td`` and
            ``find_all`` (presumably a BeautifulSoup ``Tag`` - confirm against
            the caller).

    Returns:
        * For a row whose class list contains ``'tr_header'``: the text of the
          row's first ``td`` (a string).
        * Otherwise: a list with one dict per ``div`` that contains a link,
          with the keys ``"link"`` (href), ``"text"`` (link text) and
          ``"image"`` (``src`` of the image inside the link, or ``None`` when
          the link has no image).
    """
    if 'tr_header' in row.get_attribute_list('class'):
        return row.td.get_text()

    results = []
    for div in row.find_all('div'):
        a = div.find('a')
        if a is None:
            # A div without a link carries nothing to record; skipping it
            # avoids an AttributeError on None.
            continue
        img = a.find('img')
        results.append({
            "link": a.get('href'),
            "text": a.get_text(),
            # Text-only links are kept, just without an image URL.
            "image": img.get('src') if img is not None else None,
        })

    return results
            
# reorganizes the raw scraped data into a list of dictionaries
# with identical keys (to become columns)
def collapseHeaderRows(row_list, condensor_str):
    """Flatten header/item rows into one list of dicts tagged with their header.

    ``row_list`` mixes two kinds of entries: strings (group headers) and
    sequences of dicts (the items listed under the most recent header).
    Every item dict gets ``item[condensor_str] = <current header>`` and is
    appended to the result in encounter order.

    Args:
        row_list: sequence of header strings and lists of item dicts. The
            sequence itself is not modified.
        condensor_str: key under which the header text is stored on each item.

    Returns:
        A flat list of the item dicts. These are the same dict objects that
        were passed in, annotated in place. Items that appear before any
        header are tagged with ``None``. Empty input yields ``[]``.
    """
    curr_grouping = None
    updated_data_list = []
    for row in row_list:
        if isinstance(row, str):
            # Header entry: it only changes the group applied to later items.
            curr_grouping = row
            continue
        for item in row:
            item[condensor_str] = curr_grouping
            updated_data_list.append(item)

    return updated_data_list
        
# creates a DF from the reorganized data, with custom names for
# the values to combine (from the "header" row above the list of items in the table) and
# the "text" of those items (may be "Factions", "Unit Types", etc.)
def createDFfromRowData(row_data, condensor_str, text_name):
    """Build a DataFrame from scraped row data.

    The first entry of ``row_data`` is discarded; the remaining entries are
    flattened with ``collapseHeaderRows``.

    Args:
        row_data: list whose first entry is skipped and whose remaining
            entries are header strings / lists of item dicts with the keys
            ``"text"``, ``"link"`` and ``"image"``. The list is left
            unmodified, so re-running the calling cell gives the same result.
        condensor_str: name of the column that receives the header text.
        text_name: name of the column that receives each item's ``"text"``
            (e.g. "Factions", "Unit Types").

    Returns:
        ``pd.DataFrame`` with the columns ``condensor_str``, ``text_name``,
        ``"link"`` and ``"image"``. It has no rows when ``row_data`` is empty
        or only contains the discarded first entry.
    """
    # Slice instead of pop(0): same rows are processed, but the caller's list
    # stays intact and an empty list no longer raises IndexError.
    body_rows = row_data[1:]
    row_data_condensed = collapseHeaderRows(body_rows, condensor_str) if body_rows else []

    return pd.DataFrame(data={
        condensor_str: [a[condensor_str] for a in row_data_condensed],
        text_name: [a["text"] for a in row_data_condensed],
        "link": [a["link"] for a in row_data_condensed],
        "image": [a["image"] for a in row_data_condensed],
    })

In [70]:

# scrape the Udacity course catalog (https://www.udacity.com/courses/all) into a list of per-course tuples
def _optional_text(card, class_pattern):
    """Return the text of the first element in ``card`` whose class matches ``class_pattern``, or None."""
    tag = card.find(class_=re.compile(class_pattern))
    return None if tag is None else tag.get_text()


def _parse_course_card(course, base_url):
    """Extract the 11-field course tuple from one catalog ``li`` element."""
    # Mandatory fields: a missing element raises AttributeError here, which
    # the caller reports before returning the courses collected so far.
    course_type = course.find(class_=re.compile("^card_flag.*")).get_text()
    course_title = course.find(class_=re.compile("^card_title.*")).get_text()
    course_desc = course.find(class_=re.compile("^card_summary.*")).get_text()
    course_level = course.find(class_=re.compile("^card_level.*")).get_text()
    course_duration = course.find(class_=re.compile("^card_duration.*")).get_text()

    # First detail element is read as "skills covered", second as
    # "prerequisites". Both default to None so a card with fewer detail
    # elements never inherits values from the previously parsed card.
    details = course.find_all(class_=re.compile("^card_detailContent.*"))
    course_skills_covered = details[0].get_text() if len(details) > 0 else None
    course_prerequisites = details[1].get_text() if len(details) > 1 else None

    # Rating and review count are optional. The review count is only read
    # after the rating parsed successfully.
    rating = None
    review_count = None
    try:
        rating = float(course.find(class_=re.compile("^rating-stars_visuallyHidden.*")).get_text().split(" ")[0])
        review_count = int(course.find(class_=re.compile("^card_reviewCount.*")).get_text().split(" ")[0])
    except (AttributeError, ValueError):
        # AttributeError: element missing; ValueError: leading token is not numeric.
        pass

    affiliates = _optional_text(course, "^card_affiliatesContent.*")

    link = base_url + course.find("a")["href"]

    return (course_title, course_type, course_desc, course_level, course_duration,
            rating, review_count, course_skills_covered, course_prerequisites,
            affiliates, link)


def getCoursesData():
    """Scrape the Udacity course catalog page and return one tuple per course.

    Opens ``https://www.udacity.com/courses/all`` in a Chrome browser driven
    by splinter, waits a fixed 6 seconds for the page to render, and parses
    every ``li`` inside the first element whose class starts with ``catalog``.

    Returns:
        list of tuples ``(title, type, description, level, duration, rating,
        review_count, skills_covered, prerequisites, affiliates, url)``.
        ``rating``, ``review_count``, ``skills_covered``, ``prerequisites``
        and ``affiliates`` are ``None`` when the card does not provide them.
        If parsing fails part-way, the error is printed and the courses
        collected up to that point are returned (best effort).

    Raises:
        Whatever ``browser.visit`` raises if the page cannot be opened; the
        browser is still closed in that case.
    """
    base_url = "https://www.udacity.com"
    url = base_url + "/courses/all"

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    courses = []

    # try/finally guarantees the browser is shut down even when loading or
    # parsing fails or the run is interrupted.
    try:
        browser.visit(url)

        #add a delay so page fully loads
        time.sleep(6)

        try:
            current_course_catalog = browser.html

            news_soup = bs(current_course_catalog, "html.parser")

            available_courses = news_soup.find(class_=re.compile("^catalog.*"))
            if available_courses is None:
                raise LookupError("no element with a class starting with 'catalog' was found")
            # NOTE: len() of the container counts its direct children, not the course cards.
            print(len(available_courses))

            for course in available_courses.find_all("li"):
                courses.append(_parse_course_card(course, base_url))

        except Exception as exc:
            # Best effort: report the cause and keep what was scraped so far.
            print("Something went wrong!", repr(exc))
    finally:
        browser.quit()

    return courses

In [71]:
# Scrape the Udacity course catalog: getCoursesData() drives a Chrome browser
# and returns a list with one tuple per course.
courses = getCoursesData()
courses

3


[('Data Engineering with AWS',
  'nanodegree',
  'Learn to design data models, build data warehouses and data lakes, automate data pipelines, and manage massive datasets.',
  'intermediate',
  '4 Months',
  4.6,
  1802,
  'AWS Glue,  Amazon S3,  AWS Data Warehouse,  Redshift,  Apache Airflow,  Data Pipeline DAGs,  Data Extraction,  Data Pipeline Creation,  ETL,  OLAP Cubes,  Data Warehouse Architecture,  Cloud Computing Fluency,  Data Pipeline Maintenance',
  'Intermediate Python, intermediate SQL & command line',
  None,
  'https://www.udacity.com/course/data-engineer-nanodegree--nd027'),
 ('Product Manager',
  'nanodegree',
  'Envision and execute the development of industry-defining products, and learn how to successfully bring them to market.',
  'beginner',
  '4 Months',
  4.7,
  864,
  'Product Strategy, Product Design, Product Development, Design Sprint, Product Launch',
  'No Experience Required',
  None,
  'https://www.udacity.com/course/product-manager-nanodegree--nd036'),
 (

In [76]:
# Column labels, listed in the same order as the fields of each scraped course tuple.
cols = [
    "Title",
    "Type",
    "Description",
    "Level",
    "Duration",
    "Rating",
    "Review Count",
    "Skills Covered",
    "Prerequisites",
    "Affiliates",
    "URL",
]
courses_df = pd.DataFrame(data=courses, columns=cols)
courses_df

Unnamed: 0,Title,Type,Description,Level,Duration,Rating,Review Count,Skills Covered,Prerequisites,Affiliates,URL
0,Data Engineering with AWS,nanodegree,"Learn to design data models, build data wareho...",intermediate,4 Months,4.6,1802.0,"AWS Glue, Amazon S3, AWS Data Warehouse, Re...","Intermediate Python, intermediate SQL & comman...",,https://www.udacity.com/course/data-engineer-n...
1,Product Manager,nanodegree,Envision and execute the development of indust...,beginner,4 Months,4.7,864.0,"Product Strategy, Product Design, Product Deve...",No Experience Required,,https://www.udacity.com/course/product-manager...
2,C++,nanodegree,Get hands-on experience building five real-wor...,intermediate,4 Months,4.5,1126.0,"Data Structures & Algorithms, Memory Managemen...",Intermediate Programming,,https://www.udacity.com/course/c-plus-plus-nan...
3,Business Analytics,nanodegree,Gain foundational data skills like analyzing d...,beginner,3 Months,4.8,2649.0,"Excel & Spreadsheets, SQL, Data Visualization,...",,Mode,https://www.udacity.com/course/business-analyt...
4,Data Scientist,nanodegree,"Build effective machine learning models, run d...",advanced,4 Months,4.7,1212.0,"Machine Learning, Deep Learning, Software Engi...","Python, SQL & Statistics","Bertelsmann, Figure Eight, IBM Watson, Insight...",https://www.udacity.com/course/data-scientist-...
...,...,...,...,...,...,...,...,...,...,...,...
262,Front-End Interview Prep,free,Answer front-end technical and behavioral inte...,intermediate,1 Week,,,"Interview questions, Common FE Questions, Whit...",,,https://www.udacity.com/course/front-end-inter...
263,Full-Stack Interview Prep,free,Answer common full stack and web security inte...,intermediate,1 Week,,,"Interview practice, Common FS Questions, White...",,,https://www.udacity.com/course/full-stack-inte...
264,Data Structures & Algorithms in Swift,free,Review and practice the skills technical inter...,intermediate,4 Weeks,,,"Interview practice, Swift, Data structures, Ca...",,,https://www.udacity.com/course/data-structures...
265,iOS Interview Prep,free,Answer iOS and mobile development interview qu...,intermediate,1 Week,,,"Interview practice, Common iOS Questions, Whit...",,,https://www.udacity.com/course/ios-interview-p...


In [79]:
# Create the target folder first: to_csv does not create missing directories,
# so a fresh checkout without ./output would otherwise fail here.
output_path = Path("./output/all_courses.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
courses_df.to_csv(output_path, index=False)
courses_df

Unnamed: 0,Title,Type,Description,Level,Duration,Rating,Review Count,Skills Covered,Prerequisites,Affiliates,URL
0,Data Engineering with AWS,nanodegree,"Learn to design data models, build data wareho...",intermediate,4 Months,4.6,1802.0,"AWS Glue, Amazon S3, AWS Data Warehouse, Re...","Intermediate Python, intermediate SQL & comman...",,https://www.udacity.com/course/data-engineer-n...
1,Product Manager,nanodegree,Envision and execute the development of indust...,beginner,4 Months,4.7,864.0,"Product Strategy, Product Design, Product Deve...",No Experience Required,,https://www.udacity.com/course/product-manager...
2,C++,nanodegree,Get hands-on experience building five real-wor...,intermediate,4 Months,4.5,1126.0,"Data Structures & Algorithms, Memory Managemen...",Intermediate Programming,,https://www.udacity.com/course/c-plus-plus-nan...
3,Business Analytics,nanodegree,Gain foundational data skills like analyzing d...,beginner,3 Months,4.8,2649.0,"Excel & Spreadsheets, SQL, Data Visualization,...",,Mode,https://www.udacity.com/course/business-analyt...
4,Data Scientist,nanodegree,"Build effective machine learning models, run d...",advanced,4 Months,4.7,1212.0,"Machine Learning, Deep Learning, Software Engi...","Python, SQL & Statistics","Bertelsmann, Figure Eight, IBM Watson, Insight...",https://www.udacity.com/course/data-scientist-...
...,...,...,...,...,...,...,...,...,...,...,...
262,Front-End Interview Prep,free,Answer front-end technical and behavioral inte...,intermediate,1 Week,,,"Interview questions, Common FE Questions, Whit...",,,https://www.udacity.com/course/front-end-inter...
263,Full-Stack Interview Prep,free,Answer common full stack and web security inte...,intermediate,1 Week,,,"Interview practice, Common FS Questions, White...",,,https://www.udacity.com/course/full-stack-inte...
264,Data Structures & Algorithms in Swift,free,Review and practice the skills technical inter...,intermediate,4 Weeks,,,"Interview practice, Swift, Data structures, Ca...",,,https://www.udacity.com/course/data-structures...
265,iOS Interview Prep,free,Answer iOS and mobile development interview qu...,intermediate,1 Week,,,"Interview practice, Common iOS Questions, Whit...",,,https://www.udacity.com/course/ios-interview-p...
