# Task Description : 

Building a “Courses Recommendation System” based on udacity and Udemy courses
- The task will be scraped and stored
- then generalize this data in a one collection instead of two and clean it.
- pass this data to any default Machine learning model and get the predictions ( a pipeline for this task.)

#  Import Libraries

In [1]:
# !pip install scrapy
# !pip install requests
# !pip install beautifulsoup4
# !pip install requests-html
# !pip install phantomjs
# !pip install selenium
# !pip install webdriver-manager
# !pip install pyudemy
# !pip install python-decouple
# !pip install xmltojson
# !pip install utils

In [2]:
# Supressing the warning messages
import warnings
warnings.filterwarnings('ignore')

In [88]:
import os
import re
import io
import time
import selenium
import requests
import pandas as pd
import numpy as np
from pandas import plotting# 2: Reading the Dataset
from pyudemy import Udemy
from decouple import config
from PIL import Image
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import ElementClickInterceptedException
from bs4 import BeautifulSoup
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from lxml import html
from urllib.error import HTTPError
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## Getting Udaicty courses

In [4]:
def Fun_getInstructor (_url):
    try:
        response = requests.get(_url)
        content = response.content
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}') 
    else:
#         print('Success!')
        course_soup = BeautifulSoup(content, 'html.parser')
        _instructor_list= []
        for _instructor in course_soup.findAll('h5',class_="degree-instructors_instructorName__2F9E_") :
            _instructor_list.append (_instructor.get_text())
        return _instructor_list

In [5]:
def GetUdacity (url):
    try:

        #Install ChromeDriverManager Driver
        driver = webdriver.Chrome(ChromeDriverManager().install())

        #use ChromeDriverManager to fetch soup data from browser
        driver.get(url)
        source = driver.page_source
        soup = bs(source,"html.parser")
        driver.close()

        #Get parent elemnt for each course
        courses_soup = soup.findAll(class_="card_container__25DrK")
        list_dics = []

        #Iterate parent elemnt 
        for _course in range (len(courses_soup)) :
            _url = "https://www.udacity.com" + courses_soup[_course]['href']
            _id = _url.split('--')[1].split("'")[0]
            _name = courses_soup[_course]['aria-label']
            _level = courses_soup[_course].find(class_="card_level__2HNxe").get_text()
            _category = courses_soup[_course].find(class_="card_flag__2XEZl").get_text()
            # _price =
            _about = courses_soup[_course].find(class_="card_summary__1HlQ7").get_text()
            _duration = courses_soup[_course].find(class_="card_duration__1hWII").get_text()

            #check if _skills_covered & _prerequisites are nullable values
            try:
                _skills_covered =courses_soup[_course].findAll(class_="card_details__3VdjA")[0].find(class_ ="card_detailContent__2eJIl").get_text()
            except Exception as err:
                _skills_covered =''
            try:
                _prerequisites = courses_soup[_course].findAll(class_="card_details__3VdjA")[0].findAll(class_ ="card_detailContent__2eJIl")[1].get_text()   
            except Exception as err:
                _prerequisites=''
            _course_info = {
                "url" : _url ,
                "id" : _id ,
                "name" : _name ,
                "level" : _level ,
                "category" : _category ,
                "about" : _about ,
                "duration" : _duration ,
                "about" : _about ,
                "skills" : _skills_covered.split(',') ,
                "prerequisites" : _prerequisites,
                "instructors" : Fun_getInstructor(_url),
                "source": "Udacity"
               }
            #append each course item to the List of Dictionaries
            list_dics.append(_course_info)
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    else:
#         print('Success!')
        return list_dics

In [8]:
#get Data from Udacity
url_Udacity = "https://www.udacity.com/courses/all"
UdacityList = GetUdacity(url_Udacity)

In [7]:
len (UdacityList)

265

In [9]:
UdacityList[0]

{'url': 'https://www.udacity.com/course/data-engineer-nanodegree--nd027',
 'id': 'nd027',
 'name': 'Data Engineer',
 'level': 'intermediate',
 'category': 'nanodegree',
 'about': 'Data Engineering is the foundation for the new world of Big Data. Enroll now to build production-ready data infrastructure, an essential skill for advancing your data career.',
 'duration': '5 Months',
 'skills': ['Data Modeling',
  ' Data Pipelines',
  ' Data Lakes',
  ' Spark',
  ' Airflow'],
 'prerequisites': 'Intermediate Python & SQL',
 'instructors': ['Amanda Moran',
  'Ben Goldberg',
  'Sameh El-Ansary',
  'Olli Iivonen',
  'David Drummond',
  'Judit Lantos',
  'Juno Lee'],
 'source': 'Udacity'}

In [10]:
_UdacityList = UdacityList

There are almost 265 unique links that belong to the different courses on Udacity. 

Each course URL has been visited in the above code to extract the information needed. The relevent HTML tags and class names were extracted by visiting the URL, right clicking on the element of interest and selecting Inspect to view the related HTML. This has be used with the BeautifulSoup find and find_all functions to get the information.

## Getting Udemy courses

there are 2 ways to get list of courses on Udemy : 
-  Either scrapping HTML response (Have not been used )
-  OR using **Udemy REST API** and  <a href="https://udemy.app.box.com/s/uwa9onfjh6y3kousfqss72ufdit38iz8"> it's documentaion</a>. **which is required within  <a href="https://drive.google.com/file/d/1quAO2A7h8F1DG_3pZWgEGTo98mr1pNaw/view?fbclid=IwAR2ysKjiD7ukDegXFcJ3VjhTrAjzN_8Y3gmPxCdAURlUeJehoqQpjSeLcuM"> the task document </a>**
we already created an API client. API client consists of a bearer token, which is connected to a user account on Udemy.


In [11]:
client_id = 'daiolx0INQiNWo6krQ3SJboygiH5ExjVlOZZOpsR'
client_secret = '4f7hZeHGcW34tx0D5x836JvLXYJRx2shWotK1uA4VZXIFoOpkN3EyD94b4ZL4ES80Ezq3tApZzuGaNYfgqQZX0O1czzmD5H3uaINJ16BDZTPAMTnKcZAveyy9fzrsbE8'

### first way : scrapping HTML response

In [13]:
import sys, json
import base64
import requests
import xmltojson
from xml.sax.saxutils import escape

def main():
    # API Authentication Parameters.
    portal_name = 'company'
    portal_id = '8888'
    # base64 HTTP Header
    auth_token = 'Basic ' + base64.b64encode("{}:{}".format(client_id,client_secret).encode()).decode('ascii')
    # Https GET Call
    header = {'Authorization': auth_token , "Accept": "application/json, text/plain, */*" ,  "Content-Type": "application/json;charset=utf-8"}        
    url = 'https://' + portal_name + '.udemy.com/api-2.0/organizations/' + portal_id +'/courses/list/'
    querystring = {"fields[courses]": "@all", "page": "1", "page_size": "1"}
    r = requests.request("GET", url, headers=header, params=querystring )
    
    print("Request status code" , r.status_code)
#     print(r.text)
    
    
    if 'json' in r.headers.get('Content-Type'):
        js = r.json()
        response_data = json.loads(r.text)
        result = response_data.get('result')
        print(r)
    else:
        print('Response content is not in JSON format.')
        js = 'spam'
        
if __name__== "__main__":
    main()

Request status code 200
Response content is not in JSON format.


### Second way : Udemy API

**Discovering API and methods**

In [14]:
udemy = Udemy(client_id,client_secret )

In [15]:
help(udemy.courses)

Help on method courses in module pyudemy.udemy:

courses(**kwargs) method of pyudemy.udemy.Udemy instance
    Returns list of courses.
    
    To see the list of accepted parameters go to:
    https://www.udemy.com/developers/methods/get-courses-list/



In [16]:
#Example of course_detail within Udemy API
udemy.course_detail(14284)

{'_class': 'course',
 'id': 14284,
 'title': 'Powerful Business Writing #2 – How to Write in Fewer Words',
 'url': '/course/write-less-say-more/',
 'is_paid': True,
 'price': '$49.99',
 'price_detail': {'amount': 49.99,
  'currency': 'USD',
  'price_string': '$49.99',
  'currency_symbol': '$'},
 'price_serve_tracking_id': 'cHkrpNkeRSCU0bdR8Ll5-A',
 'visible_instructors': [{'_class': 'user',
   'title': 'Caroline McDevitt',
   'name': 'Caroline',
   'display_name': 'Caroline McDevitt',
   'job_title': 'Business writer and editor',
   'image_50x50': 'https://img-c.udemycdn.com/user/50x50/145688_fe41_2.jpg',
   'image_100x100': 'https://img-c.udemycdn.com/user/100x100/145688_fe41_2.jpg',
   'initials': 'CM',
   'url': '/user/carolinejenkins2/'}],
 'image_125_H': 'https://img-c.udemycdn.com/course/125_H/14284_6833_16.jpg',
 'image_240x135': 'https://img-c.udemycdn.com/course/240x135/14284_6833_16.jpg',
 'is_practice_test_course': False,
 'image_480x270': 'https://img-c.udemycdn.com/course/

In [17]:
UdemyList = udemy.courses()
print (type (UdemyList))

<class 'dict'>


In [18]:
# UdemyList['results'] contains all info about courses , it's list of dictionaries
print ( len (UdemyList['results']))
#extrcat list of dict from 'results' into 'UdemyList'
UdemyList = UdemyList.get('results')
print ( len (UdemyList) )

12
12


In [19]:
def fun_getCourseSyllabus(id):
    details = udemy.public_curriculum(id)
    syllabus = [] 
    for key in details.get('results'):
            syllabus.append(key['title'])
    return syllabus

In [20]:
#create dic of list
UdemyListofcourses =[]
for _course in range (len(UdemyList)-1) :
        _url = "https://www.udemy.com"+ UdemyList[_course]['url']
        _id = UdemyList[_course]['id']
        try:
            _name = UdemyList[_course]['title']
        except:
            _name = UdemyList[_course]['title'] ='No title'
        _category = UdemyList[_course]['_class']
        _price =UdemyList[_course]['price']
        _instructors = []
        _instructors.append( UdemyList[_course]['visible_instructors'][0].get('display_name'))
        _about = UdemyList[_course]['headline']
        _course_info = {
            "url" : _url ,
            "id" : _id ,
            "name" : _name ,
            "category" : _category ,
            "about" : _about ,
            "price" : _price,
            "description" : _about ,
            "instructors" : _instructors,
            "Syllabus" : fun_getCourseSyllabus(_id),
            "source": "Udemy"
           }
        #append each course item to the List of Dictionaries
        UdemyListofcourses.append(_course_info)
print (len(UdemyListofcourses), 'Courses')

11 Courses


## Generalizling the data

concatenate the two collections in the most suitable form

In [50]:
print ('Udacity courses= '  , len (UdacityList) )
print ('Udemy courses= ' ,len (UdemyListofcourses))

Udacity courses=  320
Udemy courses=  11


In [51]:
#add UdacityList to all_courses
allCourses = []
allCourses = UdacityList

In [52]:
#add UdemyList to all_courses
for _course in UdemyListofcourses :
    allCourses.append(_course)
    
print(len(allCourses) , 'courses')

331 courses


## Exploratory Data Analysis
In this step, one should load the data and analyze it. 
Let's load the data using `pandas` and have a look at the generated `DataFrame`.
- first step , will be loading all_courses into a pandas dataframe 
- econd step will be cleanning the data if it have some tages like the ones in "description", in period “the final collection
should have the same values as above and it should be clean or preprocessed ”

In [54]:
df = pd.DataFrame(allCourses)

In [26]:
#sample of Udacity courses
df.head(3)

Unnamed: 0,url,id,name,level,category,about,duration,skills,prerequisites,instructors,source,price,description,Syllabus
0,https://www.udacity.com/course/data-engineer-n...,nd027,Data Engineer,intermediate,nanodegree,Data Engineering is the foundation for the new...,5 Months,"[Data Modeling, Data Pipelines, Data Lakes, ...",Intermediate Python & SQL,"[Amanda Moran, Ben Goldberg, Sameh El-Ansary, ...",Udacity,,,
1,https://www.udacity.com/course/data-analyst-na...,nd002,Data Analyst,intermediate,nanodegree,"Use Python, SQL, and statistics to uncover ins...",4 Months,"[Data Wrangling, Matplotlib, Bootstrapping, ...",Python & SQL,"[Josh Bernhard , Sebastian Thrun, Derek Steer,...",Udacity,,,
2,https://www.udacity.com/course/product-manager...,nd036,Product Manager,beginner,nanodegree,Envision and execute the development of indust...,4 Months,"[Product Strategy, Product Design, Product D...",No Experience Required,"[Anastasia Root, Alex King, Yuva Murugan]",Udacity,,,


In [27]:
#sample of Udemy courses
df.tail(3)

Unnamed: 0,url,id,name,level,category,about,duration,skills,prerequisites,instructors,source,price,description,Syllabus
273,https://www.udemy.com/course/wordpress-mastery/,14727,WordPress Essentials,,course,"Learn WordPress basics, tips and tricks to run...",,,,[Stone River eLearning],Udemy,$49.99,"Learn WordPress basics, tips and tricks to run...","[Welcome to WordPress Mastery!, Ready for Some..."
274,https://www.udemy.com/course/plein-air-paintin...,15574,Plein Air Essentials - Learn Basics of Plein A...,,course,Learn the essentials of plein air painting in ...,,,,[Michael Chesley Johnson AIS PSA MPAC],Udemy,$19.99,Learn the essentials of plein air painting in ...,"[Introduction, Introduction, Manual, Equipment..."
275,https://www.udemy.com/course/building-your-pla...,14219,Building Your Platform,,course,What Businesses of All Sizes and Types Can Lea...,,,,[Phil Simon],Udemy,$49.99,What Businesses of All Sizes and Types Can Lea...,"[Course, The Age of the Platform in Three Minu..."


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276 entries, 0 to 275
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            276 non-null    object
 1   id             276 non-null    object
 2   name           276 non-null    object
 3   level          265 non-null    object
 4   category       276 non-null    object
 5   about          276 non-null    object
 6   duration       265 non-null    object
 7   skills         265 non-null    object
 8   prerequisites  265 non-null    object
 9   instructors    276 non-null    object
 10  source         276 non-null    object
 11  price          11 non-null     object
 12  description    11 non-null     object
 13  Syllabus       11 non-null     object
dtypes: object(14)
memory usage: 30.3+ KB


In [32]:
df.columns

Index(['url', 'id', 'name', 'level', 'category', 'about', 'duration', 'skills',
       'prerequisites', 'instructors', 'source', 'price', 'description',
       'Syllabus'],
      dtype='object')

In [33]:
df.describe().T

Unnamed: 0,count,unique,top,freq
url,276,276,https://www.udacity.com/course/ios-interview-p...,1
id,276,276,ud884,1
name,276,275,Artificial Intelligence,2
level,265,4,intermediate,151
category,276,3,free,194
about,276,267,,6
duration,265,26,4 Months,39
skills,265,247,[],13
prerequisites,265,58,,194
instructors,276,80,[],195


In [34]:
df.dtypes

url              object
id               object
name             object
level            object
category         object
about            object
duration         object
skills           object
prerequisites    object
instructors      object
source           object
price            object
description      object
Syllabus         object
dtype: object

### Handling missing values

In [36]:
#Looking out for missing values and handling them
df.isnull().sum()

url                0
id                 0
name               0
level             11
category           0
about              0
duration          11
skills            11
prerequisites     11
instructors        0
source             0
price            265
description      265
Syllabus         265
dtype: int64

In [92]:
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
df['price'].replace(np.nan, 0)

0       0.00
1       0.00
2       0.00
3       0.00
4       0.00
       ...  
326    19.99
327    49.99
328    49.99
329    19.99
330    49.99
Name: price, Length: 331, dtype: float64

In [84]:
df['duration'] = df['duration'].str.replace('Months', '')
df['duration'] = df['duration'].str.replace('Hours', '')
df['duration'] = df['duration'].str.replace('Month', '')
df['duration'] = df['duration'].str.replace('Weeks', '')
df['duration'] = df['duration'].str.replace('Week', '')
df['duration'] = df['duration'].str.replace('Hour', '')
df['duration'] = pd.to_numeric(df['duration'],errors='coerce')
df['duration'].head(3)

0    5.0
1    4.0
2    4.0
Name: duration, dtype: float64

## Data Splitting

Now it's time to split the dataset for the training step. Typically the dataset is split into 3 subsets, namely, the training, validation and test sets. In our case, the test set will be **Udacity Courses** as thier prices are hidden with Udacity and cant be scarpping . So we'll split the "training" set into training and validation sets with 0.8:0.2 ratio. 


In [55]:
df = df.drop(columns = ['id'])
df.columns

Index(['url', 'name', 'level', 'category', 'about', 'duration', 'skills',
       'prerequisites', 'instructors', 'source', 'price', 'description',
       'Syllabus'],
      dtype='object')

In [95]:
X = df[['name','level','category','duration' ,'source' ]]
y =df['price']

In [96]:
X = pd.get_dummies(data=X, drop_first=True)

In [106]:
X.replace(np.nan, 0)

0       0.00
1       0.00
2       0.00
3       0.00
4       0.00
       ...  
326    19.99
327    49.99
328    49.99
329    19.99
330    49.99
Name: price, Length: 331, dtype: float64

In [107]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = .20, random_state = 40)

## Model Training

Let's train a model with the data! We'll train a Random Forest Classifier to demonstrate the process of making submissions. 

In [109]:
# Create an instance of the model
model = LinearRegression()
 
# Train the model
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
predicted

In [None]:
print("The accuracy of the model on the validation set is ", (classifier.score(X_val, y_val)))

## Model Prediction

In [None]:
y_test_predicted = model.predict(X_test)

test_df['price'] = y_test_predicted

test_df.head()