# Import Libraries

In [1]:
# Pandas and Numpy
import pandas as pd
import numpy as np
# Sklearn
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Load Dataset

In [2]:
all_courses_df = pd.read_csv('all_courses.csv')

In [3]:
print("The shape of the training dataset is {}.\n".format(all_courses_df.shape))

The shape of the training dataset is (282, 20).



In [4]:
all_courses_df.columns

Index(['Unnamed: 0', '_id', 'title', 'url', 'description', 'syllabus',
       'skills', 'ratings_count', 'prerequisites', 'duration', 'category',
       'level', 'schoolName', 'instructors', 'enrolled_students_count',
       'avg_rating', 'num_reviews', 'price', 'source', 'Unnamed: 19'],
      dtype='object')

In [5]:
all_courses_df.drop(['Unnamed: 19', 'Unnamed: 0', '_id'], axis=1, inplace=True)

# EDA

**First few row of training dataset**

In [6]:
all_courses_df.head()

Unnamed: 0,title,url,description,syllabus,skills,ratings_count,prerequisites,duration,category,level,schoolName,instructors,enrolled_students_count,avg_rating,num_reviews,price,source
0,Learn Serbian 102,https://www.udemy.com/course/serbian-202/,"Perfect for false beginners in Serbian, Bosnia...",[],Not Available,14,"['PC, laptop or ipad']",4.5 hours,course,beginner,Provided by Udemy Instructors,['Marina Petrovic'],120,4.5,14,$99.99,Udemy
1,Powerful Business Writing #2 – How to Write in...,https://www.udemy.com/course/write-less-say-more/,Boost your business writing skills – tips & ch...,"['Write shorter, more concise messages – with ...",Not Available,103,['The desire to write concisely and with confi...,43 mins,course,beginner,Provided by Udemy Instructors,['Caroline McDevitt'],839,3.7,103,$49.99,Udemy
2,SketchUp to LayOut,https://www.udemy.com/course/sketchuptolayout/,The essential guide to creating construction d...,['Have complete control over the look of your ...,Not Available,250,[],3.5 hours,course,beginner,Provided by Udemy Instructors,['Matt Donley'],1418,4.7,250,$49.99,Udemy
3,Learn Adobe Dreamweaver CS6 - For Absolute Beg...,https://www.udemy.com/course/learn-adobe-dream...,A beginner level course for those new to Dream...,['Get started with Dreamweaver by navigating t...,Not Available,172,['Dreamweaver CS6'],12 hours,course,beginner,Provided by Udemy Instructors,['Simon Sez IT'],12905,4.5,172,$89.99,Udemy
4,Celebrating Financial Freedom,https://www.udemy.com/course/celebrating-finan...,The Christian Get Out of Debt Course,['To get you out of debt and into prosperity u...,Not Available,22,['None'],2 hours,course,beginner,Provided by Udemy Instructors,['Jason Cabler'],125,4.8,22,$94.99,Udemy


**What about info?**

In [7]:
all_courses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   title                    282 non-null    object
 1   url                      282 non-null    object
 2   description              279 non-null    object
 3   syllabus                 282 non-null    object
 4   skills                   282 non-null    object
 5   ratings_count            282 non-null    object
 6   prerequisites            282 non-null    object
 7   duration                 279 non-null    object
 8   category                 282 non-null    object
 9   level                    282 non-null    object
 10  schoolName               282 non-null    object
 11  instructors              282 non-null    object
 12  enrolled_students_count  282 non-null    object
 13  avg_rating               282 non-null    object
 14  num_reviews              282 non-null    o

**What about Nulls?**

In [8]:
all_courses_df.isnull().sum()

title                      0
url                        0
description                3
syllabus                   0
skills                     0
ratings_count              0
prerequisites              0
duration                   3
category                   0
level                      0
schoolName                 0
instructors                0
enrolled_students_count    0
avg_rating                 0
num_reviews                0
price                      0
source                     0
dtype: int64

## Data Cleaning


Problems


In [9]:
all_courses_df['ratings_count'].unique()

array(['14', '103', '250', '172', '22', '137', '556', '2556', '121',
       '123', '18', '338', '94', '32', '186', '175', '1746', '627', '1',
       '824', '162', '97', '854', '80', '2', '8', '571', '1400', '2250',
       '700', 'Not Available', '1200', '1600', '2050', '1000', '1650',
       '850', '750', '950', '300', '450', '400', '550', '500', '350',
       '1850', '100'], dtype=object)

In [10]:
all_courses_df['avg_rating'].unique()

array(['4.5', '3.7', '4.7', '4.8', '4.6', '3.4', '4.4', '3.9', '3', '5',
       '4', '4.3', '3.8', '4.9', 'Not Available'], dtype=object)

In [11]:
all_courses_df['num_reviews'].unique()

array(['14', '103', '250', '172', '22', '137', '556', '2554', '121',
       '123', '18', '338', '94', '32', '186', '175', '1746', '627', '1',
       '824', '162', '97', '853', '80', '2', '8', '571', '1400', '2250',
       '700', 'Not Available', '1200', '1600', '2050', '1000', '1650',
       '850', '750', '950', '300', '450', '400', '550', '500', '350',
       '1850', '100'], dtype=object)

In [12]:
all_courses_df['level'].unique()

array(['beginner', 'excpert', 'intermediate', 'advanced', 'free'],
      dtype=object)

Solutions


In [13]:
# remove 'Not Available' for features avg_rating, num_reviews, ratings_count
# reomve free (wrong level) from level 
df = all_courses_df[all_courses_df['num_reviews'] != 'Not Available']
df = df[df['avg_rating'] != 'Not Available']
df = df[df['ratings_count'] != 'Not Available']
df = df[df['level'] != 'free']

In [14]:
df['num_reviews'].unique()

array(['14', '103', '250', '172', '22', '137', '556', '2554', '121',
       '123', '18', '338', '94', '32', '186', '175', '1746', '627', '1',
       '824', '162', '97', '853', '80', '2', '8', '571', '1400', '2250',
       '700', '1200', '1600', '2050', '1000', '1650', '850', '750', '950',
       '300', '450', '400', '550', '500', '350', '1850', '100'],
      dtype=object)

In [15]:
df['ratings_count'].unique()

array(['14', '103', '250', '172', '22', '137', '556', '2556', '121',
       '123', '18', '338', '94', '32', '186', '175', '1746', '627', '1',
       '824', '162', '97', '854', '80', '2', '8', '571', '1400', '2250',
       '700', '1200', '1600', '2050', '1000', '1650', '850', '750', '950',
       '300', '450', '400', '550', '500', '350', '1850', '100'],
      dtype=object)

In [16]:
df['avg_rating'].unique()

array(['4.5', '3.7', '4.7', '4.8', '4.6', '3.4', '4.4', '3.9', '3', '5',
       '4', '4.3', '3.8', '4.9'], dtype=object)

In [17]:
df['level'].unique()

array(['beginner', 'excpert', 'intermediate', 'advanced'], dtype=object)

In [18]:
# convert feature to numeric
df['avg_rating'] = df['avg_rating'].astype('float64')
df['ratings_count'] = df['ratings_count'].astype('float64')
df['num_reviews'] = df['num_reviews'].astype('float64')

## Feature Selection


In [19]:
df.columns

Index(['title', 'url', 'description', 'syllabus', 'skills', 'ratings_count',
       'prerequisites', 'duration', 'category', 'level', 'schoolName',
       'instructors', 'enrolled_students_count', 'avg_rating', 'num_reviews',
       'price', 'source'],
      dtype='object')

In [20]:
features = ['ratings_count', 'level', 'avg_rating', 'num_reviews']
y = 'price'

In [21]:
df = df[df[y] != 'None']

In [22]:
df[features].head()

Unnamed: 0,ratings_count,level,avg_rating,num_reviews
0,14.0,beginner,4.5,14.0
1,103.0,beginner,3.7,103.0
2,250.0,beginner,4.7,250.0
3,172.0,beginner,4.5,172.0
4,22.0,beginner,4.8,22.0


In [23]:
df[y].head()

0    $99.99
1    $49.99
2    $49.99
3    $89.99
4    $94.99
Name: price, dtype: object

In [24]:
price_list = []
for p in df['price']:
    try:
        price_list.append(float(p.split('$')[1])*15.73)
    except:
        price_list.append(float(p.split(' ')[1]))

In [25]:
df['new_price'] = price_list

In [26]:
df[features].head()

Unnamed: 0,ratings_count,level,avg_rating,num_reviews
0,14.0,beginner,4.5,14.0
1,103.0,beginner,3.7,103.0
2,250.0,beginner,4.7,250.0
3,172.0,beginner,4.5,172.0
4,22.0,beginner,4.8,22.0


In [27]:
df['new_price'].head()

0    1572.8427
1     786.3427
2     786.3427
3    1415.5427
4    1494.1927
Name: new_price, dtype: float64

## Model Selection


**use pipeline**


In [28]:
num_cols = ['ratings_count', 'avg_rating', 'num_reviews']
cat_cols = ['level']
y = df['new_price']

In [29]:
num_pipeline = Pipeline([('std_scaler', StandardScaler())])

cat_pipeline = Pipeline([('one_hot', OneHotEncoder())])

In [30]:
preprocessor_pipeline = ColumnTransformer([("num_transformer", num_pipeline, num_cols),
                                         ("cat_transformer", cat_pipeline, cat_cols)])

In [31]:
X = preprocessor_pipeline.fit_transform(df)

**Split Data to train and test**


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)

**Training**


In [33]:
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1, random_state=42)
sgd_reg.fit(X_train, y_train.ravel())

SGDRegressor(eta0=0.1, penalty=None, random_state=42)

**Predicting and calculate r2 score**


In [34]:
y_predict = sgd_reg.predict(X_test)

In [35]:
score = r2_score(y_test, y_predict)

In [36]:
print(score)

-0.4617500465479967


## The r2 score is very small because the data is small and I haven't enough time to work with data preperation and scrape large amount of courses  

## Thank you 