In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error , r2_score, mean_absolute_error
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
df = pd.read_csv("courses.csv")

In [3]:
df.head()

Unnamed: 0,course_id,course_title,url,price,num_subscribers,num_reviews,num_lectures,level,Rating,content_duration,published_timestamp,subject
0,41295.0,Learn HTML5 Programming From Scratch,https://www.udemy.com/learn-html5-programming-...,0.0,268923.0,8629.0,45.0,Beginner Level,0.82,10.5,2013-02-14T07:03:41Z,Subject: Web Development
1,59014.0,Coding for Entrepreneurs Basic,https://www.udemy.com/coding-for-entrepreneurs...,0.0,161029.0,279.0,27.0,Expert Level,0.69,3.5,2013-06-09T15:51:55Z,Subject: Web Development
2,625204.0,The Web Developer Bootcamp,https://www.udemy.com/the-web-developer-bootcamp/,200.0,121584.0,27445.0,342.0,Beginner Level,0.89,43.0,2015-11-02T21:13:27Z,Subject: Web Development
3,173548.0,Build Your First Website in 1 Week with HTML5 ...,https://www.udemy.com/build-your-first-website...,0.0,120291.0,5924.0,30.0,All Levels,0.78,3.0,2014-04-08T16:21:30Z,Subject: Web Development
4,764164.0,The Complete Web Developer Course 2.0,https://www.udemy.com/the-complete-web-develop...,200.0,114512.0,22412.0,304.0,Beginner Level,0.55,30.5,2016-03-08T22:28:36Z,Subject: Web Development


In [4]:
np.sum(df.isna())

course_id              4
course_title           4
url                    4
price                  4
num_subscribers        4
num_reviews            4
num_lectures           4
level                  4
Rating                 4
content_duration       4
published_timestamp    4
subject                3
dtype: int64

In [5]:
df = df.dropna() 

In [6]:
df.head()

Unnamed: 0,course_id,course_title,url,price,num_subscribers,num_reviews,num_lectures,level,Rating,content_duration,published_timestamp,subject
0,41295.0,Learn HTML5 Programming From Scratch,https://www.udemy.com/learn-html5-programming-...,0.0,268923.0,8629.0,45.0,Beginner Level,0.82,10.5,2013-02-14T07:03:41Z,Subject: Web Development
1,59014.0,Coding for Entrepreneurs Basic,https://www.udemy.com/coding-for-entrepreneurs...,0.0,161029.0,279.0,27.0,Expert Level,0.69,3.5,2013-06-09T15:51:55Z,Subject: Web Development
2,625204.0,The Web Developer Bootcamp,https://www.udemy.com/the-web-developer-bootcamp/,200.0,121584.0,27445.0,342.0,Beginner Level,0.89,43.0,2015-11-02T21:13:27Z,Subject: Web Development
3,173548.0,Build Your First Website in 1 Week with HTML5 ...,https://www.udemy.com/build-your-first-website...,0.0,120291.0,5924.0,30.0,All Levels,0.78,3.0,2014-04-08T16:21:30Z,Subject: Web Development
4,764164.0,The Complete Web Developer Course 2.0,https://www.udemy.com/the-complete-web-develop...,200.0,114512.0,22412.0,304.0,Beginner Level,0.55,30.5,2016-03-08T22:28:36Z,Subject: Web Development


In [7]:
df = df.drop(columns = ["course_id", "course_title", "url", "published_timestamp"])

In [8]:
df.head()

Unnamed: 0,price,num_subscribers,num_reviews,num_lectures,level,Rating,content_duration,subject
0,0.0,268923.0,8629.0,45.0,Beginner Level,0.82,10.5,Subject: Web Development
1,0.0,161029.0,279.0,27.0,Expert Level,0.69,3.5,Subject: Web Development
2,200.0,121584.0,27445.0,342.0,Beginner Level,0.89,43.0,Subject: Web Development
3,0.0,120291.0,5924.0,30.0,All Levels,0.78,3.0,Subject: Web Development
4,200.0,114512.0,22412.0,304.0,Beginner Level,0.55,30.5,Subject: Web Development


In [9]:
np.sum(df['level'].unique())

'Beginner LevelExpert LevelAll LevelsIntermediate Level'

In [10]:
df['level'].value_counts()

All Levels            1925
Beginner Level        1271
Intermediate Level     422
Expert Level            58
Name: level, dtype: int64

In [11]:
np.sum(df['level'].value_counts())

3676

In [12]:
df.corr()

Unnamed: 0,price,num_subscribers,num_reviews,num_lectures,Rating,content_duration
price,1.0,0.050555,0.113423,0.330233,0.031643,0.293245
num_subscribers,0.050555,1.0,0.650761,0.158092,-0.007353,0.161844
num_reviews,0.113423,0.650761,1.0,0.242986,0.004137,0.228842
num_lectures,0.330233,0.158092,0.242986,1.0,-0.03717,0.80163
Rating,0.031643,-0.007353,0.004137,-0.03717,1.0,0.00065
content_duration,0.293245,0.161844,0.228842,0.80163,0.00065,1.0


In [13]:
df_encoded = pd.get_dummies(df, columns=['level'], prefix='level')

In [14]:
df_encoded

Unnamed: 0,price,num_subscribers,num_reviews,num_lectures,Rating,content_duration,subject,level_All Levels,level_Beginner Level,level_Expert Level,level_Intermediate Level
0,0.0,268923.0,8629.0,45.0,0.82,10.500000,Subject: Web Development,0,1,0,0
1,0.0,161029.0,279.0,27.0,0.69,3.500000,Subject: Web Development,0,0,1,0
2,200.0,121584.0,27445.0,342.0,0.89,43.000000,Subject: Web Development,0,1,0,0
3,0.0,120291.0,5924.0,30.0,0.78,3.000000,Subject: Web Development,1,0,0,0
4,200.0,114512.0,22412.0,304.0,0.55,30.500000,Subject: Web Development,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
3675,20.0,0.0,0.0,23.0,0.31,1.500000,Musical Instruments,1,0,0,0
3676,125.0,0.0,0.0,7.0,0.06,0.633333,Musical Instruments,0,1,0,0
3677,20.0,0.0,0.0,6.0,0.08,0.533333,Musical Instruments,1,0,0,0
3678,35.0,0.0,0.0,13.0,0.78,0.516667,Musical Instruments,0,1,0,0


In [15]:
df['subject'].unique()

array(['Subject: Web Development', 'Business Finance', 'Graphic Design',
       'Musical Instruments'], dtype=object)

In [16]:
np.sum(df['level'].unique())

'Beginner LevelExpert LevelAll LevelsIntermediate Level'

In [17]:
df['subject'].value_counts()

Subject: Web Development    1203
Business Finance            1191
Musical Instruments          680
Graphic Design               602
Name: subject, dtype: int64

In [18]:
df_encoded = pd.get_dummies(df_encoded, columns=['subject'], prefix='subject')

In [19]:
df_encoded

Unnamed: 0,price,num_subscribers,num_reviews,num_lectures,Rating,content_duration,level_All Levels,level_Beginner Level,level_Expert Level,level_Intermediate Level,subject_Business Finance,subject_Graphic Design,subject_Musical Instruments,subject_Subject: Web Development
0,0.0,268923.0,8629.0,45.0,0.82,10.500000,0,1,0,0,0,0,0,1
1,0.0,161029.0,279.0,27.0,0.69,3.500000,0,0,1,0,0,0,0,1
2,200.0,121584.0,27445.0,342.0,0.89,43.000000,0,1,0,0,0,0,0,1
3,0.0,120291.0,5924.0,30.0,0.78,3.000000,1,0,0,0,0,0,0,1
4,200.0,114512.0,22412.0,304.0,0.55,30.500000,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3675,20.0,0.0,0.0,23.0,0.31,1.500000,1,0,0,0,0,0,1,0
3676,125.0,0.0,0.0,7.0,0.06,0.633333,0,1,0,0,0,0,1,0
3677,20.0,0.0,0.0,6.0,0.08,0.533333,1,0,0,0,0,0,1,0
3678,35.0,0.0,0.0,13.0,0.78,0.516667,0,1,0,0,0,0,1,0


In [20]:
X = df_encoded.drop(columns=['Rating'])
y = df_encoded['Rating']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression

In [22]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

LinearRegression()

In [23]:
y_pred = linear_model.predict(X_test)

In [24]:
mse = mean_squared_error(y_test, y_pred)

In [25]:
mse

0.09070813365775134

In [26]:
r2 = r2_score(y_test, y_pred)

In [27]:
r2

0.17610871119739147

In [28]:
linear_coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': linear_model.coef_})


In [29]:
linear_coefficients

Unnamed: 0,Feature,Coefficient
0,price,-4e-05
1,num_subscribers,-3e-06
2,num_reviews,1.5e-05
3,num_lectures,-8e-06
4,content_duration,-0.001096
5,level_All Levels,0.008694
6,level_Beginner Level,-0.001984
7,level_Expert Level,-0.022051
8,level_Intermediate Level,0.015341
9,subject_Business Finance,0.09222


In [30]:
mae = mean_absolute_error(y_pred, y_test)

In [31]:
mae

0.2558348330661115

In [32]:
y_pred

array([ 0.29383228,  0.70104807,  0.30466831,  0.74018638,  0.70011075,
        0.64964779,  0.72733548,  0.7016515 ,  0.32256622,  0.66110548,
        0.72036306,  0.65541349,  0.69971595,  0.31946544,  0.30184826,
        0.31143401,  0.68480143,  0.63130558,  0.67602004,  0.58398094,
        0.6979931 ,  0.63071941,  0.65723088,  0.28945858,  0.30077836,
        0.30644344,  0.32238176,  0.687465  ,  0.67117833,  0.67227192,
        0.72541247,  0.64923275,  0.64569046,  0.67657231,  0.64521423,
        0.74076483,  0.69896062,  0.68840683,  0.69468679,  0.68841653,
        0.70210315,  0.3064428 ,  0.6975705 ,  0.64721856,  0.31898615,
        0.31803067,  0.72237755,  0.52489481,  0.73893922,  0.66967433,
        0.739176  ,  0.73103989,  0.74048285,  0.72283396,  0.7012017 ,
        0.69828491,  0.66653481,  0.65202932,  0.62986708,  0.64120726,
        0.66345286,  0.67298777,  0.69491957,  0.69785005,  0.29581761,
        0.608442  ,  0.64389981,  0.68271588,  0.67575701,  0.68

In [33]:
y_test

3049    0.11
1746    0.55
3320    0.55
2758    0.98
1693    0.74
        ... 
1428    0.76
25      0.34
3159    1.00
681     0.13
1001    0.18
Name: Rating, Length: 736, dtype: float64

# Lasso Regression


In [34]:
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

Lasso(alpha=0.1)

In [35]:
y_pred = lasso_model.predict(X_test)

In [36]:
mean_absolute_error(y_pred, y_test)

0.2967043163914568

In [37]:
r2_score(y_test, y_pred)

-0.0037406272830555753

In [38]:
mean_squared_error(y_test, y_pred)

0.11050904435417586

## Fine Tuning Lasso 

In [39]:
alpha_values = [0.01, 0.1, 0.5, 1]

lasso = Lasso()

# parameter_grid = {'alpha': alpha_values}
lasso_cv = LassoCV(alphas=alpha_values , cv=5)
lasso_cv.fit(X_train, y_train)

LassoCV(alphas=[0.01, 0.1, 0.5, 1], cv=5)

In [40]:
best_alpha = lasso_cv.alpha_


In [41]:
best_alpha

0.01

In [42]:
lasso = Lasso(0.01)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

In [43]:
r2_score(y_test, y_pred)

0.16489662006971606

In [44]:
mean_absolute_error(y_pred, y_test)

0.262458549435358

In [45]:
mean_squared_error(y_test, y_pred)

0.0919425536284615

# Random Forest 

In [46]:
rf = RandomForestRegressor()

In [47]:
rf.fit(X_train,y_train)

RandomForestRegressor()

In [48]:
y_pred = rf.predict(X_test)

In [49]:
mean_absolute_error(y_test, y_pred)

0.20118369565217395

In [50]:
mean_squared_error(y_test, y_pred)

0.06903745820760869

In [51]:
r2_score(y_test, y_pred)

0.3729409025993955

# XGBoost

In [52]:
xgb = XGBRegressor(n_estimators = 100, learning_rate = 0.3, max_depth = 5)

In [53]:
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [54]:
y_pred = xgb.predict(X_test)

In [55]:
mean_absolute_error(y_test, y_pred)

0.20285954018057648

In [56]:
mean_squared_error(y_test, y_pred)

0.0685636202117271

In [57]:
r2_score(y_test, y_pred)

0.3772447172780605

In [58]:

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # You can adjust these values
    'learning_rate': [0.3, 0.2, 0.4],  # You can adjust these values
    'max_depth': [3, 4, 5]  # You can adjust these values
}

# Create the XGBoost regressor
xgb_model = XGBRegressor()

# Initialize GridSearchCV
grid_search = GridSearchCV(xgb_model, param_grid, cv=5 )

# Perform grid search on your training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameter values
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100}


In [59]:
best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 0.06997655767175814
Mean Absolute Error: 0.20510692367168223
R-squared: 0.36441117283170765
