# Project: Price Mechanism
Objective: build a model to predict the price based on scrapped Udemy data <br> 
Models: Multiple Linear Regression, Random Forest Regression, Gradient Boosting (XGBoost, LightGBM) <br> 
Evaluation Metric: Root Mean Squared Error (RSME), Mean Absolute Error (MAE) <br>

## Packages

In [1]:
# Data handling
import pandas as pd
import numpy as np

# Preprocessing
import category_encoders as ce
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split

# Models
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression

# Metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

## Preprocessing

In [2]:
# Import data
data_clean = pd.read_csv("./Data/Udemy_Clean.csv", index_col=0)

# Transform discounted_price variable
data_clean["Discounted"] = data_clean["Discounted_Price"] != data_clean["Price"]
# Remove unused discounted_price variable
data_clean.drop(columns = ["Discounted_Price"], inplace=True)
data_clean.set_index("Title", inplace=True)
# Inspect data
data_clean.head()

Unnamed: 0_level_0,Overall_Rating,Best_Rating,Worst_Rating,No_of_Ratings,Category,Subcategory,Topic,Instructor,Language,SkillsFuture,No_of_Practice_Test,No_of_Articles,No_of_Coding_Exercises,Video_Duration_Hr,No_of_Additional_Resources,Bestseller,Price,Discounted
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Complete Hypnotherapy & Hypnosis Certification Diploma,4.7,5,0.5,3524,Lifestyle,Esoteric Practices,Hypnotherapy,Dr Karen E Wells,English,False,0,4,0,3.0,0,Yes,104.98,True
Pinterest Marketing for Wedding Professionals 2020,5.0,5,0.5,1,Marketing,Social Media Marketing,Pinterest Marketing,Staci Nichols,English,False,0,0,0,0.6,2,No,29.98,True
Master the Telephone Sales- Cold calling Secrets,4.5,5,0.5,3,Marketing,Product Marketing,Marketing Strategy,Sanjay Bhasin,English,False,0,0,0,0.733333,0,No,29.98,True
5 Practical Management concepts you MUST know,5.0,5,0.5,2,Personal Development,Leadership,Management Skills,Vasudev Murthy,English,False,0,0,0,2.0,0,No,49.98,True
Fermented Foods Mastery,4.5,5,0.5,187,Health & Fitness,Nutrition,Fermented Foods,Kale Brock,English,False,0,3,0,1.5,12,No,68.98,True


Numeric and categorical variables will be preprocessed separately and then concatenated together afterwards. 

In [3]:
# Define response and explanatory variables
y = pd.DataFrame(data_clean["Price"])
X_raw = data_clean[data_clean.columns.drop('Price')]

# Separate numeric and categorical variables for X
X_numeric = X_raw.select_dtypes(exclude=["object", "boolean"])
X_categorical = X_raw.select_dtypes(include=["object", "boolean"])

### Categorical Encoding
For variable encoding, we will be deploying the binary encoder. This is because this dataset has many categorical variables with high cardinality, hence it would be more suitable for this dataset as it uses fewer features as compaerd to one-hot encoding. 

In [4]:
# Create encoder
encoder = ce.BinaryEncoder(cols=X_categorical.columns, return_df=True)
# Fit and transform data 
X_categorical_encoded = encoder.fit_transform(X_categorical)
X_categorical_encoded.reset_index(inplace=True)
X_categorical_encoded.head()

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Title,Category_0,Category_1,Category_2,Category_3,Category_4,Subcategory_0,Subcategory_1,Subcategory_2,Subcategory_3,...,Instructor_12,Instructor_13,Instructor_14,Language_0,SkillsFuture_0,SkillsFuture_1,Bestseller_0,Bestseller_1,Discounted_0,Discounted_1
0,Complete Hypnotherapy & Hypnosis Certification...,0,0,0,0,1,0,0,0,0,...,0,0,1,1,0,1,0,1,0,1
1,Pinterest Marketing for Wedding Professionals ...,0,0,0,1,0,0,0,0,0,...,0,1,0,1,0,1,1,0,0,1
2,Master the Telephone Sales- Cold calling Secrets,0,0,0,1,0,0,0,0,0,...,0,1,1,1,0,1,1,0,0,1
3,5 Practical Management concepts you MUST know,0,0,0,1,1,0,0,0,0,...,1,0,0,1,0,1,1,0,0,1
4,Fermented Foods Mastery,0,0,1,0,0,0,0,0,0,...,1,0,1,1,0,1,1,0,0,1


### Outliers

In [5]:
# Outlier detection 
lof = LocalOutlierFactor()
yhat = pd.DataFrame(lof.fit_predict(X_numeric), columns=["outliers_d"])
outliers_index = yhat[yhat["outliers_d"]==-1].index
outliers_index

Int64Index([    8,    20,    31,    32,    71,    73,    76,    78,    80,
               94,
            ...
            16255, 16263, 16329, 16339, 16347, 16368, 16390, 16401, 16413,
            16427],
           dtype='int64', length=1522)

In [6]:
# Combine both numeric and categorical variables back into one dataframe
X_numeric.reset_index(inplace=True)
X = pd.concat([X_numeric, X_categorical_encoded], axis=1)
y.reset_index(inplace=True)

# Drop outliers for both X and y
X.drop(outliers_index, axis=0, inplace=True)
y.drop(outliers_index, axis=0, inplace=True)

# Double check that it has been dropped
print(len(X))
print(len(y))

14907
14907


In [7]:
# Reset index as title for both 
X.set_index("Title", inplace=True)
y.set_index("Title", inplace=True)

### Train Test Split
Now that we have removed the outliers and prepared the categorical variables, we can split the dataset into the training and validation set for model development and evaluation. 

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Model Development

### (1) Multiple Linear Regression Model

In [9]:
# Copy training and testing variables
X_train_mlr = X_train.copy()
X_test_mlr = X_test.copy()
y_train_mlr = y_train.copy()
y_test_mlr = y_test.copy()

# Build model function
def mlr_model(X_train, X_test, y_train, y_test): 
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print("Score: {0:0.3f}".format(model.score(X_train, y_train)))
    return predictions

# First initial model
predictions_1 = mlr_model(X_train_mlr, X_test_mlr, y_train_mlr, y_test_mlr)

Score: 0.151


As the score of the model is undesirable, we can perform feature selection to see if it can improve the fit of the model. 

#### Feature selection 

In [10]:
# Configure to select features based on correlation
fs_1 = SelectKBest(score_func=f_regression, k=15)
fs_1.fit(X_train_mlr, y_train_mlr)
X_train_fs1 = fs_1.transform(X_train_mlr)
X_test_fs1 = fs_1.transform(X_test_mlr)

predictions_2 = mlr_model(X_train_fs1, X_test_fs1, y_train_mlr, y_test_mlr)

Score: 0.137


  return f(*args, **kwargs)
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


In [11]:
# Configure to select features based on correlation
fs_1 = SelectKBest(score_func=mutual_info_regression, k=15)
fs_1.fit(X_train_mlr, y_train_mlr)
X_train_fs1 = fs_1.transform(X_train_mlr)
X_test_fs1 = fs_1.transform(X_test_mlr)

predictions_3 = mlr_model(X_train_fs1, X_test_fs1, y_train_mlr, y_test_mlr)

  return f(*args, **kwargs)


Score: 0.130


It appears that the best multiple linear regression model would be the original model without feature selection

In [31]:
# Final model 
mlr_model = LinearRegression()
mlr_model.fit(X_train_mlr, y_train_mlr)
mlr_predictions = mlr_model.predict(X_test_mlr)

### (2) Random Forest Regression

In [32]:
# Copying training and testing data 
X_train_rfr = X_train.copy()
X_test_rfr = X_test.copy()
y_train_rfr = y_train.copy()
y_test_rfr = y_test.copy()

# Initial model 
model_rfr1 = RandomForestRegressor(random_state=0)
model_rfr1.fit(X_train_rfr, y_train_rfr.values.ravel())
predictions_rfr1 = model_rfr1.predict(X_test_rfr)
mae_rfr1 = mean_absolute_error(y_test_rfr, predictions_rfr1)
print("MAE: {0:0.3f}".format(mae_rfr1))

  model_rfr1.fit(X_train_rfr, y_train_rfr)


MAE: 31.361


In [34]:
# First model improvement 
for n in range(100, 1100, 100):
    model_rfr2 = RandomForestRegressor(n_estimators=n, random_state=0)
    model_rfr2.fit(X_train_rfr, y_train_rfr.values.ravel())
    predictions_rfr2 = model_rfr2.predict(X_test_rfr)
    mae_rfr2 = mean_absolute_error(y_test_rfr, predictions_rfr2)
    print("n_estimators: {0}, MAE: {1:0.3f}".format(n, mae_rfr2))

n_estimators: 100, MAE: 31.361
n_estimators: 200, MAE: 31.300
n_estimators: 300, MAE: 31.252
n_estimators: 400, MAE: 31.218
n_estimators: 500, MAE: 31.249
n_estimators: 600, MAE: 31.236
n_estimators: 700, MAE: 31.242
n_estimators: 800, MAE: 31.234
n_estimators: 900, MAE: 31.237
n_estimators: 1000, MAE: 31.245


Optimal n_estimators = 400. 

In [37]:
# Second model improvement
for max_depth in range(10, 60, 10): 
    model_rfr3 = RandomForestRegressor(n_estimators=400, max_depth=max_depth, random_state=0)
    model_rfr3.fit(X_train_rfr, y_train_rfr.values.ravel())
    predictions_rfr3 = model_rfr3.predict(X_test_rfr)
    mae_rfr3 = mean_absolute_error(y_test_rfr, predictions_rfr3)
    print("max_depth: {0}, MAE: {1:0.3f}".format(max_depth, mae_rfr3))

max_depth: 10, MAE: 31.726
max_depth: 20, MAE: 31.210
max_depth: 30, MAE: 31.219
max_depth: 40, MAE: 31.218
max_depth: 50, MAE: 31.218


Optimal max_depth = 20

In [46]:
# 3rd model improvement
for min_samples_leaf in range(1, 6, 1): 
    model_rfr4 = RandomForestRegressor(n_estimators=400, max_depth=20, random_state=0, 
                                       min_samples_leaf=min_samples_leaf)
    model_rfr4.fit(X_train_rfr, y_train_rfr.values.ravel())
    predictions_rfr4 = model_rfr4.predict(X_test_rfr)
    mae_rfr4 = mean_absolute_error(y_test_rfr, predictions_rfr4)
    print("min_samples_leaf: {0}, MAE: {1:0.3f}".format(min_samples_leaf, mae_rfr4))

min_samples_leaf: 1, MAE: 31.210
min_samples_leaf: 2, MAE: 31.163
min_samples_leaf: 3, MAE: 31.177
min_samples_leaf: 4, MAE: 31.216
min_samples_leaf: 5, MAE: 31.297


Optimal min_samples_leaf = 2

In [48]:
# 4th model improvement
for max_leaf_nodes in range(1000,1600,100): 
    model_rfr5 = RandomForestRegressor(n_estimators=400, max_depth=20, min_samples_leaf=2, 
                                      max_leaf_nodes=max_leaf_nodes, random_state=0)
    model_rfr5.fit(X_train_rfr, y_train_rfr.values.ravel())
    predictions_rfr5 = model_rfr5.predict(X_test_rfr)
    mae_rfr5 = mean_absolute_error(y_test_rfr, predictions_rfr5)
    print("max_leaf_nodes: {0}, MAE: {1:0.3f}".format(max_leaf_nodes, mae_rfr5))

max_leaf_nodes: 1000, MAE: 31.032
max_leaf_nodes: 1100, MAE: 31.024
max_leaf_nodes: 1200, MAE: 31.027
max_leaf_nodes: 1300, MAE: 31.032
max_leaf_nodes: 1400, MAE: 31.037
max_leaf_nodes: 1500, MAE: 31.050


Optimal max_leaf_nodes = 1100

In [49]:
# Final model 
rfr_model = RandomForestRegressor(n_estimators=400, max_depth=20, min_samples_leaf=2, 
                                 max_leaf_nodes=1100, random_state=0)
rfr_model.fit(X_train_rfr, y_train_rfr.values.ravel())
rfr_predictions = rfr_model.predict(X_test_rfr)

### (3) Gradient Boosting Model (XGBoost)

In [53]:
# Copying training and testing data 
X_train_xgb = X_train.copy()
X_test_xgb = X_test.copy()
y_train_xgb = y_train.copy()
y_test_xgb = y_test.copy()

# Inital model 
model_xgb = XGBRegressor(random_state=0)
model_xgb.fit(X_train_xgb, y_train_xgb)
predictions_xgb = model_xgb.predict(X_test_xgb)
mae_xgb = mean_absolute_error(y_test_xgb, predictions_xgb)
print("MAE: {0:0.3f}".format(mae_xgb))

MAE: 31.015


In [57]:
# 1st model improvement
for n in range(100, 111, 1):
    model_xgb1 = XGBRegressor(n_estimators=n, random_state=0)
    model_xgb1.fit(X_train_xgb, y_train_xgb)
    predictions_xgb1 = model_xgb1.predict(X_test_xgb)
    mae_xgb1 = mean_absolute_error(y_test_xgb, predictions_xgb1)
    print("n_estimators: {0}, MAE: {1:0.3f}".format(n, mae_xgb1))

n_estimators: 100, MAE: 31.015
n_estimators: 101, MAE: 31.012
n_estimators: 102, MAE: 30.946
n_estimators: 103, MAE: 30.947
n_estimators: 104, MAE: 30.899
n_estimators: 105, MAE: 30.910
n_estimators: 106, MAE: 30.887
n_estimators: 107, MAE: 30.872
n_estimators: 108, MAE: 30.857
n_estimators: 109, MAE: 30.867
n_estimators: 110, MAE: 30.872


Optimal n_estimators = 108.

In [60]:
# 2nd model improvement 
for learning_rate in range(20, 31, 1): 
    learning_rate = learning_rate / 100
    model_xgb2 = XGBRegressor(n_estimators=108, learning_rate=learning_rate, random_state=0)
    model_xgb2.fit(X_train_xgb, y_train_xgb)
    predictions_xgb2 = model_xgb2.predict(X_test_xgb)
    mae_xgb2 = mean_absolute_error(y_test_xgb, predictions_xgb2)
    print("learning_rate: {0}, MAE: {1:0.3f}".format(learning_rate, mae_xgb2))

learning_rate: 0.2, MAE: 30.842
learning_rate: 0.21, MAE: 30.843
learning_rate: 0.22, MAE: 31.183
learning_rate: 0.23, MAE: 30.530
learning_rate: 0.24, MAE: 30.834
learning_rate: 0.25, MAE: 30.678
learning_rate: 0.26, MAE: 31.070
learning_rate: 0.27, MAE: 30.901
learning_rate: 0.28, MAE: 30.801
learning_rate: 0.29, MAE: 30.819
learning_rate: 0.3, MAE: 30.857


Optimal learning_rate = 0.23

In [62]:
# 3rd model improvement
for max_depth in range(1, 11, 1): 
    model_xgb3 = XGBRegressor(n_estimators=108, learning_rate=0.23, max_depth=max_depth, random_state=0)
    model_xgb3.fit(X_train_xgb, y_train_xgb)
    predictions_xgb3 = model_xgb3.predict(X_test_xgb)
    mae_xgb3 = mean_absolute_error(y_test_xgb, predictions_xgb3)
    print("max_depth: {0}, MAE: {1:0.3f}".format(max_depth, mae_xgb3))

max_depth: 1, MAE: 33.066
max_depth: 2, MAE: 32.368
max_depth: 3, MAE: 32.041
max_depth: 4, MAE: 31.554
max_depth: 5, MAE: 31.173
max_depth: 6, MAE: 30.530
max_depth: 7, MAE: 30.417
max_depth: 8, MAE: 30.691
max_depth: 9, MAE: 30.426
max_depth: 10, MAE: 30.965


Optimal max_depth = 7

In [74]:
# 4th model improvement
for min_child_weight in range(10, 21, 1): 
    model_xgb4 = XGBRegressor(n_estimators=108, learning_rate=0.23, max_depth=7, min_child_weight=min_child_weight, 
                              random_state=0)
    model_xgb4.fit(X_train_xgb, y_train_xgb)
    predictions_xgb4 = model_xgb4.predict(X_test_xgb)
    mae_xgb4 = mean_absolute_error(y_test_xgb, predictions_xgb4)
    print("min_child_weight: {0}, MAE: {1:0.3f}".format(min_child_weight, mae_xgb4))

min_child_weight: 10, MAE: 30.407
min_child_weight: 11, MAE: 30.513
min_child_weight: 12, MAE: 30.812
min_child_weight: 13, MAE: 30.624
min_child_weight: 14, MAE: 30.427
min_child_weight: 15, MAE: 30.613
min_child_weight: 16, MAE: 30.723
min_child_weight: 17, MAE: 30.400
min_child_weight: 18, MAE: 30.647
min_child_weight: 19, MAE: 30.597
min_child_weight: 20, MAE: 30.842


Optimal min_child_weight = 17

In [71]:
# 5th model improvement 
for alpha in range(1, 11, 1): 
    alpha = alpha / 10
    model_xgb5 = XGBRegressor(n_estimators=108, learning_rate=0.23, max_depth=7, min_child_weight=17, 
                             reg_alpha=alpha, random_state=0)
    model_xgb5.fit(X_train_xgb, y_train_xgb)
    predictions_xgb5 = model_xgb5.predict(X_test_xgb)
    mae_xgb5 = mean_absolute_error(y_test_xgb, predictions_xgb5)
    print("Alpha: {0}, MAE: {1:0.3f}".format(alpha, mae_xgb5))

Alpha: 0.1, MAE: 30.679
Alpha: 0.2, MAE: 30.646
Alpha: 0.3, MAE: 30.923
Alpha: 0.4, MAE: 30.809
Alpha: 0.5, MAE: 30.954
Alpha: 0.6, MAE: 30.559
Alpha: 0.7, MAE: 30.365
Alpha: 0.8, MAE: 30.338
Alpha: 0.9, MAE: 30.430
Alpha: 1.0, MAE: 30.471


Optimal alpha = 0.8

In [72]:
# 6th model improvement 
for lambd in range(1, 11, 1): 
    lambd = lambd / 10
    model_xgb6 = XGBRegressor(n_estimators=108, learning_rate=0.23, max_depth=7, min_child_weight=17, 
                              reg_alpha=0.8, reg_lambda=lambd, random_state=0)
    model_xgb6.fit(X_train_xgb, y_train_xgb)
    predictions_xgb6 = model_xgb6.predict(X_test_xgb)
    mae_xgb6 = mean_absolute_error(y_test_xgb, predictions_xgb6)
    print("Lambda: {0}, MAE: {1:0.3f}".format(lambd, mae_xgb6))

Lambda: 0.1, MAE: 30.404
Lambda: 0.2, MAE: 30.770
Lambda: 0.3, MAE: 30.355
Lambda: 0.4, MAE: 30.912
Lambda: 0.5, MAE: 30.822
Lambda: 0.6, MAE: 30.515
Lambda: 0.7, MAE: 30.718
Lambda: 0.8, MAE: 30.714
Lambda: 0.9, MAE: 30.704
Lambda: 1.0, MAE: 30.338


In [73]:
# Final model 
xgb_model = XGBRegressor(n_estimators=108, learning_rate=0.23, max_depth=7, min_child_weight=17, 
                        reg_alpha=0.8, random_state=0)
xgb_model.fit(X_train_xgb, y_train_xgb)
xgb_predictions = xgb_model.predict(X_test_xgb)

### (4) Gradient Boosting Model (LightGBM)

In [77]:
# Copying training and testing data
X_train_lgbm = X_train.copy()
X_test_lgbm = X_test.copy()
y_train_lgbm = y_train.copy()
y_test_lgbm = y_test.copy()

# Initial model
model_lgbm = LGBMRegressor(random_state=0)
model_lgbm.fit(X_train_lgbm, y_train_lgbm)
predictions_lgbm = model_lgbm.predict(X_test_lgbm)
mae_lgbm = mean_absolute_error(y_test_lgbm, predictions_lgbm)
print("MAE: {0:0.3f}".format(mae_lgbm))

MAE: 31.237


In [80]:
# 1st model improvement
for n in range(1000, 2100, 100):
    model_lgbm1 = LGBMRegressor(n_estimators=n, random_state=0)
    model_lgbm1.fit(X_train_lgbm, y_train_lgbm)
    predictions_lgbm1 = model_lgbm1.predict(X_test_lgbm)
    mae_lgbm1 = mean_absolute_error(y_test_lgbm, predictions_lgbm1)
    print("n_estimators: {0}, MAE: {1:0.3f}".format(n, mae_lgbm1))

n_estimators: 1000, MAE: 29.988
n_estimators: 1100, MAE: 29.949
n_estimators: 1200, MAE: 29.941
n_estimators: 1300, MAE: 29.916
n_estimators: 1400, MAE: 29.886
n_estimators: 1500, MAE: 29.875
n_estimators: 1600, MAE: 29.878
n_estimators: 1700, MAE: 29.889
n_estimators: 1800, MAE: 29.898
n_estimators: 1900, MAE: 29.897
n_estimators: 2000, MAE: 29.918


Optimal n_estimators = 1500

In [82]:
# 2nd model improvement
for max_depth in range(1, 11, 1): 
    model_lgbm2 = LGBMRegressor(n_estimators=1500, max_depth=max_depth, random_state=0)
    model_lgbm2.fit(X_train_lgbm, y_train_lgbm)
    predictions_lgbm2 = model_lgbm2.predict(X_test_lgbm)
    mae_lgbm2 = mean_absolute_error(y_test_lgbm, predictions_lgbm2)
    print("max_depth: {0}, MAE: {1:0.3f}".format(max_depth, mae_lgbm2))

max_depth: 1, MAE: 32.883
max_depth: 2, MAE: 32.261
max_depth: 3, MAE: 31.599
max_depth: 4, MAE: 31.055
max_depth: 5, MAE: 30.456
max_depth: 6, MAE: 30.281
max_depth: 7, MAE: 29.873
max_depth: 8, MAE: 29.737
max_depth: 9, MAE: 29.748
max_depth: 10, MAE: 29.945


Optimal max_depth = 8

In [85]:
# 3rd model improvement 
for learning_rate in range(1, 11, 1): 
    learning_rate = learning_rate / 100
    model_lgbm3 = LGBMRegressor(n_estimators=1500, max_depth=8, learning_rate=learning_rate, random_state=0)
    model_lgbm3.fit(X_train_lgbm, y_train_lgbm)
    predictions_lgbm3 = model_lgbm3.predict(X_test_lgbm)
    mae_lgbm3 = mean_absolute_error(y_test_lgbm, predictions_lgbm3)
    print("learning_rate: {0}, MAE: {1:0.3f}".format(learning_rate, mae_lgbm3))

learning_rate: 0.01, MAE: 31.041
learning_rate: 0.02, MAE: 30.614
learning_rate: 0.03, MAE: 30.382
learning_rate: 0.04, MAE: 30.146
learning_rate: 0.05, MAE: 29.951
learning_rate: 0.06, MAE: 29.923
learning_rate: 0.07, MAE: 29.903
learning_rate: 0.08, MAE: 29.854
learning_rate: 0.09, MAE: 29.836
learning_rate: 0.1, MAE: 29.737


In [90]:
# 4th model improvement
for min_child_weight in range(10, 60, 10): 
    model_lgbm4 = LGBMRegressor(n_estimators=1500, max_depth=8, min_child_weight=min_child_weight, random_state=0)
    model_lgbm4.fit(X_train_lgbm, y_train_lgbm)
    predictions_lgbm4 = model_lgbm4.predict(X_test_lgbm)
    mae_lgbm4 = mean_absolute_error(y_test_lgbm, predictions_lgbm4)
    print("min_child_weight: {0}, MAE: {1:0.3f}".format(min_child_weight, mae_lgbm4))

min_child_weight: 10, MAE: 29.737
min_child_weight: 20, MAE: 29.737
min_child_weight: 30, MAE: 29.690
min_child_weight: 40, MAE: 29.956
min_child_weight: 50, MAE: 30.218


Optimal min_child_weight = 30

In [97]:
# 5th model improvement 
for lambd in range(30, 41, 1): 
    lambd = lambd/100
    model_lgbm5 = LGBMRegressor(n_estimators=1500, max_depth=8, min_child_weight=30, reg_lambda=lambd, 
                               random_state=0)
    model_lgbm5.fit(X_train_lgbm, y_train_lgbm)
    predictions_lgbm5 = model_lgbm5.predict(X_test_lgbm)
    mae_lgbm5 = mean_absolute_error(y_test_lgbm, predictions_lgbm5)
    print("Lambda: {0}, MAE: {1:0.3f}".format(lambd, mae_lgbm5))

Lambda: 0.3, MAE: 29.970
Lambda: 0.31, MAE: 29.987
Lambda: 0.32, MAE: 29.623
Lambda: 0.33, MAE: 29.833
Lambda: 0.34, MAE: 29.645
Lambda: 0.35, MAE: 29.927
Lambda: 0.36, MAE: 29.957
Lambda: 0.37, MAE: 29.871
Lambda: 0.38, MAE: 29.883
Lambda: 0.39, MAE: 29.830
Lambda: 0.4, MAE: 29.668


Optimal lambda = 0.32

In [98]:
# Final model 
lgbm_model = LGBMRegressor(n_estimators=1500, max_depth=8, min_child_weight=30, reg_lambda=0.32, random_state=0)
lgbm_model.fit(X_train_lgbm, y_train_lgbm)
lgbm_predictions = lgbm_model.predict(X_test_lgbm)