# Lightgbm model test notebook

In [4]:
# !brew install cmake libomp
# !pip install lightgbm

Running `brew update --auto-update`...
[34m==>[0m [1mHomebrew collects anonymous analytics.[0m
[1mRead the analytics documentation (and how to opt-out) here:
  [4mhttps://docs.brew.sh/Analytics[24m[0m
No analytics have been recorded yet (nor will be during this `brew` run).

[34m==>[0m [1mHomebrew is run entirely by unpaid volunteers. Please consider donating:[0m
  [4mhttps://github.com/Homebrew/brew#donations[24m

[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
actions-batch              jtbl                       prjtrellis
appwrite                   k8sgpt                     proto
asmfmt                     kin                        protoc-gen-js
autobrr                    kiota                      ratchet
bluez                      ktfmt                      rathole
bpftop                     kubeshark                  rattler-build
c-blosc2                   kubetui                   

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization

import lightgbm as lgb


In [2]:
X_train_full_df = pd.read_csv("X_train_NMF_topics.csv")
X_test_full_df = pd.read_csv("X_test_NMF_topics.csv")

In [3]:
pd.set_option('display.max_columns', 500)

## In this test the target variable will be "weighted rating"

### Definining X_train, X_test, y_train, y_test

In [4]:
columns_to_drop = ["Title","description","authors","image",
                   "previewLink","publisher","infoLink","categories",
                   "index","reviews number","average rating",
                   "median rating","min review date_x","min review date_y",
                   "weighted rating","date","description_language","tokens"]

X_train = X_train_full_df.drop(columns_to_drop, axis = 1)
X_test = X_test_full_df.drop(columns_to_drop, axis = 1)

# change the year feature to an integer
X_train["year"] = X_train["year"].astype(int)
X_test["year"] = X_test["year"].astype(int)

y_train = X_train_full_df["weighted rating"]
y_test = X_test_full_df["weighted rating"]

### Hyperparameters for lightgbm

In [6]:
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1','l2'],
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 8,
    "num_leaves": 128,  
    "max_bin": 512,
    "num_iterations": 100000
}

In [7]:
# hyper_params = {
#     'task': 'train',
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metric': ['l1','l2'],
#     'learning_rate': 0.1, # Changed to default
#     'verbose': 0,
#     "max_depth": 8,
#     "num_leaves": 10,  
# }

hyper_params = {
    'task': 'train',
    'objective': 'regression',
    'metric': ['l1','l2'],
    'learning_rate': 0.1, # Changed to default
    'verbose': 0,
    "max_depth": 8,
    "num_leaves": 10,  
}

In [None]:
# # Constructing the lgbdataset object
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)


# Instantiating and fitting the lightgbm model
# gbm = lgb.LGBMRegressor(hyper_params)

gbm = lgb.LGBMRegressor(
    boosting_type= 'gbdt',
    n_estimators=100,
    learning_rate= 0.1, # Changed to default
    num_boost_round=100,
    max_depth=5
)

# gbm.fit(X_train, y_train,
#         eval_set=[(X_test, y_test)],
#         eval_metric='l1',
#         callbacks=[
#         lgb.early_stopping(stopping_rounds=3),
#     ])

gbm.fit(X_train, y_train)





In [6]:
# # Constructing the lgbdataset object
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# # changed early stopping rounds from 30 to 5

# lgb_model = lgb.train(hyper_params,
#                       train_set=lgb_train,
#                       valid_sets=lgb_eval,
#                       callbacks=[lgb.early_stopping(stopping_rounds=5)]
#                      )

In [None]:
y_pred = gbm.predict(X_train, num_iteration=bgm.best_iteration_)

In [None]:
# Basic RMSE
print('The rmse of prediction is:', round(mean_squared_log_error(y_pred, y_train) ** 0.5, 5))

# CATBOOST!

In [36]:
# Define X_train, X_test, y_train, y_test for the datasets with AND without the book cover encodings:

# Import NMF_topics datafiles (these do not include image embeddings
X_train_full_df = pd.read_csv("X_train_NMF_topics.csv")
X_test_full_df = pd.read_csv("X_test_NMF_topics.csv")

In [37]:
# Define the train/test split for the datasets that do not 
# include image embeddings:

# Identify the columns that we do not want in our feature set:
columns_to_drop = ["Title","description","authors","image",
                   "previewLink","publisher","infoLink","categories",
                   "reviews number","average rating",
                   "median rating","min review date_x","min review date_y",
                   "weighted rating","date","description_language","tokens"]

# Remove the unwanted features:
X_train_NOIMAGE = X_train_full_df.drop(columns_to_drop, axis = 1)
X_test_NOIMAGE = X_test_full_df.drop(columns_to_drop, axis = 1)

# change the year feature to an integer
X_train_NOIMAGE["year"] = X_train_NOIMAGE["year"].astype(int)
X_test_NOIMAGE["year"] = X_test_NOIMAGE["year"].astype(int)

# Set the target variable
y_train = X_train_full_df["weighted rating"]
y_test = X_test_full_df["weighted rating"]

# remember to drop index!!!

In [38]:
X_train_NOIMAGE.head(2)

Unnamed: 0,index,year,nostalgia,self-published/debut,story/anthology,womens_fiction,childrens_books,classic,family_drama,digital_books/recreations,...,reprint,bestselling_author,romance,unkonwn,teen,novel,world/war/historical_fiction,unknown,young_adult,coming_of_age
0,3,2005,0.007321,0.0,0.0,0.0,0.0,0.0,0.01781,0.0,...,0.001315,0.0,0.040636,0.0,0.000214,0.0,0.010339,0.0,0.0,0.0
1,24,2001,0.017931,0.000181,0.016551,0.0,0.011345,0.0,0.004711,0.0,...,0.0,0.0,0.009457,0.0,0.0,0.006398,0.000197,0.0,0.0,0.000621


In [39]:
#Import dataset with embeddings
title_embeddings_df = pd.read_pickle(
    'original_data/English_fiction_pre_PCA_3_with_av_pool_embeddings')

In [40]:
# Select only necessary features (features needed for merging)
title_embeddings_df = title_embeddings_df[["index", "Title", "Embedding"]]
title_embeddings_df = title_embeddings_df.set_index("index")

In [41]:
# Convert array to columns
embeddings_df = title_embeddings_df['Embedding'].apply(pd.Series)
embeddings_df = embeddings_df.add_prefix('image_')

In [42]:
# Now include images in the dataset:
X_train_withIMAGE = X_train_NOIMAGE.merge(
    embeddings_df,
    left_on='index', 
    right_on=embeddings_df.index,
    how = 'inner')

X_test_withIMAGE = X_test_NOIMAGE.merge(
    embeddings_df,
    left_on='index', 
    right_on=embeddings_df.index,
    how = 'inner')

y_train_NOIMAGE = X_train_full_df["weighted rating"]
y_test_NOIMAGE = X_test_full_df["weighted rating"]

In [43]:
# Drop index from data before inserting into ML models:
X_train_NOIMAGE = X_train_NOIMAGE.drop("index", axis = 1)
X_test_NOIMAGE = X_test_NOIMAGE.drop("index", axis = 1)

X_train_withIMAGE = X_train_withIMAGE.drop("index", axis = 1)
X_test_withIMAGE = X_test_withIMAGE.drop("index", axis = 1)

In [44]:
# !pip install catboost

In [None]:
import catboost as cb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Build and fit a CatBoost regressor
lgbm_reg = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

lgbm_reg.fit(X_train_withIMAGE, y_train)

# Calculate the predictions on the test set
pred =lgbm_reg.predict(X_test_withIMAGE)

# Evaluate the performance using the RMSE
rmse_lgbm = np.sqrt(mean_squared_error(y_test, pred))
mse_lgbm = mean_squared_error(y_test, pred)
print('MSE (CatBoost): {:.5f}'.format(mse_lgbm))
print('RMSE (CatBoost): {:.5f}'.format(rmse_lgbm))

In [56]:
import catboost as cb
from sklearn.metrics import mean_squared_error, mean_absolute_error

def catboost_execute(X_train,
                     X_test,
                     y_train,
                     y_test,
                     num_estimators,
                     learn_rate,
                     depth_max, 
                     reg_lambda_val
                     ):
    solution = {}
    # Build and fit a CatBoost regressor
    reg_cat = cb.CatBoostRegressor(n_estimators=num_estimators, 
                                   learning_rate=learn_rate, 
                                   max_depth=depth_max,
                                   reg_lambda=reg_lambda_val,
                                   random_state=42)
    
    reg_cat.fit(X_train, y_train)
    
    # Calculate the predictions on the test set
    pred =reg_cat.predict(X_test)
    pred_avg =np.full(5355, np.mean(y_train))
    
    # Evaluate the performance using the RMSE
    rmse_cat = np.sqrt(mean_squared_error(y_test, pred))
    rmse_cat_avg_baseline = np.sqrt(mean_squared_error(y_test, pred_avg))
    mse_cat = mean_squared_error(y_test, pred)
    mae_cat = mean_absolute_error(y_test, pred)
    print('MSE (CatBoost): {:.5f}'.format(mse_cat))
    print('RMSE (CatBoost): {:.5f}'.format(rmse_cat))
    
    solution["MSE"] = mse_cat
    solution["RMSE_cat_avg_baseline"] = rmse_cat_avg_baseline
    solution["RMSE"] = rmse_cat
    solution["MAE"] = mae_cat

    return solution, pred

In [52]:
y_test.shape

(5355,)

In [57]:
# default values 100, 0.1, 3
#  tuned values = 300, 0.05, 8, 1

solution, pred = catboost_execute(X_train_NOIMAGE,
                 X_test_NOIMAGE,
                 y_train,
                 y_test,
                 300,
                 0.05,
                 8,
                 1)

0:	learn: 0.2071599	total: 11ms	remaining: 3.28s
1:	learn: 0.2066898	total: 23.2ms	remaining: 3.46s
2:	learn: 0.2063424	total: 29.7ms	remaining: 2.94s
3:	learn: 0.2059842	total: 39.2ms	remaining: 2.9s
4:	learn: 0.2056700	total: 44.7ms	remaining: 2.64s
5:	learn: 0.2053477	total: 49.4ms	remaining: 2.42s
6:	learn: 0.2050607	total: 54ms	remaining: 2.26s
7:	learn: 0.2047544	total: 58.4ms	remaining: 2.13s
8:	learn: 0.2045070	total: 67.2ms	remaining: 2.17s
9:	learn: 0.2041713	total: 71.8ms	remaining: 2.08s
10:	learn: 0.2039024	total: 76.3ms	remaining: 2s
11:	learn: 0.2036678	total: 80ms	remaining: 1.92s
12:	learn: 0.2033632	total: 86ms	remaining: 1.9s
13:	learn: 0.2031552	total: 90.4ms	remaining: 1.85s
14:	learn: 0.2029348	total: 96.4ms	remaining: 1.83s
15:	learn: 0.2027264	total: 101ms	remaining: 1.79s
16:	learn: 0.2025345	total: 105ms	remaining: 1.75s
17:	learn: 0.2022818	total: 110ms	remaining: 1.72s
18:	learn: 0.2020502	total: 114ms	remaining: 1.69s
19:	learn: 0.2018581	total: 120ms	remai

In [67]:
y_test

0       3.841209
1       4.276257
2       4.306145
3       4.336313
4       4.363966
          ...   
5350    4.322300
5351    4.269408
5352    3.993982
5353    4.249356
5354    4.306145
Name: weighted rating, Length: 5355, dtype: float64

In [70]:
pred_ytest_df = pd.DataFrame(pred, y_test).reset_index().rename(columns={0:"pred", "weighted rating":"y_test"})

In [71]:
pred_ytest_df

Unnamed: 0,y_test,pred
0,3.841209,4.147563
1,4.276257,4.248851
2,4.306145,4.294339
3,4.336313,4.281392
4,4.363966,4.274487
...,...,...
5350,4.322300,4.265910
5351,4.269408,4.229114
5352,3.993982,4.217690
5353,4.249356,4.242861


In [75]:
# Between 1-2
slice1_2 = pred_ytest_df[(pred_ytest_df["y_test"] >= 1) & (pred_ytest_df["y_test"] < 2)]

slice2_3 = pred_ytest_df[(pred_ytest_df["y_test"] >= 2) & (pred_ytest_df["y_test"] < 3)]

slice3_4 = pred_ytest_df[(pred_ytest_df["y_test"] >= 3) & (pred_ytest_df["y_test"] < 4)]

slice4_5 = pred_ytest_df[(pred_ytest_df["y_test"] >= 4) & (pred_ytest_df["y_test"] < 5)]

In [76]:
# calc RMSE for each slice:
RMSE1_2 =  np.sqrt(mean_squared_error(slice1_2["y_test"], slice1_2["pred"]))
RMSE2_3 =  np.sqrt(mean_squared_error(slice2_3["y_test"], slice2_3["pred"]))
RMSE3_4 =  np.sqrt(mean_squared_error(slice3_4["y_test"], slice3_4["pred"]))
RMSE4_5 =  np.sqrt(mean_squared_error(slice4_5["y_test"], slice4_5["pred"]))

In [77]:
print(RMSE1_2)
print(RMSE2_3)
print(RMSE3_4)
print(RMSE4_5)

2.2828373472512946
1.4485867096749372
0.501017310334427
0.12884704533735222


In [19]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Define the parameter grid (from estimator)

model_CBR= cb.CatBoostRegressor()

param_grid = { 
    'learning_rate': [0.05, 0.1], 
    'depth': [6, 8], 
    'l2_leaf_reg': [1, 3],
    'iterations': [100, 200, 300] 
}

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
# Do I remove the cv=5?

grid_search = GridSearchCV(
    estimator=model_CBR,
    param_grid=param_grid,
    scoring='neg_mean_squared_error'
)

grid_search.fit(X_train_withIMAGE, y_train) 

# Get the best parameters
best_params = grid_search.best_params_

# Get the best estimator
best_estimator = grid_search.best_estimator_

0:	learn: 0.2049668	total: 21.3ms	remaining: 2.11s
1:	learn: 0.2046756	total: 38.7ms	remaining: 1.9s
2:	learn: 0.2043686	total: 52.3ms	remaining: 1.69s
3:	learn: 0.2041269	total: 62.4ms	remaining: 1.5s
4:	learn: 0.2038554	total: 72.2ms	remaining: 1.37s
5:	learn: 0.2035649	total: 81.7ms	remaining: 1.28s
6:	learn: 0.2033054	total: 91.7ms	remaining: 1.22s
7:	learn: 0.2030663	total: 102ms	remaining: 1.18s
8:	learn: 0.2027865	total: 112ms	remaining: 1.14s
9:	learn: 0.2025697	total: 125ms	remaining: 1.13s
10:	learn: 0.2023822	total: 139ms	remaining: 1.12s
11:	learn: 0.2021912	total: 151ms	remaining: 1.1s
12:	learn: 0.2020111	total: 169ms	remaining: 1.13s
13:	learn: 0.2017966	total: 190ms	remaining: 1.17s
14:	learn: 0.2015545	total: 213ms	remaining: 1.21s
15:	learn: 0.2013795	total: 230ms	remaining: 1.21s
16:	learn: 0.2011720	total: 241ms	remaining: 1.18s
17:	learn: 0.2010042	total: 251ms	remaining: 1.15s
18:	learn: 0.2008326	total: 261ms	remaining: 1.11s
19:	learn: 0.2006746	total: 270ms	rem

In [31]:
best_params

{'depth': 8, 'iterations': 300, 'l2_leaf_reg': 1, 'learning_rate': 0.05}

In [29]:
best_estimator


<catboost.core.CatBoostRegressor at 0x289290f50>