# LightGBM model test notebook

In [4]:
# Environment setup commands, left commented out. Uncomment to install the
# Homebrew packages (cmake, libomp) and the lightgbm Python package.
# !brew install cmake libomp
# !pip install lightgbm

Running `brew update --auto-update`...
[34m==>[0m [1mHomebrew collects anonymous analytics.[0m
[1mRead the analytics documentation (and how to opt-out) here:
  [4mhttps://docs.brew.sh/Analytics[24m[0m
No analytics have been recorded yet (nor will be during this `brew` run).

[34m==>[0m [1mHomebrew is run entirely by unpaid volunteers. Please consider donating:[0m
  [4mhttps://github.com/Homebrew/brew#donations[24m

[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
actions-batch              jtbl                       prjtrellis
appwrite                   k8sgpt                     proto
asmfmt                     kin                        protoc-gen-js
autobrr                    kiota                      ratchet
bluez                      ktfmt                      rathole
bpftop                     kubeshark                  rattler-build
c-blosc2                   kubetui                   

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization

import lightgbm as lgb
from sklearn.metrics import mean_squared_error


In [2]:
# Read the train and test CSV files into DataFrames.
X_train_full_df, X_test_full_df = (
    pd.read_csv(csv_name)
    for csv_name in ("X_train_NMF_topics.csv", "X_test_NMF_topics.csv")
)

In [3]:
# Let pandas display up to 500 columns when rendering a DataFrame.
pd.options.display.max_columns = 500

## In this test the target variable will be "weighted rating"

### Defining X_train, X_test, y_train, y_test

In [4]:
# Columns removed from the feature matrix; "weighted rating" is removed
# because it is used as the target below.
columns_to_drop = [
    "Title", "description", "authors", "image",
    "previewLink", "publisher", "infoLink", "categories",
    "index", "reviews number", "average rating",
    "median rating", "min review date_x", "min review date_y",
    "weighted rating", "date", "description_language", "tokens",
]

# Feature frames: drop the excluded columns, then cast the year feature to an integer.
X_train = X_train_full_df.drop(columns=columns_to_drop).astype({"year": int})
X_test = X_test_full_df.drop(columns=columns_to_drop).astype({"year": int})

# Targets come straight from the full frames.
y_train = X_train_full_df["weighted rating"]
y_test = X_test_full_df["weighted rating"]

### Hyperparameters for LightGBM

In [6]:
# LightGBM parameter set: regression objective with a small learning rate
# and a large number of iterations.
hyper_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric=['l1', 'l2'],
    learning_rate=0.005,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=10,
    verbose=0,
    max_depth=8,
    num_leaves=128,
    max_bin=512,
    num_iterations=100000,
)

In [7]:
# hyper_params = {
#     'task': 'train',
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metric': ['l1','l2'],
#     'learning_rate': 0.1, # Changed to default
#     'verbose': 0,
#     "max_depth": 8,
#     "num_leaves": 10,  
# }

# Lighter parameter set; replaces any earlier `hyper_params` binding.
hyper_params = dict(
    task='train',
    objective='regression',
    metric=['l1', 'l2'],
    learning_rate=0.1,  # Changed to default
    verbose=0,
    max_depth=8,
    num_leaves=10,
)

In [None]:
# # Constructing the lgbdataset object
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)


# Instantiating and fitting the lightgbm model
# gbm = lgb.LGBMRegressor(hyper_params)

# The number of boosting rounds is set once, through `n_estimators`.
# `num_boost_round` is an alias of the same LightGBM setting, so passing both
# was redundant and makes LightGBM warn that the alias overrides the argument.
gbm = lgb.LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=100,
    learning_rate=0.1,  # Changed to default
    max_depth=5,
)

# gbm.fit(X_train, y_train,
#         eval_set=[(X_test, y_test)],
#         eval_metric='l1',
#         callbacks=[
#         lgb.early_stopping(stopping_rounds=3),
#     ])

gbm.fit(X_train, y_train)





In [6]:
# # Constructing the lgbdataset object
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# # changed early stopping rounds from 30 to 5

# lgb_model = lgb.train(hyper_params,
#                       train_set=lgb_train,
#                       valid_sets=lgb_eval,
#                       callbacks=[lgb.early_stopping(stopping_rounds=5)]
#                      )

In [None]:
# Predict on the training features with the fitted `gbm` model.
# The iteration count is read from `gbm` itself; the earlier `bgm` was an
# undefined name and raised NameError.
y_pred = gbm.predict(X_train, num_iteration=gbm.best_iteration_)

In [None]:
# Basic RMSE: square root of the mean squared error, with the arguments in
# scikit-learn's (y_true, y_pred) order. The earlier call used
# mean_squared_log_error, which is not imported and is not what the
# "rmse" label describes.
print('The rmse of prediction is:', round(mean_squared_error(y_train, y_pred) ** 0.5, 5))

# CATBOOST!

In [3]:
# Starting point for defining X_train, X_test, y_train, y_test for the datasets
# with AND without the book cover encodings.

# Import the NMF_topics data files (these do not include image embeddings).
X_train_full_df, X_test_full_df = map(
    pd.read_csv,
    ("X_train_NMF_topics.csv", "X_test_NMF_topics.csv"),
)

In [5]:
# Train/test split for the datasets that do not include image embeddings.

# Columns that must not appear in the feature set:
columns_to_drop = [
    "Title", "description", "authors", "image",
    "previewLink", "publisher", "infoLink", "categories",
    "reviews number", "average rating",
    "median rating", "min review date_x", "min review date_y",
    "weighted rating", "date", "description_language", "tokens",
]

# Remove the unwanted features and cast the year feature to an integer.
X_train_NOIMAGE = X_train_full_df.drop(columns=columns_to_drop).astype({"year": int})
X_test_NOIMAGE = X_test_full_df.drop(columns=columns_to_drop).astype({"year": int})

# Target variable
y_train = X_train_full_df["weighted rating"]
y_test = X_test_full_df["weighted rating"]

# NOTE: "index" is not in columns_to_drop, so it is still a column here.
# Remember to drop it before modelling.

In [6]:
# Show the first two rows of the no-image feature frame.
X_train_NOIMAGE.iloc[:2]

Unnamed: 0,index,year,nostalgia,self-published/debut,story/anthology,womens_fiction,childrens_books,classic,family_drama,digital_books/recreations,...,reprint,bestselling_author,romance,unkonwn,teen,novel,world/war/historical_fiction,unknown,young_adult,coming_of_age
0,3,2005,0.007321,0.0,0.0,0.0,0.0,0.0,0.01781,0.0,...,0.001315,0.0,0.040636,0.0,0.000214,0.0,0.010339,0.0,0.0,0.0
1,24,2001,0.017931,0.000181,0.016551,0.0,0.011345,0.0,0.004711,0.0,...,0.0,0.0,0.009457,0.0,0.0,0.006398,0.000197,0.0,0.0,0.000621


In [7]:
# Import the dataset with embeddings.
# NOTE(review): unpickling executes code stored in the file, so only load a
# file produced by this project.
title_embeddings_df = pd.read_pickle("original_data/English_fiction_pre_PCA_3_with_av_pool_embeddings")

In [8]:
# Keep only the columns needed for merging and use "index" as the row index.
title_embeddings_df = (
    title_embeddings_df
    .loc[:, ["index", "Title", "Embedding"]]
    .set_index("index")
)

In [9]:
# Expand each "Embedding" array into one column per element and
# prefix the resulting column names with "image_".
embeddings_df = (
    title_embeddings_df["Embedding"]
    .apply(pd.Series)
    .add_prefix("image_")
)

In [10]:
# Now include images in the dataset.
def _merge_image_embeddings(features_df, image_df):
    """Inner-join the embedding columns of ``image_df`` onto ``features_df``.

    Rows are matched on ``features_df["index"]`` against the index of
    ``image_df``. The targets are taken row-for-row from the full frames, so
    the join must neither duplicate nor drop rows. Both conditions are checked
    here so that a mismatch fails immediately instead of pairing features with
    the wrong target later on.

    Raises:
        pandas.errors.MergeError: if ``image_df`` has duplicate index values.
        AssertionError: if some rows of ``features_df`` have no embedding.
    """
    merged = features_df.merge(
        image_df,
        left_on='index',
        right_on=image_df.index,
        how='inner',
        validate='many_to_one')  # each book may match at most one embedding row
    assert len(merged) == len(features_df), (
        "inner merge dropped {} row(s) without an embedding; the feature "
        "matrix would no longer line up with the target".format(
            len(features_df) - len(merged)))
    return merged


X_train_withIMAGE = _merge_image_embeddings(X_train_NOIMAGE, embeddings_df)
X_test_withIMAGE = _merge_image_embeddings(X_test_NOIMAGE, embeddings_df)

y_train_NOIMAGE = X_train_full_df["weighted rating"]
y_test_NOIMAGE = X_test_full_df["weighted rating"]

In [11]:
# The "index" column is not a feature; remove it from every feature frame
# before the data goes into the ML models.
X_train_NOIMAGE = X_train_NOIMAGE.drop(columns="index")
X_test_NOIMAGE = X_test_NOIMAGE.drop(columns="index")

X_train_withIMAGE = X_train_withIMAGE.drop(columns="index")
X_test_withIMAGE = X_test_withIMAGE.drop(columns="index")

In [108]:
# !pip install catboost

In [None]:
import catboost as cb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Build and fit a LightGBM regressor on the features that include image embeddings
lgbm_reg = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

lgbm_reg.fit(X_train_withIMAGE, y_train)

# Calculate the predictions on the test set
pred = lgbm_reg.predict(X_test_withIMAGE)

# Evaluate the performance: compute the MSE once and derive the RMSE from it.
# The labels name LightGBM, the model fitted above (they previously said CatBoost).
mse_lgbm = mean_squared_error(y_test, pred)
rmse_lgbm = np.sqrt(mse_lgbm)
print('MSE (LightGBM): {:.5f}'.format(mse_lgbm))
print('RMSE (LightGBM): {:.5f}'.format(rmse_lgbm))

In [22]:
import catboost as cb
from sklearn.metrics import mean_squared_error, mean_absolute_error

def catboost_execute(X_train, X_test, y_train, y_test,
                     num_estimators, learn_rate, depth_max, reg_lambda_val):
    """Fit a CatBoost regressor and report its error on the test set.

    Args:
        X_train: training features passed to ``fit``.
        X_test: test features passed to ``predict``.
        y_train: training target passed to ``fit``.
        y_test: test target the predictions are scored against.
        num_estimators: value for CatBoost's ``n_estimators``.
        learn_rate: value for CatBoost's ``learning_rate``.
        depth_max: value for CatBoost's ``max_depth``.
        reg_lambda_val: value for CatBoost's ``reg_lambda``.

    Returns:
        dict with the keys "MSE", "RMSE" and "MAE" computed on the test set.
        MSE and RMSE are also printed.
    """
    # Model configuration; the seed is fixed at 42.
    model = cb.CatBoostRegressor(
        n_estimators=num_estimators,
        learning_rate=learn_rate,
        max_depth=depth_max,
        reg_lambda=reg_lambda_val,
        random_state=42,
    )
    model.fit(X_train, y_train)

    # Score the predictions made on the test set.
    predictions = model.predict(X_test)
    mse_cat = mean_squared_error(y_test, predictions)
    rmse_cat = np.sqrt(mse_cat)
    mae_cat = mean_absolute_error(y_test, predictions)

    print('MSE (CatBoost): {:.5f}'.format(mse_cat))
    print('RMSE (CatBoost): {:.5f}'.format(rmse_cat))

    return {"MSE": mse_cat, "RMSE": rmse_cat, "MAE": mae_cat}

In [25]:
# CatBoost run on the features without image embeddings.
# default values 100, 0.1, 3
catboost_execute(
    X_train=X_train_NOIMAGE,
    X_test=X_test_NOIMAGE,
    y_train=y_train,
    y_test=y_test,
    num_estimators=100,
    learn_rate=0.1,
    depth_max=3,
    reg_lambda_val=3,
)

0:	learn: 0.2071123	total: 2.19ms	remaining: 217ms
1:	learn: 0.2066845	total: 3.69ms	remaining: 181ms
2:	learn: 0.2063497	total: 5.39ms	remaining: 174ms
3:	learn: 0.2060414	total: 6.98ms	remaining: 168ms
4:	learn: 0.2058017	total: 9.29ms	remaining: 177ms
5:	learn: 0.2055639	total: 11.4ms	remaining: 179ms
6:	learn: 0.2053610	total: 14ms	remaining: 186ms
7:	learn: 0.2051861	total: 15.3ms	remaining: 177ms
8:	learn: 0.2050392	total: 16.8ms	remaining: 169ms
9:	learn: 0.2048980	total: 18.1ms	remaining: 163ms
10:	learn: 0.2047170	total: 19.5ms	remaining: 158ms
11:	learn: 0.2045851	total: 20.8ms	remaining: 152ms
12:	learn: 0.2044324	total: 22.2ms	remaining: 148ms
13:	learn: 0.2043099	total: 23.4ms	remaining: 144ms
14:	learn: 0.2041952	total: 24.6ms	remaining: 139ms
15:	learn: 0.2041202	total: 25.8ms	remaining: 136ms
16:	learn: 0.2040210	total: 27ms	remaining: 132ms
17:	learn: 0.2038989	total: 28.3ms	remaining: 129ms
18:	learn: 0.2038299	total: 29.8ms	remaining: 127ms
19:	learn: 0.2037355	total

{'MSE': 0.04303574772664888,
 'RMSE': 0.20745059104916735,
 'MAE': 0.13340316107184907}