# LightGBM model test notebook

In [4]:
# Environment setup commands (disabled); uncomment to install the dependencies.
# !brew install cmake libomp
# !pip install lightgbm

Running `brew update --auto-update`...
[34m==>[0m [1mHomebrew collects anonymous analytics.[0m
[1mRead the analytics documentation (and how to opt-out) here:
  [4mhttps://docs.brew.sh/Analytics[24m[0m
No analytics have been recorded yet (nor will be during this `brew` run).

[34m==>[0m [1mHomebrew is run entirely by unpaid volunteers. Please consider donating:[0m
  [4mhttps://github.com/Homebrew/brew#donations[24m

[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
actions-batch              jtbl                       prjtrellis
appwrite                   k8sgpt                     proto
asmfmt                     kin                        protoc-gen-js
autobrr                    kiota                      ratchet
bluez                      ktfmt                      rathole
bpftop                     kubeshark                  rattler-build
c-blosc2                   kubetui                   

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization

import lightgbm as lgb


In [2]:
# Read the train and test tables (NMF topic features) from the working directory.
X_train_full_df, X_test_full_df = (
    pd.read_csv(csv_name)
    for csv_name in ("X_train_NMF_topics.csv", "X_test_NMF_topics.csv")
)

In [3]:
# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

## In this test the target variable will be "weighted rating"

### Defining X_train, X_test, y_train, y_test

In [4]:
# Columns removed from the feature matrices (this list includes the target,
# "weighted rating", and the "index" identifier).
columns_to_drop = [
    "Title", "description", "authors", "image",
    "previewLink", "publisher", "infoLink", "categories",
    "index", "reviews number", "average rating",
    "median rating", "min review date_x", "min review date_y",
    "weighted rating", "date", "description_language", "tokens",
]

# Drop the unwanted columns and cast the year feature to an integer in one pass.
X_train = X_train_full_df.drop(columns=columns_to_drop).astype({"year": int})
X_test = X_test_full_df.drop(columns=columns_to_drop).astype({"year": int})

# Targets are taken from the untouched full tables.
y_train = X_train_full_df["weighted rating"]
y_test = X_test_full_df["weighted rating"]

### Hyperparameters for LightGBM

In [6]:
# LightGBM parameter set: small learning rate with a large iteration budget,
# plus feature/bagging subsampling.
hyper_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric=['l1', 'l2'],
    learning_rate=0.005,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=10,
    verbose=0,
    max_depth=8,
    num_leaves=128,
    max_bin=512,
    num_iterations=100000,
)

In [7]:
# Simplified parameter set; re-assigning `hyper_params` replaces any earlier
# definition. The learning rate is back at 0.1 ("changed to default") and
# `boosting_type` is left unset (an earlier draft set it to 'gbdt').
hyper_params = dict(
    task='train',
    objective='regression',
    metric=['l1', 'l2'],
    learning_rate=0.1,
    verbose=0,
    max_depth=8,
    num_leaves=10,
)

In [None]:
# # Constructing the lgbdataset object
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)


# Instantiate and fit a LightGBM regressor through the scikit-learn API.
# gbm = lgb.LGBMRegressor(hyper_params)

# The number of boosting rounds is set once, via `n_estimators`. The extra
# `num_boost_round=100` keyword that used to be passed here is a LightGBM
# alias for the same setting, so specifying both was redundant.
gbm = lgb.LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=100,
    learning_rate=0.1,  # Changed to default
    max_depth=5,
)

# Disabled variant: fit with a validation set and early stopping.
# gbm.fit(X_train, y_train,
#         eval_set=[(X_test, y_test)],
#         eval_metric='l1',
#         callbacks=[
#         lgb.early_stopping(stopping_rounds=3),
#     ])

gbm.fit(X_train, y_train)





In [6]:
# Disabled alternative: train through the native `lgb.train` API using
# `hyper_params` and an early-stopping callback. Kept for reference only.
# # Constructing the lgbdataset object
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# # changed early stopping rounds from 30 to 5

# lgb_model = lgb.train(hyper_params,
#                       train_set=lgb_train,
#                       valid_sets=lgb_eval,
#                       callbacks=[lgb.early_stopping(stopping_rounds=5)]
#                      )

In [None]:
# Predict on the training features with the fitted regressor `gbm`
# (the iteration count is read from the same model object).
y_pred = gbm.predict(X_train, num_iteration=gbm.best_iteration_)

In [None]:
# Basic RMSE of the training-set predictions, computed directly with NumPy:
# sqrt(mean((y_true - y_pred)^2)). A log-error metric would not match the
# "rmse" label printed below, so the plain squared error is used.
print(
    'The rmse of prediction is:',
    round(float(np.sqrt(np.mean((np.asarray(y_train) - np.asarray(y_pred)) ** 2))), 5),
)

# CATBOOST!

In [26]:
# Data for the experiments with AND without the book cover encodings.
# These NMF-topic files (100 topics) do not include the image embeddings.
X_train_full_df, X_test_full_df = (
    pd.read_csv(csv_name)
    for csv_name in ("X_train_NMF_topics_100.csv", "X_test_NMF_topics_100.csv")
)

In [28]:
# Train/test feature matrices WITHOUT image embeddings.

# Columns that must not end up in the feature set. "index" is deliberately
# kept here because it is still needed as a join key; remember to drop it
# before fitting a model.
columns_to_drop = [
    "Title", "description", "authors", "image",
    "previewLink", "publisher", "infoLink", "categories",
    "reviews number", "average rating",
    "median rating", "min review date_x", "min review date_y",
    "weighted rating", "date", "description_language", "tokens",
]

# Remove the unwanted columns and cast the year feature to an integer.
X_train_NOIMAGE = X_train_full_df.drop(columns=columns_to_drop).astype({"year": int})
X_test_NOIMAGE = X_test_full_df.drop(columns=columns_to_drop).astype({"year": int})

# Target variable, taken from the full tables.
y_train = X_train_full_df["weighted rating"]
y_test = X_test_full_df["weighted rating"]

In [29]:
# Preview the first two rows of the no-image training features.
X_train_NOIMAGE.head(2)

Unnamed: 0,index,year,NMF_0,NMF_1,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,...,NMF_90,NMF_91,NMF_92,NMF_93,NMF_94,NMF_95,NMF_96,NMF_97,NMF_98,NMF_99
0,3,2005,7.9e-05,0.0,0.0,0.0,0.0,0.0,0.001653,0.0,...,0.0,0.0,0.0,0.00092,0.0,0.0,0.036894,0.0,0.001658,0.025723
1,24,2001,0.000319,0.001464,0.004044,0.0,0.001295,0.0,0.000105,0.0,...,0.001011,0.0,0.0,0.0,0.0,2.6e-05,0.028403,0.0,0.0,0.0


In [30]:
# Import the dataset that carries the embeddings.
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects, so only
# load this file if it comes from a trusted source.
title_embeddings_df = pd.read_pickle(
    'original_data/English_fiction_pre_PCA_3_with_av_pool_embeddings')

In [31]:
# Keep only the columns needed for the merge and use "index" as the row label.
title_embeddings_df = (
    title_embeddings_df
    .loc[:, ["index", "Title", "Embedding"]]
    .set_index("index")
)

In [32]:
# Spread each "Embedding" entry across separate columns, then prefix the
# resulting column labels with "image_".
embeddings_df = (
    title_embeddings_df['Embedding']
    .apply(pd.Series)
    .add_prefix('image_')
)

In [33]:
def _merge_image_embeddings(features_df):
    """Attach the image-embedding columns to ``features_df`` via its "index" column.

    The join is an inner merge against the row labels of ``embeddings_df``.
    The targets (``y_train`` / ``y_test``) are taken from the full tables and
    are matched to the merged features purely by position, so the merge must
    neither drop nor duplicate rows; this is asserted explicitly instead of
    letting features and targets drift out of alignment.
    """
    merged_df = features_df.merge(
        embeddings_df,
        left_on='index',
        right_on=embeddings_df.index,
        how='inner')
    assert len(merged_df) == len(features_df), (
        "Merging the image embeddings changed the number of rows "
        f"({len(features_df)} -> {len(merged_df)}); the targets would no "
        "longer line up with the features."
    )
    return merged_df


# Now include images in the dataset:
X_train_withIMAGE = _merge_image_embeddings(X_train_NOIMAGE)
X_test_withIMAGE = _merge_image_embeddings(X_test_NOIMAGE)

y_train_NOIMAGE = X_train_full_df["weighted rating"]
y_test_NOIMAGE = X_test_full_df["weighted rating"]

In [34]:
# Drop the "index" join key before the data goes into the ML models.
# errors="ignore" keeps this cell safe to re-run: on a second execution the
# column is already gone and a plain drop would raise a KeyError.
X_train_NOIMAGE = X_train_NOIMAGE.drop(columns="index", errors="ignore")
X_test_NOIMAGE = X_test_NOIMAGE.drop(columns="index", errors="ignore")

X_train_withIMAGE = X_train_withIMAGE.drop(columns="index", errors="ignore")
X_test_withIMAGE = X_test_withIMAGE.drop(columns="index", errors="ignore")

In [108]:
# Environment setup command (disabled); uncomment to install CatBoost.
# !pip install catboost

In [None]:
import catboost as cb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Build and fit a LightGBM regressor on the features that include the image
# embeddings.
lgbm_reg = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

lgbm_reg.fit(X_train_withIMAGE, y_train)

# Calculate the predictions on the test set
pred = lgbm_reg.predict(X_test_withIMAGE)

# Evaluate the performance using the MSE and RMSE. The printed labels name
# LightGBM, the model that was actually fitted in this cell.
mse_lgbm = mean_squared_error(y_test, pred)
rmse_lgbm = np.sqrt(mse_lgbm)
print('MSE (LightGBM): {:.5f}'.format(mse_lgbm))
print('RMSE (LightGBM): {:.5f}'.format(rmse_lgbm))

In [35]:
import catboost as cb
from sklearn.metrics import mean_squared_error, mean_absolute_error

def catboost_execute(X_train,
                     X_test,
                     y_train,
                     y_test,
                     num_estimators,
                     learn_rate,
                     depth_max,
                     ):
    """Fit a CatBoost regressor and score its predictions on the test split.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test, y_test : features to predict on and the targets to score against.
    num_estimators : forwarded to ``CatBoostRegressor`` as ``n_estimators``.
    learn_rate : forwarded as ``learning_rate``.
    depth_max : forwarded as ``max_depth``.

    Returns
    -------
    dict
        The keys ``"MSE"``, ``"RMSE"`` and ``"MAE"`` mapped to the
        corresponding test-set errors. MSE and RMSE are also printed.
    """
    # The seed is fixed at 42 for every call.
    model = cb.CatBoostRegressor(
        n_estimators=num_estimators,
        learning_rate=learn_rate,
        max_depth=depth_max,
        random_state=42,
    )
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    # RMSE is the square root of the MSE on the same predictions.
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)

    print('MSE (CatBoost): {:.5f}'.format(mse))
    print('RMSE (CatBoost): {:.5f}'.format(rmse))

    return {"MSE": mse, "RMSE": rmse, "MAE": mae}

In [39]:
# CatBoost run on the features that include the image embeddings:
# 100 trees, learning rate 0.1, maximum depth 3.
catboost_execute(
    X_train=X_train_withIMAGE,
    X_test=X_test_withIMAGE,
    y_train=y_train,
    y_test=y_test,
    num_estimators=100,
    learn_rate=0.1,
    depth_max=3,
)

0:	learn: 0.2072258	total: 11.7ms	remaining: 1.16s
1:	learn: 0.2069349	total: 16.7ms	remaining: 821ms
2:	learn: 0.2066633	total: 21.2ms	remaining: 686ms
3:	learn: 0.2064068	total: 26.3ms	remaining: 631ms
4:	learn: 0.2062145	total: 31.3ms	remaining: 595ms
5:	learn: 0.2060253	total: 36.3ms	remaining: 568ms
6:	learn: 0.2058181	total: 40.9ms	remaining: 543ms
7:	learn: 0.2056614	total: 45.4ms	remaining: 522ms
8:	learn: 0.2054696	total: 50ms	remaining: 505ms
9:	learn: 0.2052796	total: 54.9ms	remaining: 494ms
10:	learn: 0.2051265	total: 59.6ms	remaining: 482ms
11:	learn: 0.2049927	total: 64.6ms	remaining: 473ms
12:	learn: 0.2048576	total: 69.5ms	remaining: 465ms
13:	learn: 0.2046639	total: 74.2ms	remaining: 456ms
14:	learn: 0.2045496	total: 78.7ms	remaining: 446ms
15:	learn: 0.2044096	total: 83ms	remaining: 436ms
16:	learn: 0.2043083	total: 88ms	remaining: 429ms
17:	learn: 0.2041355	total: 92.3ms	remaining: 420ms
18:	learn: 0.2040440	total: 97.1ms	remaining: 414ms
19:	learn: 0.2039291	total: 

{'MSE': 0.04273954708521179,
 'RMSE': 0.20673545193123455,
 'MAE': 0.13248475529491216}