# LightGBM model test notebook

In [4]:
# !brew install cmake libomp
# !pip install lightgbm

Running `brew update --auto-update`...
[34m==>[0m [1mHomebrew collects anonymous analytics.[0m
[1mRead the analytics documentation (and how to opt-out) here:
  [4mhttps://docs.brew.sh/Analytics[24m[0m
No analytics have been recorded yet (nor will be during this `brew` run).

[34m==>[0m [1mHomebrew is run entirely by unpaid volunteers. Please consider donating:[0m
  [4mhttps://github.com/Homebrew/brew#donations[24m

[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
actions-batch              jtbl                       prjtrellis
appwrite                   k8sgpt                     proto
asmfmt                     kin                        protoc-gen-js
autobrr                    kiota                      ratchet
bluez                      ktfmt                      rathole
bpftop                     kubeshark                  rattler-build
c-blosc2                   kubetui                   

In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization

import lightgbm as lgb


In [10]:
# Read the train and test tables (file names suggest NMF topic features
# are included -- TODO confirm) from the current working directory.
X_train_full_df, X_test_full_df = (
    pd.read_csv(csv_name)
    for csv_name in ("X_train_NMF_topics.csv", "X_test_NMF_topics.csv")
)

In [11]:
# Let pandas render up to 500 columns before truncating wide frames.
pd.options.display.max_columns = 500

## In this test the target variable will be "weighted rating"

### Defining X_train, X_test, y_train, y_test

In [12]:
# Columns removed from the full tables to obtain the feature matrices.
# "weighted rating" is the regression target, so it must not remain among
# the features; the other entries are presumably text/metadata and
# alternative rating columns -- TODO confirm against the dataset schema.
columns_to_drop = [
    "Title", "description", "authors", "image",
    "previewLink", "publisher", "infoLink", "categories",
    "index", "reviews number", "average rating",
    "median rating", "min review date_x", "min review date_y",
    "weighted rating", "date", "description_language", "tokens",
]

# Feature matrices: drop the excluded columns and cast "year" to integer.
# Both steps return new frames, so the full tables are left untouched.
X_train = X_train_full_df.drop(columns=columns_to_drop).astype({"year": int})
X_test = X_test_full_df.drop(columns=columns_to_drop).astype({"year": int})

# Targets: the "weighted rating" column of the corresponding full table.
y_train = X_train_full_df["weighted rating"]
y_test = X_test_full_df["weighted rating"]

### Hyperparameters for lightgbm

In [6]:
# First LightGBM parameter set: a small learning rate paired with a very
# large iteration budget, plus feature/bagging subsampling.
# NOTE: `hyper_params` is reassigned by the following cell, and the live
# code in this notebook never passes it to a model.
hyper_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric=['l1', 'l2'],
    learning_rate=0.005,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=10,
    verbose=0,
    max_depth=8,
    num_leaves=128,
    max_bin=512,
    num_iterations=100000,
)

In [7]:
# Second, lighter parameter set: learning rate changed to 0.1 (noted as
# the default), with shallow trees (max_depth 8, 10 leaves).
# An earlier variant of this dict was identical except that it also set
# 'boosting_type': 'gbdt' explicitly.
hyper_params = dict(
    task='train',
    objective='regression',
    metric=['l1', 'l2'],
    learning_rate=0.1,
    verbose=0,
    max_depth=8,
    num_leaves=10,
)

In [None]:
# # Constructing the lgbdataset object
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)


# Instantiate and fit the LightGBM regressor through its scikit-learn API.
#
# To build the model from the `hyper_params` dict instead, unpack it as
# keyword arguments -- passing the dict positionally would bind it to the
# first constructor parameter rather than apply the settings:
# gbm = lgb.LGBMRegressor(**hyper_params)

# The number of boosting rounds is given once, via `n_estimators`.
# `num_boost_round` is a LightGBM alias of the same parameter, so passing
# both (as before, each set to 100) was redundant and left it ambiguous
# which value would win if only one of them were edited later.
gbm = lgb.LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=100,
    learning_rate=0.1,  # Changed to default
    max_depth=5,
)

# Early-stopping variant, kept for reference (evaluates on the test split):
# gbm.fit(X_train, y_train,
#         eval_set=[(X_test, y_test)],
#         eval_metric='l1',
#         callbacks=[
#         lgb.early_stopping(stopping_rounds=3),
#     ])

# Plain fit on the training split: no validation set, no early stopping.
gbm.fit(X_train, y_train)





In [6]:
# # Constructing the lgbdataset object
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# # changed early stopping rounds from 30 to 5

# lgb_model = lgb.train(hyper_params,
#                       train_set=lgb_train,
#                       valid_sets=lgb_eval,
#                       callbacks=[lgb.early_stopping(stopping_rounds=5)]
#                      )

In [None]:
# Predict on the training features with the fitted regressor.
# Fix: the iteration count was read from `bgm`, a name that does not exist
# in this notebook (typo for `gbm`), so the cell raised NameError.
# NOTE(review): `gbm` is fitted without early stopping here, so
# `best_iteration_` is expected to make predict() use all trees -- confirm
# against the installed LightGBM version.
y_pred = gbm.predict(X_train, num_iteration=gbm.best_iteration_)

In [None]:
# Basic RMSE: square root of the mean squared difference between the
# observed targets and the predictions.
# Fix: the previous version called `mean_squared_log_error`, which is never
# imported in this notebook (NameError) and would have produced the root
# mean squared *log* error rather than the RMSE announced in the message.
# np.asarray() makes the subtraction positional instead of index-aligned.
# NOTE: `y_train` is the training target, so this is the training-set error.
rmse = float(np.sqrt(np.mean((np.asarray(y_train) - np.asarray(y_pred)) ** 2)))
print('The rmse of prediction is:', round(rmse, 5))