In [41]:
import sys
import os
import numpy as np
import pandas as pd
from pathlib import Path

In [42]:
# Find the project root: start at the working directory and climb one level
# at a time until a directory that contains a "src" folder is reached.
current_dir = os.getcwd()
project_root = current_dir
while True:
    if os.path.isdir(os.path.join(project_root, "src")):
        break  # found it

    one_level_up = os.path.dirname(project_root)
    if one_level_up == project_root:
        # dirname() stopped changing the path: we are at the filesystem root.
        raise FileNotFoundError(
            "Could not find the 'src' directory. Ensure this script is within the project structure."
        )
    project_root = one_level_up

# Put the root at the front of sys.path (as a string) so `src.*` imports resolve.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# From here on the root is a Path, so later cells can compose paths with it.
project_root = Path(project_root)

In [43]:
# Every path is anchored on the discovered project root, never on the
# current working directory.
PROCESSED_DATA_DIR = project_root.joinpath("data", "processed")

# Cleaned tweet tables, one parquet file per split.
TRAIN_CLEAN_PATH = PROCESSED_DATA_DIR.joinpath("clean_train_tweets.parquet")
TEST_CLEAN_PATH = PROCESSED_DATA_DIR.joinpath("clean_test_tweets.parquet")
VAL_CLEAN_PATH = PROCESSED_DATA_DIR.joinpath("clean_val_tweets.parquet")

# On-disk cache of the generated embeddings, one .npy file per split.
TRAIN_EMB_PATH = PROCESSED_DATA_DIR.joinpath("train_embeddings.npy")
TEST_EMB_PATH = PROCESSED_DATA_DIR.joinpath("test_embeddings.npy")
VAL_EMB_PATH = PROCESSED_DATA_DIR.joinpath("val_embeddings.npy")

In [44]:
# Disabled on purpose: `from src.models.embeddings.emb_data import get_data_embeddings` — the embeddings are read from the cached .npy files instead.
from src.models.utils import model_metrics
from src.model.model import create_lgbm_model, create_hyperparameter_search

import pandas as pd
import joblib as jb

In [45]:
# Read the cleaned train / test / validation tables, in that order.
train_labels, test_labels, val_labels = (
    pd.read_parquet(split_path)
    for split_path in (TRAIN_CLEAN_PATH, TEST_CLEAN_PATH, VAL_CLEAN_PATH)
)

In [46]:
# Pull the 'Label' column out of each split as a plain array of targets.
y_train, y_test, y_val = (
    split_frame['Label'].values
    for split_frame in (train_labels, test_labels, val_labels)
)

In [47]:
### Load embeddings ###
# The embeddings are not recomputed here (the get_data_embeddings() call is
# disabled); each split is read back from its cached .npy file.
emb_train = np.load(TRAIN_EMB_PATH)
emb_test = np.load(TEST_EMB_PATH)
emb_val = np.load(VAL_EMB_PATH)

In [48]:
# Create model
# Number of hyperparameter candidates the search is asked to try.
N_SEARCH_CANDIDATES = 15

# Base estimator first, then the search object that wraps it; both factories
# come from the project's own model module.
lbgm = create_lgbm_model()
random_search = create_hyperparameter_search(
    estimator=lbgm,
    n_iter=N_SEARCH_CANDIDATES,
)

In [49]:
# Fit the model
# Run the search on the training embeddings while joblib's 'threading'
# backend is active for any parallel work dispatched inside the block.
with jb.parallel_backend('threading'):
    random_search.fit(emb_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.166462 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.989422 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_samples=10, n_estimators=1500, num_leaves=20, reg_alpha=0.1, reg_lambda=0.01, subsample=1.0; total time=18.9min
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_samples=10, n_estimators=1500, num_leaves=20, reg_alpha=0.1, reg_lambda=0.01, subsample=1.0; total time=19.0min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.911089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.777556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [I



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_samples=10, n_estimators=1500, num_leaves=20, reg_alpha=0.1, reg_lambda=0.01, subsample=1.0; total time=15.0min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.803148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=20, min_child_samples=10, n_estimators=1500, num_leaves=31, reg_alpha=0.01, reg_lambda=0.1, subsample=0.8; total time=24.5min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.770034 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=20, min_child_samples=10, n_estimators=1500, num_leaves=31, reg_alpha=0.01, reg_lambda=0.1, subsample=0.8; total time=24.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.796951 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=20, min_child_samples=10, n_estimators=1500, num_leaves=31, reg_alpha=0.01, reg_lambda=0.1, subsample=0.8; total time=24.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.768336 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=-1, min_child_samples=30, n_estimators=1500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.1, subsample=0.8; total time=30.5min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.784466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=-1, min_child_samples=30, n_estimators=1500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.1, subsample=0.8; total time=30.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.798102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=-1, min_child_samples=10, n_estimators=1500, num_leaves=20, reg_alpha=0.01, reg_lambda=1.0, subsample=1.0; total time=18.1min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.784774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=-1, min_child_samples=30, n_estimators=1500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.1, subsample=0.8; total time=30.5min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.794273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=-1, min_child_samples=10, n_estimators=1500, num_leaves=20, reg_alpha=0.01, reg_lambda=1.0, subsample=1.0; total time=17.9min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.778522 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=-1, min_child_samples=10, n_estimators=1500, num_leaves=20, reg_alpha=0.01, reg_lambda=1.0, subsample=1.0; total time=17.9min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.736628 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=10, min_child_samples=20, n_estimators=1500, num_leaves=31, reg_alpha=0.01, reg_lambda=0.01, subsample=1.0; total time=28.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.805674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=10, min_child_samples=20, n_estimators=1500, num_leaves=31, reg_alpha=0.01, reg_lambda=0.01, subsample=1.0; total time=28.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.792372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=20, min_child_samples=20, n_estimators=1000, num_leaves=31, reg_alpha=1.0, reg_lambda=1.0, subsample=1.0; total time=13.8min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.801998 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=10, min_child_samples=20, n_estimators=1500, num_leaves=31, reg_alpha=0.01, reg_lambda=0.01, subsample=1.0; total time=28.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.757945 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=20, min_child_samples=20, n_estimators=1000, num_leaves=31, reg_alpha=1.0, reg_lambda=1.0, subsample=1.0; total time=14.0min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.782290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=20, min_child_samples=20, n_estimators=500, num_leaves=40, reg_alpha=0.01, reg_lambda=1.0, subsample=0.8; total time= 9.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.776967 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=20, min_child_samples=20, n_estimators=1000, num_leaves=31, reg_alpha=1.0, reg_lambda=1.0, subsample=1.0; total time=13.8min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.777420 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=20, min_child_samples=20, n_estimators=500, num_leaves=40, reg_alpha=0.01, reg_lambda=1.0, subsample=0.8; total time= 9.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.788569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=20, min_child_samples=20, n_estimators=500, num_leaves=40, reg_alpha=0.01, reg_lambda=1.0, subsample=0.8; total time= 9.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.797606 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=-1, min_child_samples=30, n_estimators=1000, num_leaves=40, reg_alpha=0.1, reg_lambda=1.0, subsample=1.0; total time=20.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.784441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=-1, min_child_samples=30, n_estimators=1000, num_leaves=40, reg_alpha=0.1, reg_lambda=1.0, subsample=1.0; total time=20.8min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.799051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=20, min_child_samples=20, n_estimators=1000, num_leaves=20, reg_alpha=0.1, reg_lambda=0.1, subsample=1.0; total time=18.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.790279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=-1, min_child_samples=30, n_estimators=1000, num_leaves=40, reg_alpha=0.1, reg_lambda=1.0, subsample=1.0; total time=22.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.772936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=20, min_child_samples=20, n_estimators=1000, num_leaves=20, reg_alpha=0.1, reg_lambda=0.1, subsample=1.0; total time=17.1min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.775077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=20, min_child_samples=20, n_estimators=1000, num_leaves=20, reg_alpha=0.1, reg_lambda=0.1, subsample=1.0; total time=17.2min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.790443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=10, min_child_samples=20, n_estimators=500, num_leaves=31, reg_alpha=0.01, reg_lambda=0.1, subsample=1.0; total time=11.2min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.774552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=10, min_child_samples=20, n_estimators=500, num_leaves=31, reg_alpha=0.01, reg_lambda=0.1, subsample=1.0; total time=11.2min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.776063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=10, min_child_samples=20, n_estimators=500, num_leaves=31, reg_alpha=0.01, reg_lambda=0.1, subsample=1.0; total time=11.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.971094 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=-1, min_child_samples=30, n_estimators=1500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.1, subsample=0.8; total time=28.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.797081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=-1, min_child_samples=30, n_estimators=1500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.1, subsample=0.8; total time=27.9min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.699985 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=-1, min_child_samples=30, n_estimators=500, num_leaves=40, reg_alpha=1.0, reg_lambda=1.0, subsample=1.0; total time= 9.9min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.772334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=-1, min_child_samples=30, n_estimators=1500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.1, subsample=0.8; total time=26.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.765379 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=-1, min_child_samples=30, n_estimators=500, num_leaves=40, reg_alpha=1.0, reg_lambda=1.0, subsample=1.0; total time=10.5min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.914889 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=-1, min_child_samples=30, n_estimators=500, num_leaves=40, reg_alpha=1.0, reg_lambda=1.0, subsample=1.0; total time=13.8min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.885525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=10, min_child_samples=20, n_estimators=1500, num_leaves=31, reg_alpha=1.0, reg_lambda=1.0, subsample=1.0; total time=29.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.950243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=10, min_child_samples=20, n_estimators=1500, num_leaves=31, reg_alpha=1.0, reg_lambda=1.0, subsample=1.0; total time=29.0min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.762467 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=-1, min_child_samples=30, n_estimators=500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.1, subsample=0.9; total time= 9.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.796397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=10, min_child_samples=20, n_estimators=1500, num_leaves=31, reg_alpha=1.0, reg_lambda=1.0, subsample=1.0; total time=28.1min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.806061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=-1, min_child_samples=30, n_estimators=500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.1, subsample=0.9; total time= 9.8min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.786912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097488




[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=-1, min_child_samples=30, n_estimators=500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.1, subsample=0.9; total time= 9.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.758034 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097277
[LightGBM] [Info] Start training from score -1.101041
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=-1, min_child_samples=30, n_estimators=1500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.01, subsample=0.8; total time=24.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.791443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 85332, number of used features: 384
[LightGBM] [Info] Start training from score -1.097312
[LightGBM] [Info] Start training from score -1.101006
[LightGBM] [Info] Start training from score -1.097523




[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=-1, min_child_samples=30, n_estimators=1500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.01, subsample=0.8; total time=25.8min




[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=-1, min_child_samples=30, n_estimators=1500, num_leaves=40, reg_alpha=1.0, reg_lambda=0.01, subsample=0.8; total time=18.3min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.838417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 127998, number of used features: 384
[LightGBM] [Info] Start training from score -1.097301
[LightGBM] [Info] Start training from score -1.101029
[LightGBM] [Info] Start training from score -1.097511


In [52]:
# Predictions and metrics
# Use the estimator the search selected as best.
best_lgb = random_search.best_estimator_

# Predict both held-out splits up front ...
pred_test = best_lgb.predict(emb_test)
pred_val = best_lgb.predict(emb_val)

# ... then score them, test split first and validation split second, so the
# two metric reports keep their original order.
metrics_test = model_metrics(y_test, pred_test)
metrics_val = model_metrics(y_val, pred_val)



Accuracy: 74.98%
Precision: 74.88%
Recall: 74.98%
F1 Score: 0.7487
Confusion matrix:
 [[ 9360  2321  1705]
 [ 2314  9487  1507]
 [ 1150  1009 11147]]




Accuracy: 74.20%
Precision: 74.07%
Recall: 74.20%
F1 Score: 0.7409
Confusion matrix:
 [[7274 1957 1327]
 [1939 7626 1230]
 [ 971  833 8843]]


In [57]:
# Export model
# Build the destination from the discovered project root rather than a
# machine-specific absolute path, so the notebook works on any checkout.
model_path = project_root / "src" / "models" / "predictor" / "lgbm_sentiment_predictor.joblib"

# Create the target folder if it is missing, so the dump cannot fail on a
# fresh checkout that does not have it yet.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Save (the path is passed as a string, exactly like the original call did)
jb.dump(best_lgb, str(model_path))

['/home/juancho_col/Documents/Projects/Sentiment Analysis v2/src/models/predictor/lgbm_sentiment_predictor.joblib']