In [1]:
! pip install mlflow boto3 awscli

In [2]:
import mlflow
# step 1 : Set up the mlflow tracking server 
mlflow.set_tracking_uri("http://ec2-13-62-226-249.eu-north-1.compute.amazonaws.com:5000/")

In [3]:
mlflow.set_experiment("Exp 7 - Best Model-Feature_Eng")

<Experiment: artifact_location='s3://reddit-reccomender-bucket/14', creation_time=1763558084789, experiment_id='14', last_update_time=1763558084789, lifecycle_stage='active', name='Exp 7 - Best Model-Feature_Eng', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import spacy
# CHANGES MADE: Import MLflow
import mlflow
import mlflow.lightgbm

In [5]:
df = pd.read_csv('reddit_preprocessing.csv').dropna(subset=['clean_comment'])
# Separate features and target
X_cleaned = df['clean_comment']
y_cleaned = df['category']

# Split the cleaned data into train and test sets (80-20 split)
X_train_cleaned,X_test_cleaned,y_train_cleaned,y_test_cleaned = train_test_split(
    X_cleaned,y_cleaned,test_size=0.2,random_state=42
)

In [6]:
#load spacy language model for POS tagging
import spacy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

In [7]:
def extract_custom_features_batch(text_list):
    results = []

    # nlp.pipe is MUCH faster than individual calls
    for doc in nlp.pipe(text_list, batch_size=64):
        text = doc.text
        word_list = [token.text for token in doc]
        word_count = len(word_list)
        unique_words = len(set(word_list))
        pos_tags = [token.pos_ for token in doc]

        # Base features
        features = {
            "comment_length": len(text),
            "word_count": word_count,
            "avg_word_length": (
                sum(len(w) for w in word_list) / word_count
                if word_count else 0
            ),
            "unique_word_count": unique_words,
            "lexical_diversity": (
                unique_words / word_count if word_count else 0
            ),
            "pos_count": len(pos_tags),
        }

        # POS proportions
        if word_count > 0:
            for tag in set(pos_tags):
                features[f"pos_ratio_{tag}"] = pos_tags.count(tag) / word_count

        results.append(features)

    return results





In [8]:
# apply it to train and test

train_custom_features = pd.DataFrame(
    extract_custom_features_batch(X_train_cleaned)
)

test_custom_features = pd.DataFrame(
    extract_custom_features_batch(X_test_cleaned)
)


#ALIGN train and test so model doesn't break
train_custom_features, test_custom_features = train_custom_features.align(
    test_custom_features, join="outer", axis=1
)

train_custom_features = train_custom_features.fillna(0)
test_custom_features = test_custom_features.fillna(0)

In [9]:
train_custom_features.head()

Unnamed: 0,avg_word_length,comment_length,lexical_diversity,pos_count,pos_ratio_ADJ,pos_ratio_ADP,pos_ratio_ADV,pos_ratio_AUX,pos_ratio_CCONJ,pos_ratio_DET,...,pos_ratio_PART,pos_ratio_PRON,pos_ratio_PROPN,pos_ratio_PUNCT,pos_ratio_SCONJ,pos_ratio_SYM,pos_ratio_VERB,pos_ratio_X,unique_word_count,word_count
0,6.428571,51,1.0,7,0.0,0.142857,0.142857,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.428571,0.0,7,7
1,5.166667,36,1.0,6,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.166667,0.166667,0.0,0.0,0.0,0.333333,0.0,6,6
2,6.222222,64,1.0,9,0.222222,0.0,0.111111,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.222222,0.0,9,9
3,6.266667,108,0.933333,15,0.2,0.0,0.0,0.0,0.0,0.0,...,0.066667,0.0,0.0,0.0,0.0,0.0,0.2,0.0,14,15
4,6.0,6,1.0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1


In [10]:
test_custom_features.isnull().sum()

avg_word_length      0
comment_length       0
lexical_diversity    0
pos_count            0
pos_ratio_ADJ        0
pos_ratio_ADP        0
pos_ratio_ADV        0
pos_ratio_AUX        0
pos_ratio_CCONJ      0
pos_ratio_DET        0
pos_ratio_INTJ       0
pos_ratio_NOUN       0
pos_ratio_NUM        0
pos_ratio_PART       0
pos_ratio_PRON       0
pos_ratio_PROPN      0
pos_ratio_PUNCT      0
pos_ratio_SCONJ      0
pos_ratio_SYM        0
pos_ratio_VERB       0
pos_ratio_X          0
unique_word_count    0
word_count           0
dtype: int64

In [11]:
# Apply BOW with trigram setting and max_features=5000

BOW = CountVectorizer(ngram_range=(1,3),max_features=5000)
X_train_BOW = BOW.fit_transform(X_train_cleaned).astype('float32')
X_test_BOW = BOW.transform(X_test_cleaned).astype('float32')

In [12]:
# Convert BOW to DataFrame
X_train_BOW_df = pd.DataFrame(X_train_BOW.toarray(), columns=BOW.get_feature_names_out())
X_test_BOW_df = pd.DataFrame(X_test_BOW.toarray(), columns=BOW.get_feature_names_out())

In [13]:
#Combine BOW and custom features
X_train_combined = pd.concat([X_train_BOW_df.reset_index(drop=True), train_custom_features.reset_index(drop=True)], axis=1)
X_test_combined = pd.concat([X_test_BOW_df.reset_index(drop=True), test_custom_features.reset_index(drop=True)], axis=1)

In [14]:
X_train_combined

Unnamed: 0,000,000 crore,100,1000,101,120,150,180ml,1947,1984,...,pos_ratio_PART,pos_ratio_PRON,pos_ratio_PROPN,pos_ratio_PUNCT,pos_ratio_SCONJ,pos_ratio_SYM,pos_ratio_VERB,pos_ratio_X,unique_word_count,word_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.428571,0.0,7,7
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.166667,0.166667,0.0,0.0,0.0,0.333333,0.0,6,6
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.222222,0.0,9,9
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.066667,0.000000,0.000000,0.0,0.0,0.0,0.200000,0.0,14,15
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.083333,0.0,0.0,0.0,0.250000,0.0,12,12
29325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.600000,0.0,0.0,0.0,0.133333,0.0,27,30
29326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,1,1
29327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.243243,0.0,31,37


In [15]:
! pip install optuna

In [16]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import classification_report,accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# function to optimize LightGbm hyperparameter
def objective(trial):
    # Define hyperparameters to be tuned
    param = {
        "objective": "multiclass",
        "num_class":3,
        "learning_rate": trial.suggest_float("learning_rate",1e-3, 1e-1),
        "n_estimators" : trial.suggest_int("n_estimators",50,500),
        "min_child_samples": trial.suggest_int('min_child_samples', 10, 200),
        "reg_lambda" : trial.suggest_float('reg_lambda', 1e-4, 50.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "metric": "multi_logloss",
        "is_unbalance": True,
        "class_weight": "balanced",
    }

    # CHANGES MADE: Start nested MLflow run for this trial
    with mlflow.start_run(nested=True):
        # Log trial hyperparameters
        for k, v in param.items():
            mlflow.log_param(k, v)
     
        #Define the Lightgbm parameter with the trial parameter
        model = lgb.LGBMClassifier(**param)

        #perform a cross validation
        scores =cross_val_score(model,X_train_combined,y_train_cleaned,cv=3,scoring='accuracy')

        # Return the average score across folds
        mean_score = scores.mean()

        # Log the mean CV accuracy
        mlflow.log_metric("mean_cv_accuracy", mean_score)

    return mean_score

In [None]:
with mlflow.start_run(run_name="Custom_feature_reddit"):

    # Create an Optuna study to optimize the hyperparameters
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)

    # Log best trial params
    best_params = study.best_trial.params
    for k, v in best_params.items():
        mlflow.log_param(f"best_{k}", v)

    best_model = lgb.LGBMClassifier(
        objective='multiclass',
        num_class=3,
        metric="multi_logloss",
        is_unbalance=True,
        class_weight="balanced",
        reg_alpha=0.1,
        reg_lambda=0.26403467489787047,
        learning_rate=0.08,
        max_depth=20,
        n_estimators=465,
        min_child_samples=10
    )

    # fit and log the model
    best_model.fit(X_train_combined, y_train_cleaned)

    # Predict on the test set
    y_test_pred = best_model.predict(X_test_combined)
    accuracy = accuracy_score(y_test_cleaned, y_test_pred)
    mlflow.log_metric("test_accuracy", accuracy)

    # Classification report
    report = classification_report(y_test_cleaned, y_test_pred, output_dict=True)

    for cls in ["0", "1", "2"]:
        if cls in report:
            mlflow.log_metric(f"precision_class_{cls}", report[cls]["precision"])
            mlflow.log_metric(f"recall_class_{cls}", report[cls]["recall"])
            mlflow.log_metric(f"f1_class_{cls}", report[cls]["f1-score"])

    # Log final trained model
    mlflow.lightgbm.log_model(best_model, artifact_path="lightgbm_model_custom_feature")

    print("Test accuracy:", accuracy)
    print(report)




[I 2025-11-19 22:20:29,537] A new study created in memory with name: no-name-be87739e-493f-4109-aeb3-07d472d604c7


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042120 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8301
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 769
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8357
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 770
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing 

[I 2025-11-19 22:21:36,348] Trial 0 finished with value: 0.7283918493702529 and parameters: {'learning_rate': 0.040407186598806315, 'n_estimators': 421, 'min_child_samples': 80, 'reg_lambda': 6.391302756755433, 'max_depth': 3}. Best is trial 0 with value: 0.7283918493702529.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7232
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 502
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037853 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7242
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 495
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing 

[I 2025-11-19 22:22:55,812] Trial 1 finished with value: 0.7337789623497614 and parameters: {'learning_rate': 0.06426993791282166, 'n_estimators': 484, 'min_child_samples': 117, 'reg_lambda': 13.722409402426845, 'max_depth': 13}. Best is trial 1 with value: 0.7337789623497614.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6356
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 302
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023722 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6372
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 301
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025247 s

[I 2025-11-19 22:24:16,282] Trial 2 finished with value: 0.7042176097068277 and parameters: {'learning_rate': 0.05917627730303833, 'n_estimators': 462, 'min_child_samples': 176, 'reg_lambda': 4.830774843101786, 'max_depth': 14}. Best is trial 1 with value: 0.7337789623497614.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.675520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15428
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 3021
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.284593 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15618
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3050
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2025-11-19 22:31:59,076] Trial 3 finished with value: 0.8110401366735708 and parameters: {'learning_rate': 0.07610918231721008, 'n_estimators': 355, 'min_child_samples': 20, 'reg_lambda': 0.009813818639495486, 'max_depth': 5}. Best is trial 3 with value: 0.8110401366735708.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6852
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 413
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6870
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 409
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042194 s

[I 2025-11-19 22:32:59,847] Trial 4 finished with value: 0.6707014214305805 and parameters: {'learning_rate': 0.054312251742778345, 'n_estimators': 80, 'min_child_samples': 137, 'reg_lambda': 0.0003689868072694603, 'max_depth': 4}. Best is trial 3 with value: 0.8110401366735708.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083951 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10922
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 1503
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11011
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 1508
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.1039

[I 2025-11-19 22:34:28,533] Trial 5 finished with value: 0.7500085408426793 and parameters: {'learning_rate': 0.01963766026092286, 'n_estimators': 159, 'min_child_samples': 41, 'reg_lambda': 0.027093864950836755, 'max_depth': 20}. Best is trial 3 with value: 0.8110401366735708.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.107143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10793
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 1468
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.112305 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10904
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 1477
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.1244

[I 2025-11-19 22:35:29,625] Trial 6 finished with value: 0.8038801170354218 and parameters: {'learning_rate': 0.06918299995507722, 'n_estimators': 203, 'min_child_samples': 42, 'reg_lambda': 0.14795570346975878, 'max_depth': 17}. Best is trial 3 with value: 0.8110401366735708.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6515
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 338
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6567
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 342
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.048874 s

[I 2025-11-19 22:49:47,307] Trial 7 finished with value: 0.7009443953965019 and parameters: {'learning_rate': 0.014590887892621765, 'n_estimators': 498, 'min_child_samples': 160, 'reg_lambda': 2.3777377263103605, 'max_depth': 20}. Best is trial 3 with value: 0.8110401366735708.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.093721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10793
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 1468
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077485 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10904
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 1477
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.0992

[I 2025-11-19 22:50:36,243] Trial 8 finished with value: 0.7721707267649601 and parameters: {'learning_rate': 0.05499703124152114, 'n_estimators': 488, 'min_child_samples': 42, 'reg_lambda': 0.004830483707678875, 'max_depth': 3}. Best is trial 3 with value: 0.8110401366735708.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.097924 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9951
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 1223
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.091640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9952
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 1201
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098743

[I 2025-11-19 22:51:07,696] Trial 9 finished with value: 0.7827405409591891 and parameters: {'learning_rate': 0.08984773866074167, 'n_estimators': 269, 'min_child_samples': 51, 'reg_lambda': 0.016269759705809825, 'max_depth': 5}. Best is trial 3 with value: 0.8110401366735708.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.188101 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19532
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 4532
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.119174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19716
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 4535
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.3541

[I 2025-11-19 22:52:02,765] Trial 10 finished with value: 0.8422721743990832 and parameters: {'learning_rate': 0.09819034231187426, 'n_estimators': 361, 'min_child_samples': 11, 'reg_lambda': 0.00018275146189573424, 'max_depth': 8}. Best is trial 10 with value: 0.8422721743990832.


In [None]:
#extract the best hyperparameter
best_params = study.best_params
best_params

{'learning_rate': 0.0995266871837764,
 'n_estimators': 438,
 'min_child_samples': 10,
 'reg_lambda': 0.03592354513195317,
 'max_depth': 20}