In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
import joblib
from lightgbm import LGBMClassifier

In [2]:
# Lift pandas' display limits so frames are shown in full. Setting each
# option to None removes the cap on rows, columns and column width, and
# lets pandas auto-detect the overall display width.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [5]:
# Read the already-transformed train and test tables from the working directory.
merged_train, merged_test = map(
    pd.read_csv,
    ("merged_train.csv", "merged_test.csv"),
)

In [6]:
# Split each table into the label ('target') and the feature matrix.
# 'Unnamed: 0' is removed along with the label -- presumably a row index
# written out by an earlier to_csv call; confirm against the export step.
y_train = merged_train['target']
X_train = merged_train.drop(columns=['Unnamed: 0', 'target'])

y_test = merged_test['target']
X_test = merged_test.drop(columns=['Unnamed: 0', 'target'])

In [7]:

# Hyperparameters that get unpacked into the LightGBM classifier.
lightgbm_params = dict(
    n_estimators=100,      # number of boosting iterations
    learning_rate=0.1,     # step size
    num_leaves=31,         # maximum number of leaves in one tree
    max_depth=-1,          # maximum tree depth; -1 means no limit
    scale_pos_weight=1,    # balancing of positive and negative weights
)

In [12]:
# Registry of estimators to train and score, keyed by a display name.
models = {}
models["LightGBM"] = LGBMClassifier(**lightgbm_params)


In [9]:
import joblib

In [11]:
# Function to evaluate model performance
def evaluate_model(model, X_train, y_train, X_test, y_test, model_path="lgbm.pkl"):
    """Fit ``model``, print and return its test-set metrics, and save it to disk.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place by this function.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels; predictions on ``X_test`` are scored
        against ``y_test``.
    model_path : str, path-like or None, default "lgbm.pkl"
        Destination handed to ``joblib.dump`` for the fitted model. Give each
        model its own path when evaluating several in a loop, otherwise they
        all overwrite the same file. ``None`` skips saving altogether.

    Returns
    -------
    tuple
        ``(accuracy, conf_matrix, precision, recall, f1)``.

    Notes
    -----
    Precision, recall and F1 are computed by calling the scikit-learn metric
    functions with their default arguments.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print("\n" + "-"*50 + "\n")

    # Persist the fitted model unless the caller opted out with model_path=None.
    if model_path is not None:
        joblib.dump(model, model_path)

    return accuracy, conf_matrix, precision, recall, f1

# Fit and score every registered model, collecting the metric tuples by name.
results = dict()
for name in models:
    model = models[name]
    print(f"Evaluating {name}...")
    results[name] = evaluate_model(model, X_train, y_train, X_test, y_test)




Evaluating LightGBM...
[LightGBM] [Info] Number of positive: 74033, number of negative: 711100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2528
[LightGBM] [Info] Number of data points in the train set: 785133, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.094294 -> initscore=-2.262302
[LightGBM] [Info] Start training from score -2.262302
Accuracy: 0.978281469707159
Confusion Matrix:
[[232798   4236]
 [  1448  23230]]
Precision: 0.8457729556542635
Recall: 0.9413242564227247
F1 Score: 0.8909941699907947

--------------------------------------------------

