In [None]:
from lightgbm import LGBMRegressor
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from joblib import dump
from sklearn.model_selection import train_test_split
import os
from datetime import datetime

# Function to save the model and results
def save_model_and_results(model, mse_train, mae_train, r2_train, mse_test, mae_test, r2_test,
                           results_path=r'...\results'):
    """Persist a fitted model and its evaluation metrics to disk.

    Two files are written into ``results_path``, both tagged with the same
    timestamp so a model can be matched to its metrics:

    * ``lightgbm_model_<timestamp>.joblib`` -- the model, serialized with
      ``joblib.dump``.
    * ``lightgbm_results_<timestamp>.csv`` -- a one-row CSV holding the
      train/test MSE, MAE and R2 values.

    Args:
        model: The object to serialize; passed unchanged to ``joblib.dump``.
        mse_train: Mean squared error on the training set.
        mae_train: Mean absolute error on the training set.
        r2_train: R2 score on the training set.
        mse_test: Mean squared error on the test set.
        mae_test: Mean absolute error on the test set.
        r2_test: R2 score on the test set.
        results_path: Directory that receives both files. It is created if
            it does not exist. Callers may override the default to redirect
            the output (for example to a temporary directory).

    Returns:
        None. A confirmation message naming ``results_path`` is printed.
    """
    os.makedirs(results_path, exist_ok=True)  # Create directory if it doesn't exist

    # One timestamp shared by both artifacts. Resolution is one second, so two
    # calls within the same second target the same file names.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save model with timestamp
    model_file = os.path.join(results_path, f'lightgbm_model_{timestamp}.joblib')
    dump(model, model_file)

    # Save results to CSV with timestamp (single row, no index column)
    results_file = os.path.join(results_path, f'lightgbm_results_{timestamp}.csv')
    pd.DataFrame([{
        'Model': 'LightGBM',
        'Train MSE': mse_train,
        'Train MAE': mae_train,
        'Train R2': r2_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test R2': r2_test
    }]).to_csv(results_file, index=False)

    print(f"\nModel and results saved to {results_path}.")

# ---------------------------------------------------------------------------
# Data: read the preprocessed CSV; 'price' is the regression target and all
# remaining columns are used as features.
# ---------------------------------------------------------------------------
file_path = r'...\data\standardized_data.csv'
df = pd.read_csv(file_path)
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------------------------------------------------------------------------
# Model: LightGBM regressor with its default hyperparameters.
# ---------------------------------------------------------------------------
model = LGBMRegressor()
model.fit(X_train, y_train)


def _regression_scores(y_true, y_pred):
    """Return the (MSE, MAE, R2) triple for the given targets and predictions."""
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# ---------------------------------------------------------------------------
# Evaluation: score the fitted model on the rows it was trained on and on the
# held-out rows.
# ---------------------------------------------------------------------------
y_pred_train = model.predict(X_train)
mse_train, mae_train, r2_train = _regression_scores(y_train, y_pred_train)

y_pred_test = model.predict(X_test)
mse_test, mae_test, r2_test = _regression_scores(y_test, y_pred_test)

# Report both sets of metrics.
print("\nTrain Set Performance:")
print(f'MSE: {mse_train}, MAE: {mae_train}, R2 Score: {r2_train}')
print("\nTest Set Performance:")
print(f'MSE: {mse_test}, MAE: {mae_test}, R2 Score: {r2_test}')

# Persist the fitted model together with its metrics.
save_model_and_results(
    model,
    mse_train, mae_train, r2_train,
    mse_test, mae_test, r2_test,
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002245 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1204
[LightGBM] [Info] Number of data points in the train set: 45295, number of used features: 14
[LightGBM] [Info] Start training from score -0.002629

Train Set Performance:
MSE: 0.25037366897972235, MAE: 0.3347926847940165, R2 Score: 0.748441798564184

Test Set Performance:
MSE: 0.2846875233325708, MAE: 0.35405551239939537, R2 Score: 0.7205132135509913

Model and results saved to C:\Users\izama\Desktop\machine learning\results.
