In [None]:
from catboost import CatBoostRegressor
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from joblib import dump
from sklearn.model_selection import train_test_split, GridSearchCV
import os
from datetime import datetime

# Function to save the model and results
def save_model_and_results(
    model,
    train_mse: float,
    train_mae: float,
    train_r2: float,
    test_mse: float,
    test_mae: float,
    test_r2: float,
    results_path: str = r'...\results',
) -> None:
    """Persist a fitted model and its train/test metrics to disk.

    Two files are written into ``results_path``, both tagged with the same
    timestamp (local time, one-second resolution):

    * ``catboost_model_<timestamp>.joblib`` -- the model, serialized with
      ``joblib.dump``.
    * ``catboost_results_<timestamp>.csv`` -- a single-row CSV holding the
      six metrics plus a ``Model`` column fixed to ``'CatBoost'``.

    Args:
        model: The fitted estimator to serialize.
        train_mse: Mean squared error on the training set.
        train_mae: Mean absolute error on the training set.
        train_r2: R2 score on the training set.
        test_mse: Mean squared error on the test set.
        test_mae: Mean absolute error on the test set.
        test_r2: R2 score on the test set.
        results_path: Directory that receives both files. It is created
            (including parents) when missing. Defaults to the location the
            function previously had hard-coded, so existing calls behave
            the same.
    """
    # exist_ok=True makes repeated runs against the same directory safe.
    os.makedirs(results_path, exist_ok=True)

    # One timestamp shared by both artifacts so a model file can be matched
    # to its metrics file by name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    model_file = os.path.join(results_path, f'catboost_model_{timestamp}.joblib')
    dump(model, model_file)

    results_file = os.path.join(results_path, f'catboost_results_{timestamp}.csv')
    metrics_row = {
        'Model': 'CatBoost',
        'Train MSE': train_mse,
        'Train MAE': train_mae,
        'Train R2': train_r2,
        'Test MSE': test_mse,
        'Test MAE': test_mae,
        'Test R2': test_r2,
    }
    # index=False keeps the pandas row index out of the CSV.
    pd.DataFrame([metrics_row]).to_csv(results_file, index=False)

    print(f"\nModel and results saved to {results_path}.")

# Read the preprocessed dataset and separate the target column from the features
file_path = r'...\data\standardized_data.csv'
df = pd.read_csv(file_path)
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate hyperparameter values explored by the grid search
param_grid = {
    'depth': [4, 6, 8],             # tree depth
    'learning_rate': [0.01, 0.1],   # learning rate
    'iterations': [100, 200],       # number of boosting iterations
}

# Base estimator; verbose=0 suppresses CatBoost's output during training
model = CatBoostRegressor(verbose=0)

# Search the grid with 3-fold cross-validation, scored by negative MSE
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Estimator for the best parameter combination found by the search
best_model = grid_search.best_estimator_

# Metrics on the training set (MSE, MAE, R2 -- computed in that order)
y_train_pred = best_model.predict(X_train)
train_mse, train_mae, train_r2 = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Same three metrics on the held-out test set
y_test_pred = best_model.predict(X_test)
test_mse, test_mae, test_r2 = (
    metric(y_test, y_test_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report both evaluations and the winning hyperparameters
print("Train Set Evaluation:")
print(f'MSE: {train_mse}, MAE: {train_mae}, R2 Score: {train_r2}')
print("\nTest Set Evaluation:")
print(f'MSE: {test_mse}, MAE: {test_mae}, R2 Score: {test_r2}')
print(f"\nBest Parameters from Grid Search: {grid_search.best_params_}")

# Write the selected model and its metrics to disk
save_model_and_results(
    best_model,
    train_mse,
    train_mae,
    train_r2,
    test_mse,
    test_mae,
    test_r2,
)


Train Set Evaluation:
MSE: 0.24933714939338283, MAE: 0.3338267163970088, R2 Score: 0.7494832219852454

Test Set Evaluation:
MSE: 0.28452014296528366, MAE: 0.35276230125467134, R2 Score: 0.7206775361753905

Best Parameters from Grid Search: {'depth': 8, 'iterations': 200, 'learning_rate': 0.1}

Model and results saved to C:\Users\izama\Desktop\machine learning\results.
