In [None]:
from xgboost import XGBRegressor
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from joblib import dump
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import os
from datetime import datetime

# Function to save the model and results
def save_model_and_results(model, mse_train, mae_train, r2_train, mse_test, mae_test, r2_test, cv_mse,
                           results_path=r'...\results'):
    """Persist a model and its evaluation metrics into one directory.

    Two files sharing the same timestamp suffix are written:

    * ``xgboost_model_<timestamp>.joblib`` -- ``model`` serialized with
      ``joblib.dump``.
    * ``xgboost_results_<timestamp>.csv`` -- a single-row table holding the
      metrics passed in, labelled with the model name ``'XGBoost'``.

    Args:
        model: Object to serialize.
        mse_train: Mean squared error on the training set.
        mae_train: Mean absolute error on the training set.
        r2_train: R2 score on the training set.
        mse_test: Mean squared error on the test set.
        mae_test: Mean absolute error on the test set.
        r2_test: R2 score on the test set.
        cv_mse: Cross-validation mean squared error.
        results_path: Directory that receives both files; it is created when
            missing. Defaults to the location that used to be hard-coded, so
            existing calls behave exactly as before.
    """
    os.makedirs(results_path, exist_ok=True)  # Create directory if it doesn't exist

    # One timestamp is shared by both files so a model can be matched to its
    # metrics. Resolution is one second: two calls within the same second
    # produce identical names and the later call overwrites the earlier one.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save model with timestamp
    model_file = os.path.join(results_path, f'xgboost_model_{timestamp}.joblib')
    dump(model, model_file)

    # Save results to CSV with timestamp (one row, no index column)
    results_file = os.path.join(results_path, f'xgboost_results_{timestamp}.csv')
    pd.DataFrame([{
        'Model': 'XGBoost',
        'Train MSE': mse_train,
        'Train MAE': mae_train,
        'Train R2': r2_train,
        'Test MSE': mse_test,
        'Test MAE': mae_test,
        'Test R2': r2_test,
        'Cross-Validation MSE': cv_mse
    }]).to_csv(results_file, index=False)

    print(f"\nModel and results saved to {results_path}.")

# ---------------------------------------------------------------------------
# Data: read the preprocessed CSV and separate the target from the features.
# ---------------------------------------------------------------------------
file_path = r'...\data\standardized_data.csv'
df = pd.read_csv(file_path)

# 'price' is the regression target; every remaining column is a feature.
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------------------------------------------------------------------------
# Model: XGBoost regressor with library-default hyperparameters.
# ---------------------------------------------------------------------------
model = XGBRegressor()

# 5-fold cross-validation on the training portion only.
cv_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
)

# The scorer reports negated MSE, so flip the sign to get a positive MSE.
cv_mse = -cv_scores.mean()
cv_spread = cv_scores.std()
print(f"Cross-Validation MSE: {cv_mse:.2f} ± {cv_spread:.2f}")

# Fit on the full training split before scoring train and test sets.
model.fit(X_train, y_train)


def _regression_metrics(y_true, y_pred):
    """Return (MSE, MAE, R2) for a set of predictions against the truth."""
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# ---------------------------------------------------------------------------
# Evaluation: the same three metrics on the training and the test split.
# ---------------------------------------------------------------------------
y_pred_train = model.predict(X_train)
mse_train, mae_train, r2_train = _regression_metrics(y_train, y_pred_train)

y_pred_test = model.predict(X_test)
mse_test, mae_test, r2_test = _regression_metrics(y_test, y_pred_test)

# Report both splits in the same layout.
print("\nTrain Set Performance:")
print(f'MSE: {mse_train}, MAE: {mae_train}, R2 Score: {r2_train}')
print("\nTest Set Performance:")
print(f'MSE: {mse_test}, MAE: {mae_test}, R2 Score: {r2_test}')

# Persist the fitted model together with every metric computed above.
save_model_and_results(
    model=model,
    mse_train=mse_train,
    mae_train=mae_train,
    r2_train=r2_train,
    mse_test=mse_test,
    mae_test=mae_test,
    r2_test=r2_test,
    cv_mse=cv_mse,
)


Cross-Validation MSE: 0.27 ± 0.01

Train Set Performance:
MSE: 0.17985816032413962, MAE: 0.2855912153495718, R2 Score: 0.8192909202111043

Test Set Performance:
MSE: 0.2699700229472974, MAE: 0.3391957438956845, R2 Score: 0.7349618512682015

Model and results saved to C:\Users\izama\Desktop\machine learning\results.
