In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
from joblib import dump
from datetime import datetime

# Read the standardized dataset; every column except 'price' is a feature
data = pd.read_csv(r'...\data\standardized_data.csv')
X = data.drop(columns='price')
y = data['price']

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Untuned tree: serves as the cross-validation baseline and as the grid-search template
dt_regressor = DecisionTreeRegressor(random_state=42)

# 5-fold CV on the training split; the scorer returns negated MSE, so flip the sign
cv_scores = -cross_val_score(
    dt_regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
)
print(f"Cross-Validation MSE: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

# Candidate hyperparameters explored exhaustively by the grid search
param_grid = {
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Same folds/scoring as the baseline above; n_jobs=-1 uses all available cores
grid_search = GridSearchCV(
    estimator=dt_regressor,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Estimator refit with the best parameter combination found
best_model = grid_search.best_estimator_

# The three metrics reported for each split, in (MSE, MAE, R2) order
metric_fns = (mean_squared_error, mean_absolute_error, r2_score)

# Score the tuned model on the data it was fitted on
train_predictions = best_model.predict(X_train)
train_mse, train_mae, train_r2 = (
    metric(y_train, train_predictions) for metric in metric_fns
)

# Score the tuned model on the held-out split
test_predictions = best_model.predict(X_test)
test_mse, test_mae, test_r2 = (
    metric(y_test, test_predictions) for metric in metric_fns
)

# Report the chosen parameters and both sets of metrics
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Training MSE: {train_mse:.2f}, Training MAE: {train_mae:.2f}, Training R2: {train_r2:.2f}")
print(f"Testing MSE: {test_mse:.2f}, Testing MAE: {test_mae:.2f}, Testing R2: {test_r2:.2f}")

# Function to save the model and results
def save_model_and_results(model, train_mse, train_mae, train_r2, test_mse, test_mae, test_r2,
                           results_path=r'...\results'):
    """Write a fitted model and its evaluation metrics to ``results_path``.

    Two files sharing one timestamp (``YYYYmmdd_HHMMSS``, local time) are
    created so a model can be matched with its metrics:

    * ``decision_tree_model_<timestamp>.joblib`` -- the model, serialized
      with ``joblib.dump``.
    * ``decision_tree_results_<timestamp>.csv`` -- a single-row CSV holding
      the model label and the six metric values, written without an index
      column.

    Args:
        model: Fitted estimator to serialize.
        train_mse: Mean squared error on the training set.
        train_mae: Mean absolute error on the training set.
        train_r2: R2 score on the training set.
        test_mse: Mean squared error on the test set.
        test_mae: Mean absolute error on the test set.
        test_r2: R2 score on the test set.
        results_path: Directory that receives both files. It is created
            (including missing parents) when absent. Defaults to the
            location this function previously had hard-coded, so existing
            calls behave as before.

    Returns:
        None. A confirmation line naming ``results_path`` is printed.
    """
    os.makedirs(results_path, exist_ok=True)  # Create directory if it doesn't exist

    # One timestamp for both artifacts. Resolution is one second, so two
    # calls within the same second produce the same file names.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save model with timestamp
    model_file = os.path.join(results_path, f'decision_tree_model_{timestamp}.joblib')
    dump(model, model_file)

    # Save results to CSV with timestamp
    results_file = os.path.join(results_path, f'decision_tree_results_{timestamp}.csv')
    metrics_row = {
        'Model': 'Decision Tree',
        'Train MSE': train_mse,
        'Train MAE': train_mae,
        'Train R2': train_r2,
        'Test MSE': test_mse,
        'Test MAE': test_mae,
        'Test R2': test_r2,
    }
    pd.DataFrame([metrics_row]).to_csv(results_file, index=False)

    print(f"\nModel and results saved to {results_path}.")

# Persist the tuned model together with its train/test metrics.
# Keyword arguments guard against mixing up the six same-typed metric values.
save_model_and_results(
    model=best_model,
    train_mse=train_mse,
    train_mae=train_mae,
    train_r2=train_r2,
    test_mse=test_mse,
    test_mae=test_mae,
    test_r2=test_r2,
)


Cross-Validation MSE: 0.54 ± 0.02
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Training MSE: 0.32, Training MAE: 0.38, Training R2: 0.68
Testing MSE: 0.41, Testing MAE: 0.42, Testing R2: 0.60

Model and results saved to C:\Users\izama\Desktop\machine learning\results.
