# Import

In [None]:
import sys
import os
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler



# Import custom file

In [None]:
# `sys` was already imported in the first cell; the repeat is harmless.
import sys
# Make the sibling `scripts` folder importable. The relative path is resolved
# against the current working directory (`os` comes from the first cell), so
# the kernel must be running from the notebook's own folder.
sys.path.append(os.path.abspath('../scripts'))
# Project-local helper modules, used below as `dp` and `m`.
import data_processing as dp
import model as m

# Data loading & cleaning

In [None]:
# Locate the dataset relative to the project layout instead of through a
# user-specific absolute Windows path, so the notebook runs on any checkout.
# The notebook already assumes it is executed one level below the project
# root (the helper modules are imported from '../scripts'), which makes
# '../data' the sibling data folder.
filepath = os.path.abspath(os.path.join('..', 'data', 'machine.csv'))
data = dp.load_and_clean_data(filepath)
data

# Encoding

In [None]:
# Categorical columns handed to the encoder helper as the "label" set.
columns_label = ['sex', 'smoker', 'region']
# The "one-hot" set is the same three columns plus 'children'.
columns_onehot = columns_label + ['children']

# Both column lists are always passed; the first argument names the encoder.
# NOTE(review): presumably only `columns_label` is used when 'labelEncoder'
# is selected - confirm against dp.encoder.
df_label = dp.encoder('labelEncoder', data, columns_label, columns_onehot)
df_label

# Scaling

In [None]:
# Only the 'charges' column is handed to the scaling helper.
# NOTE(review): 'charges' is also used as the prediction target in the
# train/test cell, so the error metrics reported later are presumably in
# scaled units rather than currency - confirm this is intended.
# NOTE(review): scaling happens before the train/test split; if dp.scaler fits
# on the whole frame, test rows influence the scaling - verify.
columns_scaler = ['charges']
df_scaled = dp.scaler('minMaxScaler', df_label, columns_scaler)
df_scaled

# Train & test

In [None]:
# Target first, then every remaining column as the predictors.
y = df_scaled['charges']
X = df_scaled.drop(columns=['charges'])

# The project helper returns the four pieces in this order.
X_train, X_test, y_train, y_test = m.split_data(X, y)
X_train


# Evaluation

In [None]:
# Fit the four regressors through the project helper; the display names in
# `models` follow the same order as the fitted objects.
lr_model, dt_model, rfr_model, xgb_model = m.train_models(X_train, y_train)
models = ['Linear Regression', 'Decision Tree', 'Random Forest', 'XGBoost']

# evaluate_model yields four values per model; the first three are kept as
# MAE, MSE and R2, the fourth is ignored.
results = [
    m.evaluate_model(fitted, X_test, y_test)
    for fitted in (lr_model, dt_model, rfr_model, xgb_model)
]
mae_scores = [mae for mae, _, _, _ in results]
mse_scores = [mse for _, mse, _, _ in results]
r2_scores = [r2 for _, _, r2, _ in results]
    
    

# Prints the performance

In [None]:
# Walk the model names and their three score lists in lockstep.
for model_name, mae, mse, r2 in zip(models, mae_scores, mse_scores, r2_scores):
    print(f"Evaluation results for {model_name}:")
    print(f" - Mean Absolute Error (MAE): {mae}")
    print(f" - Mean Squared Error (MSE): {mse}")
    print(f" - R-squared (R2) Score: {r2}")
    # Blank separator between models (a newline plus print's own newline).
    print("\n")

# bar chart 

In [None]:
# Pass the model labels and their MAE / MSE / R2 lists to the project's
# plotting helper (presumably the bar chart named in the heading - see
# m.plot_metrics for the exact output).
m.plot_metrics(models, mae_scores, mse_scores, r2_scores)

# decision tree visualization

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Draw the fitted decision tree on a wide canvas.
# `feature_names` is passed as a plain list: some scikit-learn releases
# validate this parameter strictly and reject a pandas Index.
plt.figure(figsize=(20, 10))
plot_tree(
    decision_tree=dt_model,
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
)
plt.show()

# random forest visualization

In [None]:
# Pull one fitted tree out of the random forest and draw it.
tree_index = 0
single_tree = rfr_model.estimators_[tree_index]

# `feature_names` is passed as a plain list: some scikit-learn releases
# validate this parameter strictly and reject a pandas Index.
plt.figure(figsize=(20, 10))
plot_tree(
    single_tree,
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
)
plt.show()

# demonstrates model diversity

In [None]:
# Show the first three trees of the forest to demonstrate model diversity.
# Slicing (instead of indexing 0..2) avoids an IndexError when the forest
# holds fewer than three estimators.
for tree_index, single_tree in enumerate(rfr_model.estimators_[:3]):
    plt.figure(figsize=(20, 10))
    # A plain list is used for `feature_names` because some scikit-learn
    # releases reject a pandas Index for this parameter.
    plot_tree(
        single_tree,
        feature_names=list(X_train.columns),
        filled=True,
        rounded=True,
    )
    plt.title(f"Decision Tree {tree_index} from Random Forest")
    plt.show()


# Feature importance

In [None]:
def plot_feature_importance(model, feature_names, model_name):
    """Draw a bar chart of a fitted model's feature importances, largest first.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Labels for the importances, in the same order.
        model_name: Text inserted into the chart title.
    """
    feature_importance = pd.DataFrame(
        model.feature_importances_, index=feature_names, columns=["Importance"]
    ).sort_values(by="Importance", ascending=False)

    # DataFrame.plot opens a figure of its own unless it is given an Axes.
    # Creating the Axes first and passing it in keeps the requested 10x6 size
    # and avoids leaving an empty extra figure behind.
    _, ax = plt.subplots(figsize=(10, 6))
    feature_importance.plot(kind='bar', legend=False, color='skyblue', ax=ax)
    ax.set_title(f'Feature Importance for {model_name}')
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    plt.xticks(rotation=45)
    plt.show()

# Compares which features matter most across different models

In [None]:
# One importance chart per tree-based model, in the same order as before.
for fitted_model, label in (
    (dt_model, "Decision Tree"),
    (rfr_model, "Random Forest"),
    (xgb_model, "XGBoost"),
):
    plot_feature_importance(fitted_model, X_train.columns, label)

# Hyperparameter tuning

In [None]:

from sklearn.model_selection import GridSearchCV
# Search space for the random forest grid search.
# The grid spans 3 * 4 * 3 * 3 * 2 = 216 parameter combinations.
param_grid = {
    'n_estimators': [100, 200, 300],          # number of trees in the forest
    'max_depth': [None, 10, 20, 30],          # maximum tree depth (None = no limit)
    'min_samples_split': [2, 5, 10],          # minimum samples needed to split an internal node
    'min_samples_leaf': [1, 2, 4],            # minimum samples required at a leaf node
    'bootstrap': [True, False]                # whether trees are built on bootstrap samples
}

# RandomForestRegressor


In [None]:
# Configure the 5-fold, R2-scored search over `param_grid` and run it on the
# training split in one step (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=rfr_model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best R-squared Score: {grid_search.best_score_}")

# Evaluate the model with the best parameters

In [None]:
# Evaluate the tuned random forest on the held-out test split.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# The grid search was built with the default refit=True, so best_estimator_
# has already been refitted on the full training split with the winning
# parameters. Fitting it again here would only repeat that work.
best_rfr_model = grid_search.best_estimator_

# Predict on the test split and compute the same three metrics used for the
# untuned models.
y_pred_best = best_rfr_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

# Print the results
print(f"MAE: {mae_best}, MSE: {mse_best}, R2: {r2_best}")


# Visualizes one decision tree inside the tuned Random Forest

In [None]:
# Take the tree from the *tuned* forest. Without this assignment,
# `single_tree` still refers to the last tree plotted from the untuned
# `rfr_model`, so the figure would not show the tuned model at all.
single_tree = grid_search.best_estimator_.estimators_[0]

plt.figure(figsize=(20, 10))
# A plain list is used for `feature_names` because some scikit-learn releases
# reject a pandas Index for this parameter.
plot_tree(
    single_tree,
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
)
plt.show()

# PRINT DECISION RULES

In [None]:
# `_tree` is a private scikit-learn module; it is imported only for the
# TREE_UNDEFINED sentinel used by print_tree_criteria below.
from sklearn.tree import _tree

# Take the refitted best estimator found by the grid search.
best_rfr_model = grid_search.best_estimator_

# Select one fitted tree of that forest for rule printing.
tree_index = 0  # position of the tree inside the forest's estimators_ list
single_tree = best_rfr_model.estimators_[tree_index]

# Print every split rule and leaf value of a fitted decision tree.
def print_tree_criteria(tree, feature_names):
    """Print the split rules and leaf values of ``tree`` in depth-first order.

    For each split node the "<=" rule is printed, then everything in its left
    subtree, then the matching ">" rule, then everything in its right subtree.
    Leaves print their stored value.

    Args:
        tree: Fitted estimator exposing a ``tree_`` attribute.
        feature_names: Indexable collection mapping feature index to name.
    """
    structure = tree.tree_
    undefined = _tree.TREE_UNDEFINED
    labels = [
        "undefined!" if idx == undefined else feature_names[idx]
        for idx in structure.feature
    ]

    # Explicit work stack instead of recursion. Entries are popped in reverse
    # push order, so the left child is pushed last to be handled first.
    pending = [("visit", 0)]  # start from the root node
    while pending:
        action, payload = pending.pop()
        if action == "emit":
            print(payload)
            continue

        node = payload
        if structure.feature[node] == undefined:
            print(f"Leaf node {node}: Predicted value {structure.value[node]}")
            continue

        label = labels[node]
        cut = structure.threshold[node]
        print(f"Node {node}: {label} <= {cut:.2f}")
        pending.append(("visit", structure.children_right[node]))
        pending.append(("emit", f"Node {node}: {label} > {cut:.2f}"))
        pending.append(("visit", structure.children_left[node]))

# Print the split rules and leaf values of the selected tree, labelling
# features with the training frame's column names.
print_tree_criteria(single_tree, X_train.columns)