<a href="https://colab.research.google.com/github/Lee-Minsoo-97/Machine-Learning/blob/main/Decision_Tree_%26_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decision Tree Classifier

In [None]:
#install
#!pip install scikit-optimize
# Data handling
import pandas as pd
import numpy as np

# Data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Model building
from skopt.space import Integer, Categorical
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV  # Bayesian optimization

# Model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Other
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# raised after this cell is hidden -- which can include library deprecation
# or fit-failure warnings emitted during the hyperparameter searches.
# Consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Link to Google Drive: mount it at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import data
# NOTE(review): absolute Google Drive paths -- adjust them for your own environment.
path_train = '/content/drive/MyDrive/Colab Notebooks/CIS508_Machine_Learning/Individual_Assignment/Insurance Fraud - TRAIN-3000.csv'
path_test = '/content/drive/MyDrive/Colab Notebooks/CIS508_Machine_Learning/Individual_Assignment/Insurance Fraud -TEST-12900.csv'

df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

# Separate features and target in the training data
X_train = df_train.drop("FRAUDFOUND", axis=1)
y_train = df_train["FRAUDFOUND"]

# Apply the same transformation to the test data
X_test = df_test.drop("FRAUDFOUND", axis=1)
y_test = df_test["FRAUDFOUND"]

# Encoding categorical variables.
# The training set defines the feature space: one dummy column per category
# level, with the first level of each column dropped as the baseline.
categorical_features = X_train.select_dtypes(include=["object"]).columns
X_train = pd.get_dummies(X_train, columns=categorical_features, drop_first=True)

# The test set is encoded WITHOUT drop_first. With drop_first=True the level
# that gets dropped is the first one present in *that* frame, so a test set
# missing the training baseline level would drop a different level; the
# reindex below would then re-create that column filled with zeros and
# silently mis-encode those rows. Keeping every test level and letting the
# reindex pick the training columns avoids this.
X_test = pd.get_dummies(X_test, columns=categorical_features)

# Align test to the training columns: levels unseen in training (and the
# training baseline levels) are discarded; training columns absent from the
# test data are added as all-zero columns.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Convert Target Labels to Numeric
# NOTE: Series.map leaves NaN for any label other than 'No' / 'Yes'.
y_train = y_train.map({'No': 0, 'Yes': 1})
y_test = y_test.map({'No': 0, 'Yes': 1})


In [None]:
# Apply SMOTE to balance the training data (only X_train / y_train are
# resampled here; X_test / y_test are not passed to SMOTE).
# NOTE(review): the searches further down cross-validate on this already
# resampled data, so validation folds presumably contain synthetic samples
# and CV scores may be optimistic -- confirm against the test-set metrics.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Base decision tree used as the estimator for every search: fixed seed and
# balanced class weights.
dt = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=42,
)

# Expanded hyperparameter grid for the grid and randomized searches.
# 'auto' is intentionally absent from max_features: it is rejected by current
# scikit-learn releases (and was only an alias of 'sqrt' for classifiers
# before that). None (= consider all features) is searched instead, matching
# the Bayesian search space.
dt_param_grid_expanded = {
    'max_depth': range(1, 51, 10),                   # 1, 11, 21, 31, 41 -- coarse steps to limit combinations
    'min_samples_split': range(2, 21, 2),            # Even values 2..20: minimum samples to split a node
    'min_samples_leaf': range(1, 11, 2),             # Odd values 1..9: minimum samples at a leaf
    'criterion': ['gini', 'entropy'],                # Split quality criteria
    'max_features': ['sqrt', 'log2', None]           # Feature subset at each split (None = all features)
}

In [None]:

# 1) Exhaustive grid search: every combination in dt_param_grid_expanded is
#    scored with 5-fold CV on F1 (slow, the grid grows combinatorially).
grid_search_dt_expanded = GridSearchCV(
    estimator=dt,
    param_grid=dt_param_grid_expanded,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_search_dt_expanded.fit(X_train_resampled, y_train_resampled)
print("Decision Tree - Expanded Grid Search Best Parameters:", grid_search_dt_expanded.best_params_)

# 2) Randomized search: only n_iter=20 candidates are drawn from the same
#    grid, which bounds the runtime.
random_search_dt_expanded = RandomizedSearchCV(
    estimator=dt,
    param_distributions=dt_param_grid_expanded,
    n_iter=20,
    scoring='f1',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search_dt_expanded.fit(X_train_resampled, y_train_resampled)
print("Decision Tree - Expanded Random Search Best Parameters:", random_search_dt_expanded.best_params_)

# 3) Bayesian search: needs skopt dimension objects instead of plain lists.
#    'auto' is left out of max_features because it is invalid for
#    DecisionTreeClassifier.
bayes_param_grid_expanded = dict(
    max_depth=Integer(1, 50),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(1, 10),
    criterion=Categorical(['gini', 'entropy']),
    max_features=Categorical(['sqrt', 'log2', None]),
)

bayes_search_dt_expanded = BayesSearchCV(
    estimator=dt,
    search_spaces=bayes_param_grid_expanded,
    n_iter=30,
    scoring='f1',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
bayes_search_dt_expanded.fit(X_train_resampled, y_train_resampled)
print("Decision Tree - Expanded Bayesian Search Best Parameters:", bayes_search_dt_expanded.best_params_)

Decision Tree - Expanded Grid Search Best Parameters: {'criterion': 'gini', 'max_depth': 21, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 6}
Decision Tree - Expanded Random Search Best Parameters: {'min_samples_split': 14, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 21, 'criterion': 'entropy'}
Decision Tree - Expanded Bayesian Search Best Parameters: OrderedDict([('criterion', 'gini'), ('max_depth', 27), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2)])


In [None]:
# Build one decision tree per search strategy from that search's best
# parameters and train it on the resampled training data.
# (fit() returns the estimator itself, so construction and training are chained.)

# Tree tuned by the expanded grid search
dt_grid_expanded = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
    **grid_search_dt_expanded.best_params_,
).fit(X_train_resampled, y_train_resampled)

# Tree tuned by the expanded randomized search
dt_random_expanded = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
    **random_search_dt_expanded.best_params_,
).fit(X_train_resampled, y_train_resampled)

# Tree tuned by the expanded Bayesian search
dt_bayes_expanded = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
    **bayes_search_dt_expanded.best_params_,
).fit(X_train_resampled, y_train_resampled)

# Helper that scores a fitted model on a held-out set
def evaluate_model(model, X_test, y_test):
    """Predict on ``X_test`` and score the predictions against ``y_test``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True labels the predictions are compared with.

    Returns:
        dict: Metric name -> score, with the keys "Accuracy", "Precision",
        "Recall" and "F1 Score" in that order.
    """
    predictions = model.predict(X_test)
    # Ordered (label, scorer) pairs; the order defines the column order of
    # any DataFrame built from the returned dict.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    return {label: scorer(y_test, predictions) for label, scorer in scorers}

# Score each tuned decision tree on the test set
print("Evaluating Expanded Grid Search Model")
grid_results_expanded = evaluate_model(model=dt_grid_expanded, X_test=X_test, y_test=y_test)

print("Evaluating Expanded Random Search Model")
random_results_expanded = evaluate_model(model=dt_random_expanded, X_test=X_test, y_test=y_test)

print("Evaluating Expanded Bayesian Search Model")
bayes_results_expanded = evaluate_model(model=dt_bayes_expanded, X_test=X_test, y_test=y_test)

# One row per search strategy, one column per metric
comparison_df_expanded = pd.DataFrame(
    data=[
        grid_results_expanded,
        random_results_expanded,
        bayes_results_expanded,
    ],
    index=[
        "Expanded Grid Search",
        "Expanded Random Search",
        "Expanded Bayesian Search",
    ],
)
print("\nComparison of Decision Tree Classifiers with Expanded Grid:")
print(comparison_df_expanded)


Evaluating Expanded Grid Search Model
Evaluating Expanded Random Search Model
Evaluating Expanded Bayesian Search Model

Comparison of Decision Tree Classifiers with Expanded Grid:
                          Accuracy  Precision    Recall  F1 Score
Expanded Grid Search      0.859421   0.157128  0.606426  0.249587
Expanded Random Search    0.858956   0.140608  0.520080  0.221368
Expanded Bayesian Search  0.864453   0.207653  0.893574  0.336994


- **Why `max_depth` from 1 to 20?** This range is typically a good starting point for tuning. In practice, trees deeper than 20 tend to overfit unless you have a large, complex dataset. If your data is simpler or you find overfitting, you might reduce the upper limit. Note that the searches above explore depths up to 50.


- **Why `min_samples_split` from 2 to 10?** This range provides flexibility from a highly detailed tree (`min_samples_split=2`) to a simpler, more generalized tree. It is common to start with 2 and test up to around 10, but the range can be adjusted based on data size and noise levels. Note that the searches above extend this range up to 20.

# Random Forest Classifier

In [None]:

# Hyperparameter grid for RandomForestClassifier (grid and randomized searches).
# Kept deliberately small because every combination is fitted by the grid search.
# 'auto' is intentionally absent from max_features: current scikit-learn
# releases raise InvalidParameterError for it (valid strings are 'sqrt' and
# 'log2'). None (= consider all features) is searched instead, matching the
# Bayesian search space.
rf_param_grid_expanded = {
    'n_estimators': [50, 100, 200],            # Reduced number of trees
    'max_depth': [10, 20, 30],                 # Fewer values for depth
    'min_samples_split': [2, 10, 20],          # Limited split options
    'min_samples_leaf': [1, 5, 10],            # Limited leaf options
    'criterion': ['gini', 'entropy'],          # Criterion for split quality
    'max_features': ['sqrt', 'log2', None]     # Feature subset per split (None = all features)
}

In [None]:
# Base random forest shared by the searches: fixed seed, balanced class weights
rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
)

# Exhaustive grid search over rf_param_grid_expanded, 3-fold CV scored on F1
grid_search_rf_expanded = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid_expanded,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_search_rf_expanded.fit(X_train_resampled, y_train_resampled)
print("Random Forest - Expanded Grid Search Best Parameters:", grid_search_rf_expanded.best_params_)

# Randomized search: 10 candidates drawn from the same grid
random_search_rf_expanded = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_grid_expanded,
    n_iter=10,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
random_search_rf_expanded.fit(X_train_resampled, y_train_resampled)
print("Random Forest - Expanded Random Search Best Parameters:", random_search_rf_expanded.best_params_)


Random Forest - Expanded Grid Search Best Parameters: {'criterion': 'gini', 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Random Forest - Expanded Random Search Best Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'criterion': 'gini'}


InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.

In [None]:
# Re-display the best parameters found by the random forest grid and
# randomized searches (reads the already-fitted search objects; nothing is refit).
print("Random Forest - Expanded Grid Search Best Parameters:", grid_search_rf_expanded.best_params_)
print("Random Forest - Expanded Random Search Best Parameters:", random_search_rf_expanded.best_params_)



Random Forest - Expanded Grid Search Best Parameters: {'criterion': 'gini', 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Random Forest - Expanded Random Search Best Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'criterion': 'gini'}


In [None]:
# Search space for the Bayesian optimisation of the random forest, expressed
# as skopt dimensions. max_features offers None in place of 'auto'.
bayes_rf_param_grid_expanded = dict(
    n_estimators=Integer(50, 300),
    max_depth=Integer(10, 50),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(1, 10),
    criterion=Categorical(['gini', 'entropy']),
    max_features=Categorical(['sqrt', 'log2', None]),
)

# Bayesian search: 15 iterations, 3-fold CV scored on F1
bayes_search_rf_expanded = BayesSearchCV(
    estimator=rf,
    search_spaces=bayes_rf_param_grid_expanded,
    n_iter=15,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
bayes_search_rf_expanded.fit(X_train_resampled, y_train_resampled)
print("Random Forest - Expanded Bayesian Search Best Parameters:", bayes_search_rf_expanded.best_params_)

Random Forest - Expanded Bayesian Search Best Parameters: OrderedDict([('criterion', 'gini'), ('max_depth', 50), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 18), ('n_estimators', 231)])


In [None]:
# Build one random forest per search strategy from that search's best
# parameters and train it on the resampled training data.
# (fit() returns the estimator itself, so construction and training are chained.)

# Forest tuned by the grid search
rf_grid_expanded = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    **grid_search_rf_expanded.best_params_,
).fit(X_train_resampled, y_train_resampled)

# Forest tuned by the randomized search
rf_random_expanded = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    **random_search_rf_expanded.best_params_,
).fit(X_train_resampled, y_train_resampled)

# Forest tuned by the Bayesian search
rf_bayes_expanded = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    **bayes_search_rf_expanded.best_params_,
).fit(X_train_resampled, y_train_resampled)

# Score each tuned random forest on the test set
print("Evaluating Expanded Grid Search Model")
grid_results_rf_expanded = evaluate_model(model=rf_grid_expanded, X_test=X_test, y_test=y_test)

print("Evaluating Expanded Random Search Model")
random_results_rf_expanded = evaluate_model(model=rf_random_expanded, X_test=X_test, y_test=y_test)

print("Evaluating Expanded Bayesian Search Model")
bayes_results_rf_expanded = evaluate_model(model=rf_bayes_expanded, X_test=X_test, y_test=y_test)

# One row per search strategy, one column per metric
comparison_df_rf_expanded = pd.DataFrame(
    data=[
        grid_results_rf_expanded,
        random_results_rf_expanded,
        bayes_results_rf_expanded,
    ],
    index=[
        "Expanded Grid Search",
        "Expanded Random Search",
        "Expanded Bayesian Search",
    ],
)
print("\nComparison of Random Forest Classifiers with Expanded Grid:")
print(comparison_df_rf_expanded)

Evaluating Expanded Grid Search Model
Evaluating Expanded Random Search Model
Evaluating Expanded Bayesian Search Model

Comparison of Random Forest Classifiers with Expanded Grid:
                          Accuracy  Precision    Recall  F1 Score
Expanded Grid Search      0.947205   0.410680  0.849398  0.553665
Expanded Random Search    0.932265   0.323336  0.692771  0.440895
Expanded Bayesian Search  0.934742   0.297297  0.508032  0.375093
