In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
# Step 1: Load the wine dataset that ships with scikit-learn
wine_data = load_wine()
# Pull the feature matrix and the class labels off the returned object
X = wine_data.data
y = wine_data.target

In [3]:
# Step 2: Hold out 20% of the samples as a test set (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Step 3: Candidate hyperparameter values that RandomizedSearchCV samples from
# when tuning the Decision Tree
param_dist = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [5]:
# Seed the base estimator. Decision tree fitting is randomized (and the search
# space includes splitter='random'), so without a fixed random_state the
# cross-validation scores -- and therefore the selected hyperparameters -- can
# change from one run of the notebook to the next.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Sample 100 candidate settings from param_dist, score each with 5-fold
# cross-validation, and run the fits on all available cores (n_jobs=-1).
random_search = RandomizedSearchCV(
    dt_classifier,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)


In [6]:
# Keep the winning hyperparameter combination from the search and report it
best_params = random_search.best_params_
print(
    "Best Hyperparameters for Decision Tree:",
    best_params,
)

Best Hyperparameters for Decision Tree: {'splitter': 'random', 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None, 'criterion': 'gini'}


In [8]:
# Retrain a Decision Tree on the full training split with the tuned hyperparameters.
# A fixed random_state is supplied because tree fitting is randomized (the tuned
# parameters may include splitter='random'); left unseeded, the test accuracy
# reported for this model would vary between runs.
# param_dist does not tune random_state, so it cannot collide with **best_params.
best_dt_classifier = DecisionTreeClassifier(random_state=42, **best_params)
best_dt_classifier.fit(X_train, y_train)

In [9]:
# Score the tuned Decision Tree on the held-out test split
y_pred_dt = best_dt_classifier.predict(X_test)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print("Accuracy of Decision Tree:", accuracy_dt)


Accuracy of Decision Tree: 0.9166666666666666


In [10]:
# Step 4: Set up random subsets of the training data for a hand-built ensemble
# of decision trees (one tree per subset)
n_subsets = 10
subset_generator = ShuffleSplit(
    n_splits=n_subsets,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Collect one fitted Decision Tree per random subset of the training data
decision_trees = []

# Only the "train" side of each split is used; the held-out indices are discarded.
for tree_seed, (train_index, _) in enumerate(subset_generator.split(X_train)):
    # Give every tree its own fixed seed: the ensemble is then reproducible
    # across runs, while the trees still use different random draws.
    subset_dt = DecisionTreeClassifier(random_state=tree_seed, **best_params)
    subset_dt.fit(X_train[train_index], y_train[train_index])
    decision_trees.append(subset_dt)

In [13]:
# Gather each tree's predicted labels for the test split, in training order
ensemble_predictions = [
    tree.predict(X_test)
    for tree in decision_trees
]

In [14]:
# Test accuracy of every individual tree, one entry per tree
ensemble_accuracies = [
    accuracy_score(y_test, tree_predictions)
    for tree_predictions in ensemble_predictions
]

In [15]:
# Show the tuned single tree's accuracy next to the per-tree accuracies of the
# subset-trained trees
print("Accuracy of Decision Tree:", accuracy_dt)
print(
    "Accuracy of Individual Decision Trees in the Ensemble:",
    ensemble_accuracies,
)

Accuracy of Decision Tree: 0.9166666666666666
Accuracy of Individual Decision Trees in the Ensemble: [0.9166666666666666, 0.9166666666666666, 0.8888888888888888, 0.8888888888888888, 0.8333333333333334, 0.9166666666666666, 0.9166666666666666, 0.9166666666666666, 0.9166666666666666, 0.9166666666666666]
