In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore')


In [3]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
from sklearn.metrics import accuracy_score


from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier

In [4]:
wine = load_wine()
X = wine.data
y = wine.target

In [None]:
#Data Cleaning

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2', None]
}

In [7]:
dt = DecisionTreeClassifier()

In [8]:
random_search = RandomizedSearchCV(dt, param_distributions=param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)



In [9]:
print("Best Parameters:", random_search.best_params_)

Best Parameters: {'criterion': 'entropy', 'max_depth': 13, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 9, 'splitter': 'random'}


In [10]:
best_dt = random_search.best_estimator_
y_pred = best_dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [11]:
print("Accuracy on Test Set:", accuracy)

Accuracy on Test Set: 0.8888888888888888


In [12]:
#as required 10 splits are done
shuffle_split = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

In [13]:
random_forest_trees = []

for train_index, _ in shuffle_split.split(X_train):
    subset_X_train, subset_y_train = X_train[train_index], y_train[train_index]

    # Create a decision tree with best hyperparameters found in the previous question
    tree = DecisionTreeClassifier(**random_search.best_params_)
    tree.fit(subset_X_train, subset_y_train)

    # Append the trained tree to the list
    random_forest_trees.append(tree)

In [14]:
test_predictions = []
for tree in random_forest_trees:
    tree_pred = tree.predict(X_test)
    test_predictions.append(tree_pred)

In [15]:
ensemble_predictions = [max(set(pred), key=pred.count) for pred in zip(*test_predictions)]

# Evaluate the ensemble on the test set
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)

print("Accuracy of Random Forest on Test Set:", ensemble_accuracy)

Accuracy of Random Forest on Test Set: 1.0
