In [48]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np

# Fetch the Iris dataset bundled with scikit-learn, then pull out the
# feature matrix (X) and the class labels (y) used by every experiment below.
iris = load_iris()
X = iris.data
y = iris.target

# --- Experiment 1: 75% train / 25% test --------------------------------------
# Reserve a quarter of the samples for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Learn the scaling statistics from the training portion only, then apply
# that same fitted transform to the test portion.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the seeded decision tree and fit it in one step
# (fit() returns the estimator itself, so `dt` is the trained model).
dt = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Evaluate on the held-out samples and report the fraction classified correctly.
y_pred = dt.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (75% training, 25% test):", accuracy)

# --- Experiment 2: 2/3 train / 1/3 test ---------------------------------------
# Same procedure as the 75/25 run, but a third of the samples is held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)

# Re-fit the existing scaler on the new training portion and reuse that fit
# for the test portion.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-train the existing tree on the new split; fitting again replaces the
# previously learned model.
dt.fit(X_train_scaled, y_train)

# Evaluate on the held-out third.
y_pred = dt.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (66.6% training, 33.3% test):", accuracy)

# --- Experiment 3: hold-out method --------------------------------------------
# A single seeded 75/25 partition: train on one part, evaluate on the other.
# The split arguments (test_size=0.25, random_state=42) are the same as in the
# first 75/25 experiment of this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Scaling statistics come from the training portion only.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-train the tree on this split.
dt.fit(X_train_scaled, y_train)

# Evaluate on the held-out quarter.
y_pred = dt.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (hold out method):", accuracy)

# --- Experiment 4: random subsampling -----------------------------------------
# Build a 75/25 partition by shuffling the row indices by hand.
# The permutation is drawn from a seeded generator so that re-running the
# notebook reproduces the same split and the same accuracy; drawing from the
# unseeded global np.random state made this number change on every run,
# unlike the other experiments, which all pin random_state=42.
# NOTE(review): random subsampling is often described as averaging several
# such random splits; this cell evaluates a single draw -- confirm that is
# what the exercise intends.
rng = np.random.default_rng(42)
n_samples = X.shape[0]
n_train = int(0.75 * n_samples)  # 75% for training
indices = rng.permutation(n_samples)

# The first n_train shuffled indices form the training set, the rest the test set.
train_idx, test_idx = indices[:n_train], indices[n_train:]
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Scaling statistics come from the training portion only.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-train the tree on this split.
dt.fit(X_train_scaled, y_train)

# Evaluate on the held-out samples.
y_pred = dt.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (random subsampling):", accuracy)

# --- Experiment 5: 5-fold cross-validation ------------------------------------
# cross_val_score produces one accuracy per fold for the given estimator.
# NOTE(review): the raw X is passed here, not a standardized version, unlike
# the experiments above -- presumably harmless for a decision tree, but confirm
# before swapping in a scale-sensitive model.
scores = cross_val_score(dt, X, y, cv=5)
print("Cross-validation scores:", scores)
print("Average cross-validation accuracy:", scores.mean())


Accuracy (75% training, 25% test): 1.0
Accuracy (66.6% training, 33.3% test): 0.98
Accuracy (hold out method): 1.0
Accuracy (random subsampling): 0.9736842105263158
Cross-validation scores: [0.96666667 0.96666667 0.9        0.93333333 1.        ]
Average cross-validation accuracy: 0.9533333333333334


  and should_run_async(code)
