In [1]:

#Practical
Q1
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score

# Seed NumPy's global generator; every estimator below is additionally pinned
# through its own random_state argument.
np.random.seed(42)

# Iris feature matrix and class labels
data = load_iris()
X = data.data
y = data.target

# Bagged ensemble: 50 decision trees, each fitted on a bootstrap sample sized
# at 80% of the training rows
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    random_state=42,
    bootstrap=True,
    max_samples=0.8,
    n_estimators=50,
)

# A single metric (accuracy), keyed so it shows up as 'test_accuracy'
scoring = dict(accuracy=make_scorer(accuracy_score))

# 5-fold cross-validation; training-fold scores are not requested
cv_results = cross_validate(
    bagging, X, y, scoring=scoring, cv=5, return_train_score=False
)

# Summarise the per-fold accuracies
accuracy_mean = cv_results['test_accuracy'].mean()
accuracy_std = cv_results['test_accuracy'].std()

print("Bagging Classifier (Decision Tree) Performance on Iris Dataset:")
print(f"Accuracy: Mean = {accuracy_mean:.4f}, Std = {accuracy_std:.4f}")
Q2
import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error

# Seed NumPy's global generator; the data generator and estimators below also
# carry explicit random_state values.
np.random.seed(42)

# Synthetic regression problem: 1000 samples, 10 features, light noise
X, y = make_regression(
    n_samples=1000, n_features=10, noise=0.1, random_state=42
)

# Bagged ensemble: 50 regression trees, each fitted on a bootstrap sample
# sized at 80% of the training rows
bagging = BaggingRegressor(
    estimator=DecisionTreeRegressor(random_state=42),
    random_state=42,
    bootstrap=True,
    max_samples=0.8,
    n_estimators=50,
)

# greater_is_better=False makes the scorer report negated MSE
scoring = dict(mse=make_scorer(mean_squared_error, greater_is_better=False))

# 5-fold cross-validation; training-fold scores are not requested
cv_results = cross_validate(
    bagging, X, y, scoring=scoring, cv=5, return_train_score=False
)

# Flip the sign back for the mean; the spread is unaffected by the negation
mse_mean = -cv_results['test_mse'].mean()
mse_std = cv_results['test_mse'].std()

print("Bagging Regressor (Decision Tree) Performance:")
print(f"MSE: Mean = {mse_mean:.4f}, Std = {mse_std:.4f}")
Q3
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd

# Breast Cancer dataset: feature matrix and target labels
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a 100-tree random forest on the training split
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# One importance score per input feature, read from the fitted forest
feature_importances = rf_model.feature_importances_

# Pair each feature name with its score, ordered from most to least important
feature_df = pd.DataFrame(
    {'Feature': data.feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Show the ranked table
print(feature_df)

Q4
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# California housing data: feature matrix and regression target
data = fetch_california_housing()
X, y = data.data, data.target

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Single decision tree: fit, predict, score
decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)
y_pred_tree = decision_tree_model.predict(X_test)
mse_tree = mean_squared_error(y_test, y_pred_tree)

# 100-tree random forest: fit, predict, score on the same split
random_forest_model = RandomForestRegressor(random_state=42, n_estimators=100)
random_forest_model.fit(X_train, y_train)
y_pred_forest = random_forest_model.predict(X_test)
mse_forest = mean_squared_error(y_test, y_pred_forest)

# Report both test-set errors side by side
print(f"Decision Tree Model Mean Squared Error (MSE): {mse_tree}")
print(f"Random Forest Model Mean Squared Error (MSE): {mse_forest}")

Q5
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Iris feature matrix and class labels
data = load_iris()
X, y = data.data, data.target

# 70/30 split; only the training portion is fitted below, and the OOB score is
# read from that fitted model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 100-tree random forest with out-of-bag scoring switched on
rf_model = RandomForestClassifier(
    oob_score=True, random_state=42, n_estimators=100
)
rf_model.fit(X_train, y_train)

# The OOB score is exposed on the fitted model
oob_score = rf_model.oob_score_

print(f"Out-of-Bag (OOB) Score: {oob_score}")

Q6
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic binary classification dataset
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Scale features (important for SVM).
# The scaler is NOT fitted here: fitting it on the full dataset before
# cross-validation would let statistics from each held-out fold leak into
# training. It is placed in a pipeline below so it is refitted on the
# training portion of every fold.
scaler = StandardScaler()

# Initialize Bagging Classifier with SVC as base estimator
bagging = BaggingClassifier(
    estimator=SVC(probability=True, kernel='rbf', random_state=42),
    n_estimators=20,  # Fewer estimators due to SVM's computational cost
    max_samples=0.8,  # Fraction of samples to draw
    bootstrap=True,
    random_state=42
)

# Chain scaling and the bagged SVMs into a single estimator
model = make_pipeline(scaler, bagging)

# Define accuracy scoring metric
scoring = {
    'accuracy': make_scorer(accuracy_score)
}

# Perform 5-fold cross-validation on the whole pipeline (scaling + ensemble)
cv_results = cross_validate(model, X, y, cv=5, scoring=scoring, return_train_score=False)

# Extract and print mean and standard deviation of accuracy scores
accuracy_mean = np.mean(cv_results['test_accuracy'])
accuracy_std = np.std(cv_results['test_accuracy'])

print("Bagging Classifier (SVM) Performance:")
print(f"Accuracy: Mean = {accuracy_mean:.4f}, Std = {accuracy_std:.4f}")
Q7
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Iris feature matrix and class labels
data = load_iris()
X, y = data.data, data.target

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Forest sizes to compare, and the test accuracy recorded for each one
accuracies = []
n_trees = [10, 50, 100, 200, 500]

# Fit one forest per size and score it on the held-out split
for n in n_trees:
    rf_model = RandomForestClassifier(random_state=42, n_estimators=n)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Tabulate accuracy against forest size
for n, accuracy in zip(n_trees, accuracies):
    print(f"Number of Trees: {n}, Accuracy: {accuracy}")

# Line chart of accuracy versus number of trees
plt.plot(n_trees, accuracies, marker='o')
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.title('Random Forest Accuracy with Different Numbers of Trees')
plt.grid(True)
plt.show()

Q8
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, roc_auc_score

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic binary classification dataset
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Initialize Bagging Classifier with LogisticRegression as base estimator
bagging = BaggingClassifier(
    estimator=LogisticRegression(random_state=42, max_iter=1000),
    n_estimators=50,  # Number of base estimators
    max_samples=0.8,  # Fraction of samples to draw
    bootstrap=True,
    random_state=42
)


def _auc_from_proba(estimator, X_eval, y_true):
    """Return the ROC AUC of a fitted binary classifier on (X_eval, y_true).

    The score is computed from the predicted probability of the second class
    (column 1 of ``predict_proba``). This replaces
    ``make_scorer(roc_auc_score, needs_proba=True)``: the ``needs_proba``
    argument was deprecated in scikit-learn 1.4 and scheduled for removal in
    1.6, whereas a plain scorer callable is accepted by every version.
    """
    return roc_auc_score(y_true, estimator.predict_proba(X_eval)[:, 1])


# Define AUC scoring metric (callable with the scorer signature
# (estimator, X, y) expected by cross_validate)
scoring = {
    'auc': _auc_from_proba
}

# Perform 5-fold cross-validation
cv_results = cross_validate(bagging, X, y, cv=5, scoring=scoring, return_train_score=False)

# Extract and print mean and standard deviation of AUC scores
auc_mean = np.mean(cv_results['test_auc'])
auc_std = np.std(cv_results['test_auc'])

print("Bagging Classifier (Logistic Regression) Performance:")
print(f"AUC Score: Mean = {auc_mean:.4f}, Std = {auc_std:.4f}")
Q9
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

# California housing data: feature matrix and regression target
data = fetch_california_housing()
X, y = data.data, data.target

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a 100-tree random forest regressor on the training split
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# One importance score per input feature, read from the fitted forest
feature_importances = rf_model.feature_importances_

# Pair each feature name with its score, ordered from most to least important
feature_df = pd.DataFrame(
    {'Feature': data.feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Text report
print("Feature Importance Scores:")
print(feature_df)

# Horizontal bar chart of the same ranking
plt.figure(figsize=(10, 6))
plt.barh(feature_df['Feature'], feature_df['Importance'], color='skyblue')
plt.title('Feature Importance Scores for Random Forest Regressor')
plt.xlabel('Importance')
plt.gca().invert_yaxis()  # flip the axis so the first (largest) bar is on top
plt.show()

Q10
import numpy as np
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score

# Seed NumPy's global generator; the data generator and estimators below also
# carry explicit random_state values.
np.random.seed(42)

# Synthetic binary classification problem: 1000 samples, 20 features
X, y = make_classification(
    n_samples=1000, n_features=20, n_classes=2, random_state=42
)

# The two ensembles under comparison, both built from 50 trees
models = {
    # Bagged decision trees, each on a bootstrap sample of 80% of the rows
    'Bagging Classifier': BaggingClassifier(
        estimator=DecisionTreeClassifier(random_state=42),
        random_state=42,
        bootstrap=True,
        max_samples=0.8,
        n_estimators=50,
    ),
    # Random forest with max_features='sqrt'
    'Random Forest': RandomForestClassifier(
        random_state=42,
        bootstrap=True,
        max_features='sqrt',
        n_estimators=50,
    ),
}

# A single metric (accuracy), keyed so it shows up as 'test_accuracy'
scoring = dict(accuracy=make_scorer(accuracy_score))

# 5-fold cross-validate every model and keep the mean/std of its accuracy
results = {}
for name, model in models.items():
    cv_results = cross_validate(
        model, X, y, scoring=scoring, cv=5, return_train_score=False
    )
    results[name] = dict(
        accuracy_mean=cv_results['test_accuracy'].mean(),
        accuracy_std=cv_results['test_accuracy'].std(),
    )

# Side-by-side report
print("Performance Comparison: Bagging Classifier vs. Random Forest")
for name, metrics in results.items():
    print(f"\nModel: {name}")
    print(f"Accuracy: Mean = {metrics['accuracy_mean']:.4f}, Std = {metrics['accuracy_std']:.4f}")
Q11
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Iris feature matrix and class labels
data = load_iris()
X, y = data.data, data.target

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Base estimator whose hyperparameters are searched
rf_model = RandomForestClassifier(random_state=42)

# Candidate values: number of trees and maximum tree depth
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[10, 20, None],
)

# Exhaustive search over the grid with 3-fold cross-validation, n_jobs=-1
grid_search = GridSearchCV(
    estimator=rf_model, param_grid=param_grid, n_jobs=-1, cv=3
)
grid_search.fit(X_train, y_train)

# Report the winning combination
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Score the best estimator found by the search on the held-out test split
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy with Best Hyperparameters: {accuracy}")

Q12
import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

# Seed NumPy's global generator; the data generator and estimators below also
# carry explicit random_state values.
np.random.seed(42)

# Synthetic regression problem: 1000 samples, 10 features, light noise
X, y = make_regression(
    n_samples=1000, n_features=10, noise=0.1, random_state=42
)

# Ensemble sizes to compare
n_estimators_values = [10, 20, 50, 100, 200]

# Two metrics: negated MSE (greater_is_better=False) and R²
scoring = dict(
    mse=make_scorer(mean_squared_error, greater_is_better=False),
    r2=make_scorer(r2_score),
)

# 5-fold cross-validate one bagged-tree regressor per ensemble size
results = {}
for n_estimators in n_estimators_values:
    bagging = BaggingRegressor(
        estimator=DecisionTreeRegressor(random_state=42),
        random_state=42,
        bootstrap=True,
        max_samples=0.8,  # each tree sees a sample sized at 80% of the rows
        n_estimators=n_estimators,
    )

    cv_results = cross_validate(
        bagging, X, y, scoring=scoring, cv=5, return_train_score=False
    )

    # The MSE scorer is negated, so flip the sign of its mean back
    results[n_estimators] = dict(
        mse_mean=-cv_results['test_mse'].mean(),
        mse_std=cv_results['test_mse'].std(),
        r2_mean=cv_results['test_r2'].mean(),
        r2_std=cv_results['test_r2'].std(),
    )

# Report per ensemble size
print("Bagging Regressor Performance Comparison (Varying n_estimators):")
for n_estimators, metrics in results.items():
    print(f"\nNumber of Estimators: {n_estimators}")
    print(f"MSE: Mean = {metrics['mse_mean']:.4f}, Std = {metrics['mse_std']:.4f}")
    print(f"R²: Mean = {metrics['r2_mean']:.4f}, Std = {metrics['r2_std']:.4f}")
Q13
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Iris feature matrix and class labels
data = load_iris()
X, y = data.data, data.target

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a 100-tree random forest and predict the test split
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Overall test accuracy, shown as a percentage
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Boolean mask that is True wherever the prediction disagrees with the label
misclassified_idx = ~(y_pred == y_test)

# Table of the wrongly predicted rows: features, true label, predicted label
misclassified_samples = pd.DataFrame(
    X_test[misclassified_idx], columns=data.feature_names
).assign(**{
    'True Class': y_test[misclassified_idx],
    'Predicted Class': y_pred[misclassified_idx],
})

print("\nMisclassified Samples:")
print(misclassified_samples)


Q14
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

# Seed NumPy's global generator; the data generator and estimators below also
# carry explicit random_state values.
np.random.seed(42)

# Synthetic binary classification problem: 1000 samples, 20 features
X, y = make_classification(
    n_samples=1000, n_features=20, n_classes=2, random_state=42
)

# A bagged ensemble of 50 trees versus a single decision tree
models = {
    'Bagging Classifier': BaggingClassifier(
        estimator=DecisionTreeClassifier(random_state=42),
        random_state=42,
        bootstrap=True,
        max_samples=0.8,  # each tree sees a sample sized at 80% of the rows
        n_estimators=50,
    ),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

# Weighted-average precision, recall and F1
scoring = dict(
    precision=make_scorer(precision_score, average='weighted'),
    recall=make_scorer(recall_score, average='weighted'),
    f1=make_scorer(f1_score, average='weighted'),
)

# 5-fold cross-validate every model and keep mean/std of each metric
results = {}
for name, model in models.items():
    cv_results = cross_validate(
        model, X, y, scoring=scoring, cv=5, return_train_score=False
    )
    results[name] = dict(
        precision_mean=cv_results['test_precision'].mean(),
        precision_std=cv_results['test_precision'].std(),
        recall_mean=cv_results['test_recall'].mean(),
        recall_std=cv_results['test_recall'].std(),
        f1_mean=cv_results['test_f1'].mean(),
        f1_std=cv_results['test_f1'].std(),
    )

# Side-by-side report
print("Performance Comparison: Bagging Classifier vs. Decision Tree")
for name, metrics in results.items():
    print(f"\nModel: {name}")
    print(f"Precision: Mean = {metrics['precision_mean']:.4f}, Std = {metrics['precision_std']:.4f}")
    print(f"Recall: Mean = {metrics['recall_mean']:.4f}, Std = {metrics['recall_std']:.4f}")
    print(f"F1-Score: Mean = {metrics['f1_mean']:.4f}, Std = {metrics['f1_std']:.4f}")
Q15
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Iris feature matrix and class labels
data = load_iris()
X, y = data.data, data.target

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a 100-tree random forest and predict the test split
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Confusion matrix of true labels against predictions
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap with the class names on both axes
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=data.target_names,
    yticklabels=data.target_names,
)
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.title('Confusion Matrix for Random Forest Classifier')
plt.show()

Q16
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset
data = load_iris()
X = data.data
y = data.target

# Split the dataset into training and testing sets (30% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the base models
dt = DecisionTreeClassifier(random_state=42)
svm = SVC(random_state=42)
lr = LogisticRegression(max_iter=200, random_state=42)

# Create the stacking classifier (logistic regression combines the base models)
stacking_model = StackingClassifier(estimators=[('dt', dt), ('svm', svm), ('lr', lr)], final_estimator=LogisticRegression())

# Create the voting classifier (for comparison) over the same base models
voting_model = VotingClassifier(estimators=[('dt', dt), ('svm', svm), ('lr', lr)], voting='hard')

# Train the stacking classifier
stacking_model.fit(X_train, y_train)

# Train the voting classifier
voting_model.fit(X_train, y_train)

# Predict with both models
y_pred_stacking = stacking_model.predict(X_test)
y_pred_voting = voting_model.predict(X_test)

# Calculate accuracy for both models
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
accuracy_voting = accuracy_score(y_test, y_pred_voting)

# Print the accuracy results
print(f"Accuracy of Stacking Classifier: {accuracy_stacking * 100:.2f}%")
print(f"Accuracy of Voting Classifier: {accuracy_voting * 100:.2f}%")

# Compare accuracies. A tie is reported as a tie: previously equal accuracies
# fell into the else branch and were announced as a win for voting.
if accuracy_stacking > accuracy_voting:
    print("\nStacking Classifier performs better than the Voting Classifier.")
elif accuracy_voting > accuracy_stacking:
    print("\nVoting Classifier performs better than the Stacking Classifier.")
else:
    print("\nStacking Classifier and Voting Classifier achieve the same accuracy.")

Q17
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the Iris dataset
data = load_iris()
X = data.data
y = data.target
feature_names = data.feature_names

# Split the dataset into training and testing sets (30% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Get the feature importance scores
feature_importances = rf_model.feature_importances_

# Create a DataFrame to display the feature names and their importance
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame by importance in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print up to five of the most important features. The count in the header is
# taken from the data so it stays truthful when the dataset has fewer than
# five features.
top_n = min(5, len(importance_df))
print(f"Top {top_n} Most Important Features:")
print(importance_df.head(top_n))

Q18
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

# Seed NumPy's global generator; the data generator and the ensemble below
# also carry explicit random_state values.
np.random.seed(42)

# Synthetic binary classification problem: 1000 samples, 20 features
X, y = make_classification(
    n_samples=1000, n_features=20, n_classes=2, random_state=42
)

# Bagged ensemble: 50 decision trees, each fitted on a bootstrap sample sized
# at 80% of the training rows
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=42,
    bootstrap=True,
    max_samples=0.8,
    n_estimators=50,
)

# Weighted-average precision, recall and F1
scoring = dict(
    precision=make_scorer(precision_score, average='weighted'),
    recall=make_scorer(recall_score, average='weighted'),
    f1=make_scorer(f1_score, average='weighted'),
)

# 5-fold cross-validation; training-fold scores are not requested
cv_results = cross_validate(
    bagging, X, y, scoring=scoring, cv=5, return_train_score=False
)

# Mean and spread of each metric across the folds
precision_mean = cv_results['test_precision'].mean()
precision_std = cv_results['test_precision'].std()
recall_mean = cv_results['test_recall'].mean()
recall_std = cv_results['test_recall'].std()
f1_mean = cv_results['test_f1'].mean()
f1_std = cv_results['test_f1'].std()

print("5-Fold Cross-Validation Results for Bagging Classifier:")
print(f"Precision: Mean = {precision_mean:.4f}, Std = {precision_std:.4f}")
print(f"Recall: Mean = {recall_mean:.4f}, Std = {recall_std:.4f}")
print(f"F1-Score: Mean = {f1_mean:.4f}, Std = {f1_std:.4f}")
Q19
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Iris feature matrix and class labels
data = load_iris()
X, y = data.data, data.target

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Depth limits to sweep (1 through 20) and the test accuracy for each
accuracy_scores = []
max_depth_values = range(1, 21)

# Fit one 100-tree forest per depth limit and score it on the test split
for depth in max_depth_values:
    rf_model = RandomForestClassifier(
        max_depth=depth, random_state=42, n_estimators=100
    )
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

# Line chart of accuracy versus max_depth, with one tick per depth value
plt.figure(figsize=(10, 6))
plt.plot(
    max_depth_values,
    accuracy_scores,
    marker='o',
    markersize=6,
    color='b',
    linestyle='-',
    linewidth=2,
)
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.title('Effect of max_depth on Random Forest Accuracy')
plt.xticks(range(1, 21))
plt.grid(True)
plt.show()

Q20
import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

# Seed NumPy's global generator; the data generator and the ensembles below
# also carry explicit random_state values.
np.random.seed(42)

# Synthetic regression problem: 1000 samples, 10 features, light noise
X, y = make_regression(
    n_samples=1000, n_features=10, noise=0.1, random_state=42
)

# Base learners to wrap in a bagging ensemble
base_estimators = dict(
    DecisionTree=DecisionTreeRegressor(random_state=42),
    KNeighbors=KNeighborsRegressor(n_neighbors=5),
)

# Two metrics: negated MSE (greater_is_better=False) and R²
scoring = dict(
    mse=make_scorer(mean_squared_error, greater_is_better=False),
    r2=make_scorer(r2_score),
)

# 5-fold cross-validate one 50-member bagging regressor per base learner
results = {}
for name, estimator in base_estimators.items():
    bagging = BaggingRegressor(
        estimator=estimator,
        random_state=42,
        bootstrap=True,
        max_samples=0.8,  # each member sees a sample sized at 80% of the rows
        n_estimators=50,
    )

    cv_results = cross_validate(
        bagging, X, y, scoring=scoring, cv=5, return_train_score=False
    )

    # The MSE scorer is negated, so flip the sign of its mean back
    results[name] = dict(
        mse_mean=-cv_results['test_mse'].mean(),
        mse_std=cv_results['test_mse'].std(),
        r2_mean=cv_results['test_r2'].mean(),
        r2_std=cv_results['test_r2'].std(),
    )

# Report per base learner
print("Bagging Regressor Performance Comparison:")
for name, metrics in results.items():
    print(f"\nBase Estimator: {name}")
    print(f"MSE: Mean = {metrics['mse_mean']:.4f}, Std = {metrics['mse_std']:.4f}")
    print(f"R²: Mean = {metrics['r2_mean']:.4f}, Std = {metrics['r2_std']:.4f}")
Q21
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# Iris feature matrix and class labels
data = load_iris()
X, y = data.data, data.target

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a 100-tree random forest on the training split
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Per-class probability estimates for the test rows
y_pred_prob = rf_model.predict_proba(X_test)

# Binarize the test labels over classes 0, 1 and 2
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])

# Macro-averaged ROC-AUC, requested with multi_class='ovr'
roc_auc = roc_auc_score(
    y_test_bin, y_pred_prob, multi_class='ovr', average='macro'
)

print(f"ROC-AUC Score: {roc_auc:.4f}")

Q22
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, f1_score

# Seed NumPy's global generator; the data generator and the ensemble below
# also carry explicit random_state values.
np.random.seed(42)

# Synthetic binary classification problem: 1000 samples, 20 features
X, y = make_classification(
    n_samples=1000, n_features=20, n_classes=2, random_state=42
)

# Bagged ensemble: 50 decision trees, each fitted on a bootstrap sample sized
# at 80% of the training rows
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=42,
    bootstrap=True,
    max_samples=0.8,
    n_estimators=50,
)

# Accuracy and weighted-average F1
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    f1=make_scorer(f1_score, average='weighted'),
)

# 5-fold cross-validation; training-fold scores are not requested
cv_results = cross_validate(
    bagging, X, y, scoring=scoring, cv=5, return_train_score=False
)

# Mean and spread of each metric across the folds
accuracy_mean = cv_results['test_accuracy'].mean()
accuracy_std = cv_results['test_accuracy'].std()
f1_mean = cv_results['test_f1'].mean()
f1_std = cv_results['test_f1'].std()

print(f"5-Fold Cross-Validation Results:")
print(f"Accuracy: Mean = {accuracy_mean:.4f}, Std = {accuracy_std:.4f}")
print(f"F1-Score: Mean = {f1_mean:.4f}, Std = {f1_std:.4f}")
Q23
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# Iris feature matrix and class labels
data = load_iris()
X, y = data.data, data.target

# Reduce to a binary task: label 1 for class 0, label 0 for every other class
y_binary = (y == 0).astype(int)

# Hold out 30% of the rows as a test set (using the binary labels)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.3, random_state=42
)

# Fit a 100-tree random forest on the training split
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Probability column for label 1 (the positive class) on the test rows
y_pred_prob = rf_model.predict_proba(X_test)[:, 1]

# Points of the precision-recall curve across decision thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

# Single-number summary of the curve
avg_precision = average_precision_score(y_test, y_pred_prob)
print(f"Average Precision Score: {avg_precision:.4f}")

# Draw precision against recall, with the AP value in the legend
plt.figure(figsize=(8, 6))
plt.plot(
    recall,
    precision,
    marker='.',
    color='b',
    label=f'Random Forest (AP={avg_precision:.4f})',
)
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='best')
plt.grid(True)
plt.show()

Q24
# Import necessary libraries
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Fetch Iris and unpack it into a feature matrix and a class-label vector.
data = load_iris()
X, y = data.data, data.target

# Hold out 30% of the samples for evaluation (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# --- Build the models -------------------------------------------------------
# Base learners whose predictions are fed to the stack's final estimator.
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('lr', LogisticRegression(max_iter=200, random_state=42)),
]

# Stacked ensemble with a logistic-regression final estimator on top.
stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
)

# Standalone models with the same settings as the base learners, used as
# baselines to compare against the stack.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression(max_iter=200, random_state=42)

# --- Fit on the training split and predict the test split -------------------
y_pred_stacking = stacking_model.fit(X_train, y_train).predict(X_test)
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_test)
y_pred_lr = lr_model.fit(X_train, y_train).predict(X_test)

# --- Score each model by test-set accuracy ----------------------------------
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

# --- Report -----------------------------------------------------------------
print(f"Accuracy of Stacking Classifier: {accuracy_stacking:.4f}")
print(f"Accuracy of Random Forest Classifier: {accuracy_rf:.4f}")
print(f"Accuracy of Logistic Regression: {accuracy_lr:.4f}")

# Q25
import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so repeated runs of this cell behave the same.
np.random.seed(42)

# Synthetic regression problem: 1000 samples, 10 features, noise level 0.1.
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=42)

# Keep 20% of the samples aside as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fractions of the training set drawn for each base estimator, and the
# test-set MSE recorded for each fraction (filled in the same order).
max_samples_values = [0.1, 0.3, 0.5, 0.7, 1.0]
mse_scores = []

# Sweep over max_samples: fit a 50-tree bagged ensemble per value and score it.
for max_samples in max_samples_values:
    bagging = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        n_estimators=50,
        max_samples=max_samples,
        bootstrap=True,
        random_state=42,
    )

    # Fit on the training split, then predict the held-out samples.
    y_pred = bagging.fit(X_train, y_train).predict(X_test)

    # Record and report the test error for this setting.
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)
    print(f"max_samples={max_samples:.1f}, MSE={mse:.4f}")

# Visualise how the test MSE changes with the sampled fraction.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(
    max_samples_values,
    mse_scores,
    marker='o',
    linestyle='-',
    color='#1f77b4',
)
ax.set_xlabel('max_samples')
ax.set_ylabel('Mean Squared Error')
ax.set_title('Bagging Regressor Performance vs. max_samples')
ax.grid(True)
plt.show()

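# Captured output from running this cell:
#   NameError: name 'Q1' is not defined
# (The bare `Q1` section label at the top of the cell is evaluated as a
# variable name; section labels must be written as comments, e.g. `# Q1`.)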