Consider the Wine Quality Dataset and use one of its two datasets: red wine or white wine.

The purpose of this exercise is to implement Bagging and Boosting strategies.

Also, at the end, you should try to understand the impact of the feature selection process. Does including all the features, versus selecting only some of them, have an impact on the overall model performance?



In [None]:
# Load and Preprocess the Data

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Read the white-wine file of the Wine Quality dataset. The file is
# semicolon-delimited and is expected in the current working directory.
data = pd.read_csv(filepath_or_buffer="winequality-white.csv", sep=";")

# Preview the first five rows (the default of head()) as a quick sanity check.
print(data.head(n=5))

In [None]:
# Print pandas' summary of the loaded DataFrame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Descriptive statistics of the loaded columns. Kept as the last bare
# expression of the cell so the notebook renders the resulting table.
data.describe()

In [None]:
# Features and target
#
# Exercise: turn the task into a binary problem by keeping only two of the
# wine-quality classes. Pick ONE of the following options:
#   - qualities 3 and 8 -> imbalanced problem
#   - qualities 5 and 7 -> balanced problem
#
# NOTE: every '?' below is a placeholder that you must replace with your own
# code; this cell raises an error until the placeholders are filled in.

filtered_data = '?'

# Display the first few rows of the filtered data
print(filtered_data.head())

# Separate the target variable into its own vector, apart from the features.
# Later cells call X.columns and y.unique(), so keep X as a DataFrame and
# y as a Series.
X = '?'
y = '?'


# Split into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(  '?'   )

In [None]:
# Try different cross-validation strategies (e.g. k-fold or stratified k-fold) and compare the resulting scores

In [None]:
# Bagging: fit a random forest of 20 entropy-criterion trees, each limited to
# depth 10, with a fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=20,
    criterion="entropy",
    max_depth=10,
    random_state=0,
)
rf_model.fit(X_train, y_train)

# Exercise placeholders: replace each '?' with the corresponding computation.
y_pred_rf = '?'     # predictions of rf_model

accuracy_rf = '?'   # accuracy

f1_rf = '?'         # F1-score

precision_rf = '?'  # precision

recall_rf = '?'     # recall

cm_rf = '?'         # confusion matrix (plotted in a later cell)

# Report the Random Forest (Bagging) metrics with four decimal places.
print("Random Forest (Bagging) Performance Metrics:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")



In [None]:
# Gradient Boosting (Boosting)
# Fit 100 boosting stages with the log-loss objective and a learning rate of 0.9.
# random_state is fixed so that repeated runs give the same metrics, in line
# with the seeded Random Forest model.
gb_model = GradientBoostingClassifier(
    loss='log_loss',
    learning_rate=0.9,
    n_estimators=100,
    random_state=0,
)
gb_model.fit(X_train, y_train)

# Exercise placeholders: replace each '?' with the corresponding computation.

# Make predictions
y_pred_gb = '?'

# Accuracy
accuracy_gb = '?'

# F1-Score
f1_gb = '?'

# Precision
precision_gb = '?'

# Recall
recall_gb = '?'

# Confusion Matrix (plotted in a later cell)
cm_gb = '?'


# Print metrics for Gradient Boosting (Boosting), four decimal places each
print("Gradient Boosting (Boosting) Performance Metrics:")
print(f"Accuracy: {accuracy_gb:.4f}")
print(f"F1-Score: {f1_gb:.4f}")
print(f"Precision: {precision_gb:.4f}")
print(f"Recall: {recall_gb:.4f}")



In [None]:
# Plot one annotated confusion-matrix heatmap per model (Random Forest first,
# then Gradient Boosting), using the sorted class values of y as tick labels.
for matrix, heading in (
    (cm_rf, 'Confusion Matrix - Random Forest (Bagging)'),
    (cm_gb, 'Confusion Matrix - Gradient Boosting (Boosting)'),
):
    plt.figure(figsize=(6, 6))
    tick_labels = sorted(y.unique())
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.title(heading)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()



In [None]:
# AdaBoost (Boosting)
# Create an AdaBoost model that uses a depth-10 DecisionTreeClassifier as its
# weak learner.
# NOTE: the weak learner is passed as `estimator` (scikit-learn >= 1.2); the
# former `base_estimator` keyword was deprecated in 1.2 and removed in 1.4.
# random_state is fixed so that repeated runs give the same metrics.
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=10),
    n_estimators=100,
    learning_rate=1,
    random_state=42,
)

# Train the model
ada_model.fit(X_train, y_train)

# Exercise placeholders: replace each '?' with the corresponding computation.

# Make predictions
y_pred_ada = '?'


# Evaluate the model
# Accuracy
accuracy_ada = '?'

# F1-Score
f1_ada = '?'

# Precision
precision_ada = '?'

# Recall
recall_ada = '?'

# Confusion Matrix (plotted in a later cell)
cm_ada = '?'


# Print metrics for AdaBoost (Boosting), four decimal places each
print("ADABoost (Boosting) Performance Metrics:")
print(f"Accuracy: {accuracy_ada:.4f}")
print(f"F1-Score: {f1_ada:.4f}")
print(f"Precision: {precision_ada:.4f}")
print(f"Recall: {recall_ada:.4f}")



In [None]:
# Annotated confusion-matrix heatmap for AdaBoost; both axes are labelled with
# the sorted class values of y.
plt.figure(figsize=(6, 6))
ada_tick_labels = sorted(y.unique())
sns.heatmap(
    cm_ada,
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=ada_tick_labels,
    yticklabels=ada_tick_labels,
)
plt.title('Confusion Matrix - ADABoost (Boosting)')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
# And now, what if we apply feature selection? Can it influence the model performance?

In [None]:
# Study the relationships between the features: draw a pair plot of the
# training features (assumes X_train is a DataFrame - TODO confirm once the
# split cell is filled in). What can you conclude?
sns.pairplot(X_train)

In [None]:
# Pair plot of the filtered data, coloured by the "quality" column. Two
# markers are supplied, one per class (assumes exactly two quality classes
# were kept - TODO confirm); corner=True draws only the lower triangle.
sns.pairplot(filtered_data, hue="quality", markers=["o", "s"], corner=True)

In [None]:
# 1. Filter Method: SelectKBest
# This method selects the best k features based on univariate statistical tests (e.g., chi-squared, mutual information).
# This is a quick way to filter out irrelevant features before training.

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Apply SelectKBest with the chi-squared test: keep the k=4 features with the
# highest score against the target.
# NOTE: chi2 only accepts non-negative feature values.
selector = SelectKBest(chi2, k=4)  # Select top k features
X_train_selected = selector.fit_transform(X_train, y_train)

# Retrieve the selected feature names
# (exercise placeholders: replace each '?' with your own code)
selected_features_mask = '?'
selected_feature_names = '?'

# Display the selected features
print("Selected Features:")
print(selected_feature_names)



# Reuse the selector fitted on the training data so the test set keeps exactly
# the same columns; do not refit it on the test data. By default the
# transformed data is returned as a NumPy array, not a DataFrame.
X_test_selected = selector.transform(X_test)

# Train AdaBoost (decision stumps as weak learners) on the selected features.
# NOTE: the weak learner is passed as `estimator` (scikit-learn >= 1.2); the
# former `base_estimator` keyword was deprecated in 1.2 and removed in 1.4.
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    random_state=42,
)
ada_model.fit(X_train_selected, y_train)

# Make predictions (exercise placeholder)
y_pred_ada = '?'

# Evaluate the model (exercise placeholders)
accuracy_ada = '?'
f1_ada = '?'

print(f"AdaBoost with Feature Selection - Accuracy: {accuracy_ada:.4f}")
print(f"AdaBoost with Feature Selection - F1 Score: {f1_ada:.4f}")



In [None]:
# 2. Wrapper Method: Recursive Feature Elimination (RFE)
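# RFE repeatedly fits the estimator and removes the least important features (ranked by the estimator's feature importances or coefficients) until the requested number of features remains.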

In [None]:
from sklearn.feature_selection import RFE


# Initialize the model and RFE.
# NOTE: the weak learner is passed as `estimator` (scikit-learn >= 1.2); the
# former `base_estimator` keyword was deprecated in 1.2 and removed in 1.4.
# random_state is fixed so that repeated runs select the same features.
model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=1,
    random_state=42,
)
selector = RFE(model, n_features_to_select=5)  # Select the top 5 features

# Fit RFE on the training data only
selector = selector.fit(X_train, y_train)

# Transform the training and test sets with the same fitted selector
X_train_rfe = selector.transform(X_train)
X_test_rfe = selector.transform(X_test)


# Retrieve the selected feature names
selected_features_mask = selector.get_support()  # Boolean mask of selected features
selected_feature_names = X.columns[selected_features_mask]  # Get the names of the selected features

# Display the selected features
print("Selected Features:")
print(selected_feature_names)



# Train the AdaBoost model
# (exercise placeholder: replace '?' with the training data to fit on)
model.fit('?', y_train)

# Make predictions (exercise placeholder)
y_pred_ada_rfe = '?'

# Evaluate the model (exercise placeholders)
accuracy_ada_rfe = '?'
f1_ada_rfe = '?'

print(f"AdaBoost with RFE - Accuracy: {accuracy_ada_rfe:.4f}")
print(f"AdaBoost with RFE - F1 Score: {f1_ada_rfe:.4f}")



In [None]:
# 3. Embedded Method: Feature Importance from AdaBoost
# Since AdaBoost builds a series of decision trees,
# we can use the feature importances generated by the model to select the most important features.
# This is an embedded method because feature selection happens during the model training.

In [None]:
# Train AdaBoost (decision stumps as weak learners) on all features.
# NOTE: the weak learner is passed as `estimator` (scikit-learn >= 1.2); the
# former `base_estimator` keyword was deprecated in 1.2 and removed in 1.4.
# random_state is fixed so that the importance ranking is reproducible.
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=1,
    random_state=42,
)
ada_model.fit(X_train, y_train)

# Get feature importances
importances = ada_model.feature_importances_

# Sort the features by importance
indices = importances.argsort()[::-1]  # Reverse the order so the most important feature is first

# Select the top k features by column position
top_k = 5
X_train_top_k = X_train.iloc[:, indices[:top_k]]
X_test_top_k = X_test.iloc[:, indices[:top_k]]


# Get feature names
feature_names = X.columns

# Names of the top_k most important features, most important first
top_features = feature_names[indices[:top_k]]

# Display the selected features; the count in the message follows top_k
print(f"Top {top_k} selected features based on AdaBoost feature importance:")
print(top_features)



# Refit the same model on the selected features only, then evaluate it
ada_model.fit(X_train_top_k, y_train)
# Make predictions (exercise placeholder)
y_pred_ada = '?'

# Evaluate the model (exercise placeholders)
accuracy_ada = '?'
f1_ada = '?'

print(f"AdaBoost with Feature Importance - Accuracy: {accuracy_ada:.4f}")
print(f"AdaBoost with Feature Importance - F1 Score: {f1_ada:.4f}")
