<a href="https://colab.research.google.com/github/Mulat-K/Machine-Learning-Mastery-with-Python/blob/main/IPWE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***Improve Performance with Ensembles***

# ***Bagging Algorithms***

***Bagged Decision Trees***

In [12]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the dataset; column names are supplied here rather than read from the file.
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
feature_names = names[:-1]

# Coerce every column to numeric: anything unparseable (for example a stray
# header line read as data) becomes NaN.
dataframe = (
    pd.read_csv(filename, names=names)
    .apply(pd.to_numeric, errors='coerce')
    # A row without a usable label cannot be used for supervised learning.
    # Drop it instead of mean-filling 'class', which would be truncated to 0
    # by the int cast below and silently add a fabricated sample.
    .dropna(subset=['class'])
)

# Mean-impute remaining gaps in the feature columns only (the label is never imputed).
dataframe = dataframe.fillna(dataframe[feature_names].mean())

# Split into the eight input columns (X) and the integer class label (Y).
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8].astype('int')

# Shuffled 10-fold cross-validation with a fixed seed so the folds are repeatable.
seed = 7
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

# Bagging ensemble of 100 decision trees.
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(estimator=cart, n_estimators=num_trees, random_state=seed)

# Mean accuracy across the folds.
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Accuracy: {results.mean():.3f}")


Accuracy: 0.762


***Random Forest***

In [13]:
import numpy as np
import pandas as pd  # needed for pd.to_numeric below; previously only available via an earlier cell
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
filename = '/content/sample_data/pima-indians-diabetes.data.csv'

# Column names are supplied here rather than read from the file
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)

# Coerce all columns to numeric; unparseable entries become NaN
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')

# Rows whose label could not be parsed are discarded. Filling 'class' with the
# column mean would be truncated to 0 by the int cast further down, which
# would inject a fabricated sample into training and scoring.
dataframe = dataframe.dropna(subset=['class'])

# Fill gaps in the feature columns (and only those) with the column means
feature_means = dataframe[names[:-1]].mean()
dataframe = dataframe.fillna(feature_means)

# Split the data into input (X) and output (Y)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

# Class labels as integers for classification
Y = Y.astype('int')

# Model parameters
num_trees = 100
max_features = 3
seed = 7

# Shuffled 10-fold cross-validation with a fixed seed
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

# Random forest with the parameters above
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)

# Evaluate with cross-validation and report the mean accuracy
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Accuracy: {results.mean():.3f}")


Accuracy: 0.776


***Extra Trees***

In [17]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
import pandas as pd

# Model and evaluation settings
num_trees = 100
max_features = 7
seed = 7

# Dataset location and column layout (names are supplied, not read from the file)
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
label_name = names[-1]
input_names = names[:-1]

# Read the file and force every column to numeric; bad entries turn into NaN
dataframe = read_csv(filename, names=names).apply(pd.to_numeric, errors='coerce')

# Never impute the target: a mean-filled label is cast to 0 by astype('int')
# below and would become an invented sample. Drop such rows instead.
dataframe = dataframe.dropna(subset=[label_name])

# Mean-impute missing values in the input columns only
dataframe = dataframe.fillna(dataframe[input_names].mean())

# Inputs are the first eight columns, the label is the ninth
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8].astype('int')

# Shuffled, seeded 10-fold cross-validation
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

# Extra-trees ensemble
model = ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)

# Cross-validated accuracy, averaged over folds
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Accuracy: {results.mean():.3f}")


Accuracy: 0.756


# ***Boosting Algorithms***

***AdaBoost***

In [21]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd

# Load the dataset with explicit column names
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)

# Make every column numeric; values that cannot be parsed become NaN
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')

# Keep only rows that have a parsed class label. Imputing the label with the
# column mean would yield a fractional value that the int cast below turns
# into class 0, i.e. a made-up observation.
has_label = dataframe['class'].notna()
dataframe = dataframe.loc[has_label]

# Replace missing feature values with the mean of their column
feature_columns = names[:-1]
dataframe = dataframe.fillna(dataframe[feature_columns].mean())

# Split the data into input (X) and output (Y)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8].astype('int')  # integer class labels

# Define model parameters
num_trees = 30
seed = 7

# 10-fold cross-validation, shuffled with a fixed seed
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# AdaBoost with 30 estimators
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)

# Evaluate and print the mean cross-validated accuracy
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Accuracy: {results.mean():.3f}")


Accuracy: 0.762


***Stochastic Gradient Boosting***

In [22]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd

# Settings
seed = 7
num_trees = 100

# Load the dataset with explicit column names
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
predictors = names[:8]
target = names[8]

dataframe = read_csv(filename, names=names)

# Numeric coercion: entries that are not numbers are replaced by NaN
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')

# The target must not be imputed. A mean-filled label is truncated to 0 by the
# integer cast below, so rows lacking a label are removed up front.
dataframe = dataframe.dropna(subset=[target])

# Predictor gaps are filled with the per-column mean
dataframe = dataframe.fillna(dataframe[predictors].mean())

# First eight columns are the inputs, the last one is the class label
array = dataframe.values
X, Y = array[:, 0:8], array[:, 8].astype('int')

# Set up cross-validation with shuffling enabled
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Gradient boosting with 100 estimators
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)

# Cross-validate and report the mean accuracy
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Accuracy: {results.mean():.3f}")


Accuracy: 0.767


# ***Voting Ensemble***

In [23]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
import pandas as pd

# One seed shared by the fold split and the seeded sub-models
seed = 7

# Load the dataset with explicit column names
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)

# Convert everything to numeric; non-numeric entries become NaN
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')

# Discard rows with no parsed class label instead of mean-filling the label:
# the fill value would be cast to 0 below and counted as a real sample.
dataframe = dataframe.dropna(subset=['class'])

# Fill missing feature values (not the label) with the column means
feature_names = names[:-1]
dataframe = dataframe.fillna(dataframe[feature_names].mean())

# Split the data into input (X) and output (Y)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8].astype('int')  # integer class labels

# Shuffled 10-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Sub-models that will vote
model1 = LogisticRegression(max_iter=200)
model2 = DecisionTreeClassifier(random_state=seed)
model3 = SVC(random_state=seed)
estimators = [
    ('logistic', model1),
    ('cart', model2),
    ('svm', model3),
]

# Combine the sub-models into a voting ensemble
ensemble = VotingClassifier(estimators)

# Evaluate the ensemble with cross-validation and print the mean accuracy
results = cross_val_score(ensemble, X, Y, cv=kfold)
print(f"Voting Ensemble Accuracy: {results.mean():.3f}")


Voting Ensemble Accuracy: 0.780
