<a href="https://colab.research.google.com/github/sanadv/MLCourse/blob/main/Lesson_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#5.1 VoterClassifer

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Load the Wine dataset
wine = load_wine()
X, y = wine.data, wine.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the classifiers
log_clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = make_pipeline(StandardScaler(), SVC(gamma='scale', probability=True, random_state=42))

# Initialize the voting classifier with soft voting
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft'
)

# Train the voting classifier and the individual classifiers
voting_clf.fit(X_train, y_train)
log_clf.fit(X_train, y_train)
rnd_clf.fit(X_train, y_train)
svm_clf.fit(X_train, y_train)

# Evaluate each classifier's accuracy
classifiers = [log_clf, rnd_clf, svm_clf, voting_clf]
for clf in classifiers:
    y_pred = clf.predict(X_test)
    clf_name = clf.__class__.__name__ if clf.__class__.__name__ != 'Pipeline' else clf.steps[-1][1].__class__.__name__
    print(f"{clf_name}: {accuracy_score(y_test, y_pred):.2f}")


LogisticRegression: 0.98
RandomForestClassifier: 1.00
SVC: 0.98
VotingClassifier: 1.00


In [None]:
#5.2 VoterClassifer

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Load the Breast Cancer dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the classifiers
knn_clf = make_pipeline(StandardScaler(), KNeighborsClassifier())
dt_clf = DecisionTreeClassifier(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)

# Initialize the voting classifier with soft voting
voting_clf = VotingClassifier(
    estimators=[('knn', knn_clf), ('dt', dt_clf), ('gb', gb_clf)],
    voting='soft'
)

# Train the voting classifier and the individual classifiers
for clf in (knn_clf, dt_clf, gb_clf, voting_clf):
    clf.fit(X_train, y_train)

# Evaluate each classifier's accuracy
#Voting Classifier will predict all the test data using th voting of all of the models, that's why it;s scoring .96 her and 1 in the prev
classifiers = [knn_clf, dt_clf, gb_clf, voting_clf]
for clf in classifiers:
    y_pred = clf.predict(X_test)
    clf_name = clf.__class__.__name__ if clf.__class__.__name__ != 'Pipeline' else clf.steps[-1][1].__class__.__name__
    print(f"{clf_name}: {accuracy_score(y_test, y_pred):.2f}")


KNeighborsClassifier: 0.96
DecisionTreeClassifier: 0.94
GradientBoostingClassifier: 0.96
VotingClassifier: 0.96


In [None]:
#5.3 Bagging
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the Breast Cancer dataset
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the base classifier
base_dt_clf = DecisionTreeClassifier(random_state=42)

# Initialize the BaggingClassifier with Decision Tree Classifier as the base estimator
#Change bootstrap to true
bagging_clf = BaggingClassifier(base_estimator=base_dt_clf,bootstrap=False, n_estimators=100, random_state=42)

# Train the BaggingClassifier
bagging_clf.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = bagging_clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")




Accuracy: 0.94


In [None]:
#5.4 Random Forests
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Breast Cancer dataset
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the RandomForestClassifier
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the RandomForestClassifier
random_forest_clf.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = random_forest_clf.predict(X_test)
print(f"RandomForestClassifier Accuracy: {accuracy_score(y_test, y_pred):.2f}")


RandomForestClassifier Accuracy: 0.97


In [2]:
#5.5 adaboost
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Load the Breast Cancer dataset
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
         DecisionTreeClassifier(max_depth=2), n_estimators=200,
         algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = ada_clf.predict(X_test)
print(f"AdaBoostClassifier Accuracy: {accuracy_score(y_test, y_pred):.2f}")

AdaBoostClassifier Accuracy: 0.97


In [5]:
#5.5 Gradient Boosting
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Load the Breast Cancer dataset
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)
# residual of the actual value minus the prediction
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)
# residual of the y2 value minus the prediction
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)
y_pred = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred_binary = np.where(y_pred > 0.5, 1, 0)

# Now calculate the accuracy
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Gradient Boosting Accuracy: {accuracy:.2f}")

Gradient Boosting Accuracy: 0.95


Gradient Boosting is an ensemble technique that builds models sequentially, each new model correcting errors made by previously trained models. Instead of using pre-built functions, this code manually implements gradient boosting using DecisionTreeRegressors.

First Model (tree_reg1): A decision tree regressor is trained on the original dataset. This model aims to predict the target variable directly.
Second Model (tree_reg2): The first model's residuals (differences between the actual and predicted values) are calculated. A second decision tree regressor is trained on these residuals. Its goal is to correct the errors of the first model.
Third Model (tree_reg3): Similarly, the second model's residuals are computed, and a third decision tree regressor is trained to correct the errors of the combined first and second models.
Each decision tree has a max_depth of 2, which controls the complexity of the model. A shallow depth helps prevent overfitting.

Prediction and Classification
Prediction: The predictions from all three models are summed up to get the final prediction. This cumulative prediction approach leverages the strength of each model, focusing on correcting the predecessor's mistakes.
Conversion to Binary Labels: Since the original problem is a classification task, but the models predict continuous values, a threshold of 0.5 is used to convert these values into binary labels (0 or 1).



The line y2 = y - tree_reg1.predict(X) plays a critical role in the gradient boosting algorithm's iterative approach to model improvement. Let's break down what this line means and why it's essential:

Gradient Boosting Concept
Gradient Boosting constructs a model in a stage-wise fashion. It begins with a base model and sequentially adds new models that correct the previous models' errors. The core idea is to improve the prediction iteratively by focusing specifically on the parts where the current ensemble of models performs poorly.

The Specific Line
tree_reg1.predict(X): This part of the line uses the first decision tree regressor (tree_reg1) to make predictions on the entire dataset (X). The output of this prediction is a set of continuous values corresponding to the initial model's estimation of the target variable y.
y - tree_reg1.predict(X): The actual values of the target variable (y) are then subtracted from these predicted values. This operation produces the residuals or errors of the first model. These residuals represent the amount by which the model's predictions deviate from the actual values.
Purpose of Calculating y2
Focus on Mistakes: By calculating y2, the algorithm identifies where the first model, tree_reg1, made its mistakes. Positive values in y2 indicate that the model's predictions were too low, while negative values indicate predictions that were too high.
Training the Next Model: y2 becomes the target variable for the second decision tree regressor (tree_reg2). Essentially, tree_reg2 is trained not to predict the original target directly but to predict how much tree_reg1's predictions need to be corrected. This way, tree_reg2 focuses on the errors made by tree_reg1, aiming to reduce these errors in the overall ensemble prediction.

In [8]:
# 5.6 StackingClassifier
from sklearn.datasets import load_iris
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Load the Iris dataset
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define base-level models
base_models = [
    ('svc', SVC(probability=True, random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('knn', KNeighborsClassifier())
]

# Define the meta-model
meta_model = LogisticRegression(random_state=42)

# Create the stacking classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = stacking_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Stacking Model Accuracy: {accuracy:.4f}')


Stacking Model Accuracy: 0.9825
