In [1]:
"""Class 12. Ensemble Learning
"""

import os
import pandas as pd

In [2]:
""" Machine Learning Review

Data
--------------------------------------
- Data Cleaning
- Exploratory Data Analysis
- Feature Engineering
- Data Preprocessing and distribution

Model
--------------------------------------
- Design the machine learning model: 
  Linear regression, polynomial regression, logistic regression
- Select and appropriate cost function
  Objective: Match the predicted distribution of the data with true distribution
  Maximum likelihood estimation > KL Divergence > Cost Function / Loss Function
- Configure Hyperparameters: learning rate, epoch, batch size, regularizer term
  w, b -> parameters
  
Training
-------------------------------------------
- Training the model for certain epochs
- Gradient Descent
- parameter initialization
- Compute Gradient
- Update parameters.


Generalization: Model performs good in general
1. Regularization
2. Ensemble Learning: A hybrid machine learning algorithm combining two or 
   more algorithms
   Example: A classification model that classifies cats and dogs.
   MLAlgo1 => 89%
   MLAlgo2 => 78%
   MLAlgo1 + MLAlgo2 => 84%
"""

print("Machine Learning Overview")

Machine Learning Overview


In [3]:
"""
Objectives:
1. Understand three classification algorithm
   1.1. Decision Tree
   1.2. K-Nearest Neighbour
   1.3. Support Vector Machine 
2. How to generalize your model
   2.1. Regularization (Done)
   2.2. Ensemble Learning (Bagging, Boosting, Stacking, Voting)
3. How to Scikit Learn for Machine Learning
"""

ROOT_DIR = "E:\\PyCharmProjects\\pythonProject"
DATA_DIR = os.path.join(ROOT_DIR, "data")
DATASET_PATH = os.path.join(DATA_DIR, "housing.csv")

dataset = pd.read_csv(DATASET_PATH)
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
dataset.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [5]:
numerical_cols = dataset.select_dtypes(include='number').columns
categorical_cols = dataset.select_dtypes(include='object').columns

print(numerical_cols)
print(categorical_cols)

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking'], dtype='object')
Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')


### Data Preprocessing

In [6]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

scaler = StandardScaler()
label_encoder = LabelEncoder()

In [7]:
dataset[numerical_cols] = scaler.fit_transform(dataset[numerical_cols])
dataset[categorical_cols] = dataset[categorical_cols].apply(
    label_encoder.fit_transform
)
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,4.566365,1.046726,1.403419,1.421812,1.378217,1,0,0,0,1,1.517692,1,0
1,4.004484,1.75701,1.403419,5.405809,2.532024,1,0,0,0,1,2.679409,0,0
2,4.004484,2.218232,0.047278,1.421812,0.22441,1,0,1,0,0,1.517692,1,1
3,3.985755,1.083624,1.403419,1.421812,0.22441,1,0,1,0,1,2.679409,1,0
4,3.554979,1.046726,1.403419,-0.570187,0.22441,1,1,1,0,1,1.517692,0,0


In [8]:
target_var = 'prefarea'
dataset[target_var].value_counts()

prefarea
0    417
1    128
Name: count, dtype: int64

In [9]:
X = dataset.drop(columns=target_var)
y = dataset[target_var]

X.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,furnishingstatus
0,4.566365,1.046726,1.403419,1.421812,1.378217,1,0,0,0,1,1.517692,0
1,4.004484,1.75701,1.403419,5.405809,2.532024,1,0,0,0,1,2.679409,0
2,4.004484,2.218232,0.047278,1.421812,0.22441,1,0,1,0,0,1.517692,1
3,3.985755,1.083624,1.403419,1.421812,0.22441,1,0,1,0,1,2.679409,0
4,3.554979,1.046726,1.403419,-0.570187,0.22441,1,1,1,0,1,1.517692,0


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)

In [11]:
""" Sampling: Oversampling, Undersampling
"""

# from imblearn.over_sampling import SMOTE
# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)

print("Resampled class distribution:", pd.Series(y_train).value_counts())

Resampled class distribution: prefarea
0    334
1    102
Name: count, dtype: int64


### Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

In [13]:
y_pred = decision_tree.predict(X_test)
print(y_pred[:10])

[0 0 0 0 0 1 0 0 0 0]


In [14]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7614678899082569


## KNN

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

""" Initialize model """
knn_model = KNeighborsClassifier()

""" Train the model """
knn_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = knn_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"KNN Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

KNN Classifier Accuracy: 0.7522935779816514
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.92      0.85        83
           1       0.46      0.23      0.31        26

    accuracy                           0.75       109
   macro avg       0.63      0.57      0.58       109
weighted avg       0.71      0.75      0.72       109



In [17]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

""" Initialize model """
svm_model = SVC(probability=True, random_state=42)

""" Train the model """
svm_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = svm_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"SVM Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)


SVM Classifier Accuracy: 0.7431192660550459
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.93      0.85        83
           1       0.40      0.15      0.22        26

    accuracy                           0.74       109
   macro avg       0.59      0.54      0.53       109
weighted avg       0.69      0.74      0.70       109



## Ensemble Learning

In [18]:
""" Boosting
Boosting => Models are trained sequentially

Suppose, 
decision_1.fit(X_train, y_train)
decision_2.fit(incorrect_predicted_rows)
decision_3.fit(incorrect_predicted_rows)
"""


from sklearn.ensemble import AdaBoostClassifier

""" Initialize model """
boosting_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(), 
    n_estimators=3, 
    random_state=42
)

""" Train the model """
boosting_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = boosting_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"Boosting Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Boosting Classifier Accuracy: 0.7522935779816514
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.83      0.84        83
           1       0.48      0.50      0.49        26

    accuracy                           0.75       109
   macro avg       0.66      0.67      0.66       109
weighted avg       0.76      0.75      0.75       109



In [19]:
""" Bagging 
Models are learned idividually
single decision tree: accuracy: 71%

X_train
y_train

multiple decision tree:
   decision_tree_1.fit(X_train_1, y_train_1)
   decision_tree_2.fit(X_train_2, y_train_2)
   ...
   decision_tree_N.fit(X_train_N, y_train_N)
   
   predict(x):
      y_pred_1 = decision_tree_1.predict(x)
      y_pred_2 = decision_tree_2.predict(x)
      ...
      
      return the majority prediction
"""


from sklearn.ensemble import BaggingClassifier

""" Initialize model """
bagging_model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)

""" Train the model """
bagging_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = bagging_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"Bagging Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Bagging Classifier Accuracy: 0.7798165137614679
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.92      0.86        83
           1       0.56      0.35      0.43        26

    accuracy                           0.78       109
   macro avg       0.69      0.63      0.65       109
weighted avg       0.76      0.78      0.76       109



In [20]:
"""
Stacking is an ensemble learning technique that combines predictions from multiple base models (of different types) to improve performance.
"""
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

estimators = [
    ('decision_tree', DecisionTreeClassifier()), 
    ('knn', KNeighborsClassifier()), 
    ('svm', SVC(probability=True, random_state=42))
]

""" Initialize model """
stacking_model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())

""" Train the model """
stacking_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = stacking_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"Bagging Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Bagging Classifier Accuracy: 0.7064220183486238
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.84      0.81        83
           1       0.35      0.27      0.30        26

    accuracy                           0.71       109
   macro avg       0.57      0.56      0.56       109
weighted avg       0.68      0.71      0.69       109



In [21]:
""" Voting """

from sklearn.ensemble import VotingClassifier

""" Initialize model """

voting_model = VotingClassifier(
    estimators=[
        ('decision_tree', DecisionTreeClassifier()), 
        ('knn', KNeighborsClassifier()), 
        ('svm', SVC(probability=True))],
    voting='soft'  # Use 'hard' for majority voting
)

""" Train the model """
voting_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = voting_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"Voting Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Voting Classifier Accuracy: 0.7889908256880734
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.92      0.87        83
           1       0.59      0.38      0.47        26

    accuracy                           0.79       109
   macro avg       0.71      0.65      0.67       109
weighted avg       0.77      0.79      0.77       109

