# Machine Learning:
1. Data Preprocessing
2. ML learning algorithm: Linear Regression, Polynomial regression, Logistic Regression
3. ML Debugging: Overfitting, Underfitting, Generalization
4. Generalization:
   1. Regularization
   2. Ensemble Learning


## Objectives
1. Understand three classification algorithms:
   1. Decision Tree
   2. K-Nearest Neighbor
   3. Support Vector Machine
2. How to generalize your model
   1. Regularization (Done)
   2. Ensemble Learning (Bagging, Boosting, Stacking, Voting)
3. How to use scikit-learn for machine learning

In [1]:
import os
import pandas as pd

# Read the housing data from the path relative to this notebook and preview
# the first five rows (5 is the default row count of DataFrame.head).
dataset = pd.read_csv(filepath_or_buffer="../Datasets/housing.csv")
dataset.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [2]:
# Display the column labels of the loaded frame.
dataset.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [3]:
# Partition the column labels by dtype: numeric columns are standardised
# later, object (string) columns are label-encoded later.
numerical_cols = dataset.select_dtypes(include=['number']).columns
categorical_cols = dataset.select_dtypes(include=['object']).columns

# One Index per line, exactly as two consecutive print() calls would emit.
print(numerical_cols, categorical_cols, sep="\n")

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking'], dtype='object')
Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')


# Standardization

In [4]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Pre-processing helpers: `scaler` standardises the numeric columns and
# `label_encoder` maps each categorical column to integer codes.
scaler, label_encoder = StandardScaler(), LabelEncoder()

In [5]:
# Standardise every numeric column in place.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/validation/test split performed further down, so statistics of the
# held-out rows leak into the training features. Consider fitting on the
# training split only.
dataset[numerical_cols] = scaler.fit_transform(X=dataset[numerical_cols])

# Encode each categorical column independently; the single encoder instance
# is re-fitted per column, so afterwards it only remembers the last column.
dataset[categorical_cols] = dataset[categorical_cols].apply(
    lambda column: label_encoder.fit_transform(column)
)

dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,4.566365,1.046726,1.403419,1.421812,1.378217,1,0,0,0,1,1.517692,1,0
1,4.004484,1.75701,1.403419,5.405809,2.532024,1,0,0,0,1,2.679409,0,0
2,4.004484,2.218232,0.047278,1.421812,0.22441,1,0,1,0,0,1.517692,1,1
3,3.985755,1.083624,1.403419,1.421812,0.22441,1,0,1,0,1,2.679409,1,0
4,3.554979,1.046726,1.403419,-0.570187,0.22441,1,1,1,0,1,1.517692,0,0


In [6]:
# Column used as the classification target; show how many rows fall in each
# class to check the class balance.
target_var = "prefarea"
dataset.loc[:, target_var].value_counts()

prefarea
0    417
1    128
Name: count, dtype: int64

In [10]:
# Features are every column except the target; the target is the single
# `target_var` column.
X, y = dataset.drop(columns=[target_var]), dataset[target_var]


In [11]:
# Preview the feature matrix (the target column has been dropped).
X.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,furnishingstatus
0,4.566365,1.046726,1.403419,1.421812,1.378217,1,0,0,0,1,1.517692,0
1,4.004484,1.75701,1.403419,5.405809,2.532024,1,0,0,0,1,2.679409,0
2,4.004484,2.218232,0.047278,1.421812,0.22441,1,0,1,0,0,1.517692,1
3,3.985755,1.083624,1.403419,1.421812,0.22441,1,0,1,0,1,2.679409,0
4,3.554979,1.046726,1.403419,-0.570187,0.22441,1,1,1,0,1,1.517692,0


In [12]:
# Preview the target labels.
y.head()

0    1
1    0
2    1
3    1
4    0
Name: prefarea, dtype: int64

# Model Train

In [13]:
from sklearn.model_selection import train_test_split
# Step 1: keep 70% of the rows for training, hold out the remaining 30%.
# NOTE(review): the split is not stratified although the target is
# imbalanced; consider passing stratify=y.
X_train, __x, y_train, __y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 2: cut the held-out rows in half -> validation and test sets.
# NOTE(review): X_val / y_val are not used by the cells visible in this
# notebook; confirm whether a validation step is still planned.
X_val, X_test, y_val, y_test = train_test_split(
    __x,
    __y,
    test_size=0.5,
    random_state=42,
)

# Balance
Under-sampling: reducing the number of instances of the majority class <br>
Over-sampling: increasing the number of instances of the minority class, <br>
- either by generating synthetic samples (data augmentation, e.g. SMOTE),
- or by collecting new data.

Install the imbalanced-learn package (imported as `imblearn`): `pip install imbalanced-learn`

In [15]:
from imblearn.over_sampling import SMOTE

# Over-sample the training split only, so the held-out sets keep their
# original class distribution.
# NOTE(review): the categorical features were turned into integer codes
# earlier; SMOTE presumably interpolates between neighbouring rows, which can
# yield non-integer codes for those columns -- confirm this is acceptable
# (imbalanced-learn also offers SMOTENC for mixed data).
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

In [16]:
# Compare the class counts before and after resampling.
print(f"Original class distribution: {y_train.value_counts()}")
print(f"Resampled class distribution: {pd.Series(y_train_resampled).value_counts()}")

Original class distribution: prefarea
0    291
1     90
Name: count, dtype: int64
Resampled class distribution: prefarea
1    291
0    291
Name: count, dtype: int64


In [17]:
# From here on the models train on the balanced data. This rebinds the
# original names, so the un-resampled training split is no longer reachable
# through X_train / y_train.
X_train, y_train = X_train_resampled, y_train_resampled

1. Understand three classification algorithms:
   1. Decision Tree: A tree like model
   2. K-Nearest Neighbor
   3. Support Vector Machine

# Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with a fixed seed so repeated runs build the same tree.
decision_tree = DecisionTreeClassifier(random_state=42)

# Kept as the last expression so the notebook displays the fitted estimator.
decision_tree.fit(X=X_train, y=y_train)

In [19]:
# Predict on the held-out test set and on the training set itself.
y_pred = decision_tree.predict(X_test)
y_train_pred = decision_tree.predict(X_train)

# Show the first ten predictions of each, one array per line.
print(y_pred[:10], y_train_pred[:10], sep="\n")

[0 0 0 0 0 0 1 0 0 0]
[1 0 0 0 1 0 0 1 1 1]


In [20]:
from sklearn.metrics import accuracy_score
# Fraction of test rows the decision tree classified correctly.
# NOTE(review): y_train_pred is computed in the previous cell but no training
# accuracy is reported here.
print("Test Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred))

Test Accuracy:  0.7195121951219512


# K-nearest Neighbor

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a k-nearest-neighbours classifier with the library defaults and fit
# it on the training split (fit returns the estimator itself).
knn_model = KNeighborsClassifier().fit(X_train, y_train)

# Predict the class of every test row.
y_pred = knn_model.predict(X_test)

# Score the predictions: overall accuracy plus per-class metrics.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Report the results.
print(f"KNN Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

KNN Classifier Accuracy: 0.6341463414634146
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.59      0.72        64
           1       0.35      0.78      0.48        18

    accuracy                           0.63        82
   macro avg       0.63      0.69      0.60        82
weighted avg       0.78      0.63      0.67        82



# Support Vector Machine(SVM)

In [22]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Support vector classifier with probability estimates enabled and a fixed
# seed, fitted on the training split (fit returns the estimator itself).
svm_model = SVC(probability=True, random_state=42).fit(X_train, y_train)

# Predict the class of every test row.
y_pred = svm_model.predict(X_test)

# Score the predictions: overall accuracy plus per-class metrics.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Report the results.
print(f"SVM Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

SVM Classifier Accuracy: 0.7560975609756098
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.80      0.84        64
           1       0.46      0.61      0.52        18

    accuracy                           0.76        82
   macro avg       0.67      0.70      0.68        82
weighted avg       0.79      0.76      0.77        82



# Ensemble Learning

### Bagging

single decision tree: accuracy: 71% <br>

X_train<br>
y_train<br>

multiple decision trees, each trained on its own bootstrap sample (a random draw with replacement) of the training data:<br>
 -  decision_tree_1.fit(X_train_1, y_train_1)<br>
 -  decision_tree_2.fit(X_train_2, y_train_2)<br>
   <br>
- decision_tree_N.fit(X_train_N, y_train_N)<br>
   
-  predict(x):<br>
    -  y_pred_1 = decision_tree_1.predict(x)<br>
    -  y_pred_2 = decision_tree_2.predict(x)<br>
      <br>
      
   - return the majority prediction <br>

In [23]:

from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of ten decision trees with a fixed seed, fitted on the
# training split (fit returns the estimator itself).
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    random_state=42,
).fit(X_train, y_train)

# Predict the class of every test row.
y_pred = bagging_model.predict(X_test)

# Score the predictions: overall accuracy plus per-class metrics.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Report the results.
print(f"Bagging Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Bagging Classifier Accuracy: 0.7926829268292683
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87        64
           1       0.53      0.50      0.51        18

    accuracy                           0.79        82
   macro avg       0.70      0.69      0.69        82
weighted avg       0.79      0.79      0.79        82



# Boosting

Bagging => Each model is trained separately <br>
Boosting => Models are trained sequentially<br>

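Suppose, <br>
decision_1.fit(X_train, y_train)<br>
decision_2.fit(incorrect_predicted_rows)<br>
decision_3.fit(incorrect_predicted_rows)<br>

This is a simplification: AdaBoost (used below) keeps every training row but gives the rows the previous model misclassified a larger weight, so each new model concentrates on them.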

In [24]:

from sklearn.ensemble import AdaBoostClassifier

# AdaBoost ensemble of up to ten decision trees with a fixed seed.
# NOTE(review): the base tree has no depth limit, so it can presumably fit
# the training data almost perfectly and leave boosting little to correct --
# the recorded accuracy equals that of the single decision tree above.
# Consider a shallow base learner such as DecisionTreeClassifier(max_depth=1).
boosting_model = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)

# Train the ensemble on the training split.
boosting_model.fit(X_train, y_train)

# Predict the class of every test row.
y_pred = boosting_model.predict(X_test)

# Score the predictions: overall accuracy plus per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report the results. The label previously read "Bagging Classifier", a
# copy-paste slip that mislabelled this boosting result.
print(f"Boosting Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Bagging Classifier Accuracy: 0.7195121951219512
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.75      0.81        64
           1       0.41      0.61      0.49        18

    accuracy                           0.72        82
   macro avg       0.64      0.68      0.65        82
weighted avg       0.77      0.72      0.74        82





# Stacking
Stacking is an ensemble learning technique that combines predictions from multiple base models (of different types) to improve performance.
The base models are trained on the same dataset, and their predictions are used as input features for a meta-model (a second-level model).
The meta-model learns to make the final prediction by identifying patterns in the base models' predictions.
Stacking can leverage the strengths of diverse models, improving both accuracy and generalization.

In [27]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

# Base learners from three different model families. The decision tree is
# seeded like the stand-alone tree trained earlier so that re-running the
# notebook reproduces the same ensemble.
estimators = [
    ('decision_tree', DecisionTreeClassifier(random_state=42)),
    ('knn', KNeighborsClassifier()),
    ('svm', SVC(probability=True, random_state=42))
]

# Stack the base learners under an SVC meta-model that makes the final call.
stacking_model = StackingClassifier(estimators=estimators, final_estimator=SVC())

# Train the whole stack on the training split.
stacking_model.fit(X_train, y_train)

# Predict the class of every test row.
y_pred = stacking_model.predict(X_test)

# Score the predictions: overall accuracy plus per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report the results. The label previously read "Bagging Classifier", a
# copy-paste slip that mislabelled this stacking result.
print(f"Stacking Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Bagging Classifier Accuracy: 0.7926829268292683
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.81      0.86        64
           1       0.52      0.72      0.60        18

    accuracy                           0.79        82
   macro avg       0.72      0.77      0.73        82
weighted avg       0.83      0.79      0.80        82



# Voting

Voting is an ensemble method where multiple models make predictions, and the final output is determined by aggregating their predictions.
In hard voting, the final prediction is the class label chosen by the majority of the models; the regression counterpart of voting simply averages the models' numeric predictions.
In soft voting, models' predicted probabilities are averaged, and the class with the highest average probability is chosen.
Voting helps improve stability and accuracy by combining different models' strengths, and works well when individual models are diverse and complementary.

In [30]:

from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over three different model families. The tree and the
# SVC are seeded (matching the stand-alone models trained earlier) so that
# re-running the notebook reproduces the same result; they were previously
# left unseeded.
voting_model = VotingClassifier(
    estimators=[
        ('decision_tree', DecisionTreeClassifier(random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('svm', SVC(probability=True, random_state=42)),
    ],
    voting='soft'  # Use 'hard' for majority voting
)

# Train every member of the ensemble on the training split.
voting_model.fit(X_train, y_train)

# Predict the class of every test row.
y_pred = voting_model.predict(X_test)

# Score the predictions: overall accuracy plus per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report the results.
print(f"Voting Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Voting Classifier Accuracy: 0.7926829268292683
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.80      0.86        64
           1       0.52      0.78      0.62        18

    accuracy                           0.79        82
   macro avg       0.72      0.79      0.74        82
weighted avg       0.84      0.79      0.81        82

