# <h1 style="text-align:center; color:blue;">Classification algorithms part 2</h1>

**We use the same data as part 1**

In [1]:
import numpy as np
import numpy.typing as npt
import pandas as pd
from sklearn.datasets import make_classification

# 1. Generate synthetic classification data
# Binary problem: 1000 samples, 10 features (6 informative, 2 redundant), fixed seed.
X: npt.NDArray[np.float64]
y: npt.NDArray[np.int64]
X, y = make_classification(
    n_samples=1000, n_features=10,
    n_informative=6, n_redundant=2,
    n_classes=2, random_state=42,
)

# Wrap the arrays in a DataFrame (features first, target as the last column)
feature_names: list[str] = [f"feature_{i}" for i in range(X.shape[1])]
df: pd.DataFrame = pd.DataFrame(X, columns=feature_names).assign(target=y)

# 2. Blank out roughly 5% of the feature cells for demonstration
# The boolean mask has one entry per feature cell; True marks a cell that becomes NaN.
rng: np.random.Generator = np.random.default_rng(42)
nan_mask: npt.NDArray[np.bool_] = rng.choice(
    [True, False],
    size=(len(df), len(feature_names)),
    p=[0.05, 0.95],
)
df[feature_names] = df[feature_names].where(~nan_mask)

# 3. Discard every row that contains at least one NaN
df = df.dropna()

# 4. Outlier removal (using IQR for numerical columns)
def remove_outliers_iqr(data: pd.DataFrame,
                        column: str,
                        factor: float = 1.5) -> pd.DataFrame:
    """
    Return the rows of ``data`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``. Values that sit
    exactly on a fence are kept, so a column with zero IQR (for example a constant
    column) keeps the rows equal to its quartile value instead of losing every row.

    Parameters:
        data: Frame to filter. It is not modified.
        column: Name of the numeric column used to detect outliers.
        factor: Multiplier applied to the IQR to position the fences (default 1.5).

    Returns:
        A new DataFrame containing only the non-outlier rows, with the original
        index labels preserved. Rows where ``column`` is NaN are dropped.
    """
    values: pd.Series = data[column]
    q1: float = values.quantile(0.25)
    q3: float = values.quantile(0.75)
    iqr: float = q3 - q1
    lower_fence: float = q1 - factor * iqr
    upper_fence: float = q3 + factor * iqr
    # ``between`` is inclusive on both ends by default; NaN compares as False.
    within_fences: pd.Series = values.between(lower_fence, upper_fence)
    # Explicit copy so callers can assign columns on the result without
    # triggering pandas' chained-assignment warning.
    return data[within_fences].copy()

outliers_cols: list[str] = list(feature_names)

# Filter one feature at a time: each pass receives the rows that survived the
# previous passes, so the quartiles are recomputed on the already-reduced frame.
for col in outliers_cols:
    df = remove_outliers_iqr(data=df, column=col)

# 5. Encoding and Scaling

# Only scaling is needed: sklearn's make_classification already returns the target as integer class labels, so no encoding is required

from sklearn.preprocessing import StandardScaler


# Every feature column is numeric, so the full feature list is scaled
numeric_cols: list[str] = list(feature_names)

# A single scaler object, re-fitted for each column in the loop below
scaler: StandardScaler = StandardScaler()

# Standardise the columns one by one
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split performed later, so test-set statistics influence the scaling.
# Presumably kept this way to match Part 1 — confirm it is intentional.
for col in numeric_cols:
    df[[col]] = scaler.fit_transform(df[[col]])

# 6. Remove weakly correlated features
# See Part 1 for details

weak_corr_cols: list[str] = ['feature_0', 'feature_2', 'feature_3', 'feature_6', 'feature_7']

df = df.drop(columns=weak_corr_cols)

# 7. Splitting

from sklearn.model_selection import train_test_split

# Separate the feature matrix from the target column.
# X and y are rebound here: they previously held the raw NumPy arrays
# returned by make_classification.

X: pd.DataFrame = df.drop(columns='target')
y: pd.Series = df['target']

# The label splits are Series (y is a single column), not DataFrames.
X_train: pd.DataFrame
X_test: pd.DataFrame
y_train: pd.Series
y_test: pd.Series

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (447, 5), Test shape: (112, 5)


In [2]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator
from typing import Tuple

def evaluate_model(model: BaseEstimator,
                   X_train: pd.DataFrame,
                   y_train: pd.Series,
                   X_test: pd.DataFrame,
                   y_test: pd.Series) -> Tuple[str, npt.NDArray[np.int64]]:
    """
    Fit ``model`` on the training split and score its predictions on the test split.

    The estimator passed in is fitted in place; no copy is made.

    Parameters:
        model: The classification model to train.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        A ``(report, cm)`` tuple where ``report`` is the text produced by
        ``classification_report`` and ``cm`` is the array produced by
        ``confusion_matrix``, both computed from ``y_test`` and the predictions.
    """
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)
    return (
        classification_report(y_test, predicted_labels),
        confusion_matrix(y_test, predicted_labels),
    )


### 4- Support Vector Machines (SVM)

In [3]:
from sklearn.svm import LinearSVC

# Linear SVM. The seed pins the solver's pseudo-random number generation so the
# metrics are reproducible across runs.
# NOTE: the output recorded below came from an unseeded run and may differ
# slightly after re-execution.
linear_svc: BaseEstimator = LinearSVC(random_state=42)

# Fit on the training split and score on the held-out split
report, cm = evaluate_model(model=linear_svc,
                            X_train=X_train,
                            X_test=X_test,
                            y_train=y_train,
                            y_test=y_test
                            )

print("\nClassification report\n", report)
print("\nConfusion matrix\n")
cm


Classification report
               precision    recall  f1-score   support

           0       0.79      0.81      0.80        57
           1       0.80      0.78      0.79        55

    accuracy                           0.79       112
   macro avg       0.79      0.79      0.79       112
weighted avg       0.79      0.79      0.79       112


Confusion matrix



array([[46, 11],
       [12, 43]])

In [4]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default hyper-parameters
svc: BaseEstimator = SVC()

# Fit on the training split and score on the held-out split
report, cm = evaluate_model(
    model=svc,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)

print("\nClassification report\n", report)
print("\nConfusion matrix\n")
cm


Classification report
               precision    recall  f1-score   support

           0       0.86      0.88      0.87        57
           1       0.87      0.85      0.86        55

    accuracy                           0.87       112
   macro avg       0.87      0.87      0.87       112
weighted avg       0.87      0.87      0.87       112


Confusion matrix



array([[50,  7],
       [ 8, 47]])

### 5- Decision Tree Classifiers

In [4]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree. The seed fixes the tree's random choices during split search
# so re-running the notebook yields the same metrics.
# NOTE: the output recorded below came from an unseeded run and may differ
# slightly after re-execution.
DTC: BaseEstimator = DecisionTreeClassifier(random_state=42)

# Fit on the training split and score on the held-out split
report, cm = evaluate_model(model=DTC,
                            X_train=X_train,
                            X_test=X_test,
                            y_train=y_train,
                            y_test=y_test
                            )

print("\nClassification report\n", report)
print("\nConfusion matrix\n")
cm


Classification report
               precision    recall  f1-score   support

           0       0.77      0.88      0.82        57
           1       0.85      0.73      0.78        55

    accuracy                           0.80       112
   macro avg       0.81      0.80      0.80       112
weighted avg       0.81      0.80      0.80       112


Confusion matrix



array([[50,  7],
       [15, 40]])

### 6- Random Forest Classifiers

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Random forest. Bootstrapping and feature sampling are random, so the seed is
# required for the metrics to be reproducible across runs.
# NOTE: the output recorded below came from an unseeded run and may differ
# slightly after re-execution.
RF: BaseEstimator = RandomForestClassifier(random_state=42)

# Fit on the training split and score on the held-out split
report, cm = evaluate_model(model=RF,
                            X_train=X_train,
                            X_test=X_test,
                            y_train=y_train,
                            y_test=y_test
                            )

print("\nClassification report\n", report)
print("\nConfusion matrix\n")
cm


Classification report
               precision    recall  f1-score   support

           0       0.83      0.93      0.88        57
           1       0.92      0.80      0.85        55

    accuracy                           0.87       112
   macro avg       0.87      0.86      0.87       112
weighted avg       0.87      0.87      0.87       112


Confusion matrix



array([[53,  4],
       [11, 44]])

### 7- Naive Bayes Classifiers

In [6]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with scikit-learn's default hyper-parameters
GNB: BaseEstimator = GaussianNB()

# Fit on the training split and score on the held-out split
report, cm = evaluate_model(
    model=GNB,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)

print("\nClassification report\n", report)
print("\nConfusion matrix\n")
cm


Classification report
               precision    recall  f1-score   support

           0       0.85      0.81      0.83        57
           1       0.81      0.85      0.83        55

    accuracy                           0.83       112
   macro avg       0.83      0.83      0.83       112
weighted avg       0.83      0.83      0.83       112


Confusion matrix



array([[46, 11],
       [ 8, 47]])

In [7]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes with scikit-learn's default hyper-parameters
# NOTE(review): the features are standardised continuous values; with the
# default settings BernoulliNB presumably thresholds them at 0 before fitting —
# confirm that binarising the inputs is the intended behaviour here.
BNB: BaseEstimator = BernoulliNB()

# Fit on the training split and score on the held-out split
report, cm = evaluate_model(
    model=BNB,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)

print("\nClassification report\n", report)
print("\nConfusion matrix\n")
cm


Classification report
               precision    recall  f1-score   support

           0       0.85      0.82      0.84        57
           1       0.82      0.85      0.84        55

    accuracy                           0.84       112
   macro avg       0.84      0.84      0.84       112
weighted avg       0.84      0.84      0.84       112


Confusion matrix



array([[47, 10],
       [ 8, 47]])

### Boosting

### 8- AdaBoost Classifier

In [8]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost. The seed is forwarded to the boosted base estimators so the
# metrics are reproducible across runs.
# NOTE: the output recorded below came from an unseeded run and may differ
# slightly after re-execution.
ABC: BaseEstimator = AdaBoostClassifier(random_state=42)

# Fit on the training split and score on the held-out split
report, cm = evaluate_model(model=ABC,
                            X_train=X_train,
                            X_test=X_test,
                            y_train=y_train,
                            y_test=y_test
                            )

print("\nClassification report\n", report)
print("\nConfusion matrix\n")
cm


Classification report
               precision    recall  f1-score   support

           0       0.83      0.84      0.83        57
           1       0.83      0.82      0.83        55

    accuracy                           0.83       112
   macro avg       0.83      0.83      0.83       112
weighted avg       0.83      0.83      0.83       112


Confusion matrix



array([[48,  9],
       [10, 45]])

### 9- Gradient Boosting Classifier

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting. The seed fixes the random choices made while growing the
# individual trees so the metrics are reproducible across runs.
# NOTE: the output recorded below came from an unseeded run and may differ
# slightly after re-execution.
GBC: BaseEstimator = GradientBoostingClassifier(random_state=42)

# Fit on the training split and score on the held-out split
report, cm = evaluate_model(model=GBC,
                            X_train=X_train,
                            X_test=X_test,
                            y_train=y_train,
                            y_test=y_test
                            )

print("\nClassification report\n", report)
print("\nConfusion matrix\n")
cm


Classification report
               precision    recall  f1-score   support

           0       0.80      0.91      0.85        57
           1       0.89      0.76      0.82        55

    accuracy                           0.84       112
   macro avg       0.85      0.84      0.84       112
weighted avg       0.85      0.84      0.84       112


Confusion matrix



array([[52,  5],
       [13, 42]])

### 10- XGBoost

In [10]:
from xgboost import XGBClassifier

# XGBoost classifier constructed with its default hyper-parameters
XGBC: BaseEstimator = XGBClassifier()

# Fit on the training split and score on the held-out split
report, cm = evaluate_model(
    model=XGBC,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)

print("\nClassification report\n", report)
print("\nConfusion matrix\n")
cm


Classification report
               precision    recall  f1-score   support

           0       0.83      0.93      0.88        57
           1       0.92      0.80      0.85        55

    accuracy                           0.87       112
   macro avg       0.87      0.86      0.87       112
weighted avg       0.87      0.87      0.87       112


Confusion matrix



array([[53,  4],
       [11, 44]])