# Imports and variable setup

In [1]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import plotly.express as px
import pandas as pd
import numpy as np
import os

import datetime 
import logging

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from utils import filter_by_rule
from config import *

In [3]:

# Root-logger configuration for the notebook: show DEBUG and above, and prefix
# each record with a day-month-year timestamp and the level name.
_LOGGING_OPTIONS = {
    "level": logging.DEBUG,
    "format": "%(asctime)s [%(levelname)s] %(message)s",
    "datefmt": "%d-%b-%y %H:%M:%S",
}
logging.basicConfig(**_LOGGING_OPTIONS)

In [4]:
# Folder (relative to the notebook's working directory) that holds the input datasets.
DATASET_FOLDER = "datasets"

# isdir (rather than exists) also rejects a plain file that happens to have this name.
assert os.path.isdir(DATASET_FOLDER), "Dataset folder not found"

# os.listdir returns entries in arbitrary order and also lists sub-directories and
# hidden entries (e.g. .ipynb_checkpoints, .DS_Store). Keep only visible regular
# files and sort them so that `datasets[0]` refers to the same file on every run.
datasets = sorted(
    entry
    for entry in os.listdir(DATASET_FOLDER)
    if not entry.startswith(".")
    and os.path.isfile(os.path.join(DATASET_FOLDER, entry))
)
logging.info(f"Found {len(datasets)} datasets")

26-Apr-24 17:40:01 [INFO] Found 1 datasets


# Reading the data

## Load the dataset, make checks

In [5]:
# Work on the first dataset found in DATASET_FOLDER; fail with a clear message
# instead of a bare IndexError when the folder is empty.
assert datasets, "No datasets found in dataset folder"
dataset = datasets[0]
# splitext removes only the final extension, so a file name that itself contains
# dots (e.g. "brain.tumor.v2.csv") keeps its full stem.
dataset_name = os.path.splitext(dataset)[0]

data = pd.read_csv(os.path.join(DATASET_FOLDER, dataset))

# Sanity checks: complete data and a binary `labels` target column.
assert data.isna().sum().sum() == 0, "Dataset contains missing values"
assert "labels" in data.columns, "Dataset does not contain `labels` column"
assert data.labels.nunique() == 2, "Dataset labels are not binary"

# value_counts orders by frequency, so .iloc[0] is the share of the most frequent label.
label_ratio = data.labels.value_counts(normalize=True).iloc[0]
assert 0.4 < label_ratio < 0.6, "Label ratio is not balanced"

# Move the labels column to the end so that features are `[:, :-1]` and the target is `[:, -1]`.
data = data[[col for col in data.columns if col != "labels"] + ["labels"]]

logging.info(f"------ Dataset: {dataset_name} | Shape: {data.shape} | Label ratio: {label_ratio:.2f} -------")

26-Apr-24 17:40:02 [INFO] ------ Dataset: Brain Tumor | Shape: (3762, 14) | Label ratio: 0.55 -------


## Train/test split

In [52]:
# Fixed seed so that the shuffle, and therefore the train/test split, is identical
# after every "Restart Kernel -> Run All". Note that `data` is overwritten with the
# shuffled frame, so re-running this cell on its own shuffles the shuffled frame again.
SPLIT_RANDOM_STATE = 42

data = data.sample(frac=1, random_state=SPLIT_RANDOM_STATE).reset_index(drop=True)
data = data.apply(pd.to_numeric)

# `train_set_size` is not defined in this notebook; presumably it is the train
# fraction provided by `from config import *` - TODO confirm.
cut = int(train_set_size*len(data))
assert 0 < cut < len(data), "train_set_size must leave both train and test sets non-empty"

train_data_df = data.iloc[:cut]
test_data_df = data.iloc[cut:]

# The last column is taken as the target, every other column as a feature.
X_train = train_data_df.iloc[:, :-1].values
y_train = train_data_df.iloc[:, -1].values
X_test = test_data_df.iloc[:, :-1].values
y_test = test_data_df.iloc[:, -1].values

logging.info(f"Step 0: Data split done | {len(X_train)} - {len(X_test)}")


26-Apr-24 18:10:27 [INFO] Step 0: Data split done | 2633 - 1129


## Scaling

In [7]:
# Fit the scaler on the training *features* only. The split data frames still carry
# the target as their last column, and labels are not available at inference time,
# so they must not be part of the scaler statistics or of the scaled matrices.
st_scaler = StandardScaler().fit(X_train)

# Per-feature statistics learned from the training split.
scale = st_scaler.scale_
mean = st_scaler.mean_
var = st_scaler.var_

# The test split is transformed with the statistics of the training split.
X_train_scaled = st_scaler.transform(X_train)
X_test_scaled = st_scaler.transform(X_test)

logging.debug("Step 1: Standard scaling complete")

26-Apr-24 17:40:03 [DEBUG] Step 1: Standard scaling complete


## KMeans

In [22]:
logging.info("Step 2: Performing clustering")

# Only k-means is wired up for now; DBSCAN is planned as a later addition.
CLUSTERING_ALG = "kmeans"

# Two clusters, with a fixed seed for the centroid initialisation.
# NOTE(review): the model is fitted on the raw X_train, not on X_train_scaled -
# confirm that clustering on unscaled features is intended.
clustering_model = KMeans(n_clusters=2, random_state=42, n_init="auto").fit(X_train)

# Cluster assignments of the fitted model for both splits (train first, then test).
clustering_labels_train, clustering_labels_test = (
    clustering_model.predict(split) for split in (X_train, X_test)
)

def evaluate_clustering(df, labels, model=None, alg="kmeans", round_digits=3, 
                        print_results=False, dataset="train"):
    """Compute internal clustering-quality metrics for one data split.

    Args:
        df: Feature matrix that was clustered; passed to `silhouette_score`
            and `calinski_harabasz_score`.
        labels: Cluster label of every row in `df`.
        model: Fitted clustering model. Its `inertia_` attribute is read only
            when `alg == "kmeans"` and `dataset == "train"`; may be omitted.
        alg: Name of the clustering algorithm.
        round_digits: Number of decimals every metric is rounded to.
        print_results: When true, the metrics are logged at DEBUG level.
        dataset: Name of the split, used in the log header and to decide
            whether inertia is reported.

    Returns:
        dict with keys "silhouette", "calinski_harabasz" and "inertia".
        "inertia" is None unless a k-means model was given for the train split.
    """
    # Convert to built-in floats so the `{name = }` log lines print plain numbers
    # regardless of how the installed NumPy version renders its scalar repr.
    silhouette = round(float(silhouette_score(df, labels)), round_digits)
    calinski_harabasz = round(float(calinski_harabasz_score(df, labels)), round_digits)

    # Inertia is a property of the fitted k-means model, so it is reported for the
    # train split only, and only when a model was actually supplied.
    inertia = None
    if alg == "kmeans" and dataset == "train" and model is not None:
        inertia = round(float(model.inertia_), round_digits)

    if print_results:
        logging.debug(f"Evaluation on {dataset}")
        logging.debug(f"\t{silhouette = }")
        logging.debug(f"\t{calinski_harabasz = }")
        if inertia is not None:
            logging.debug(f"\t{inertia = }")

    return {"silhouette": silhouette, "calinski_harabasz": calinski_harabasz,
            "inertia": inertia}
    
# Log the clustering-quality metrics for the train split and then for the test split.
evaluate_clustering(
    df=X_train,
    labels=clustering_labels_train,
    model=clustering_model,
    alg=CLUSTERING_ALG,
    print_results=True,
    dataset="train",
)
evaluate_clustering(
    df=X_test,
    labels=clustering_labels_test,
    model=clustering_model,
    alg=CLUSTERING_ALG,
    print_results=True,
    dataset="test",
)

logging.info("Step 2: Clustering done")

26-Apr-24 17:51:39 [INFO] Step 2: Performing clustering
26-Apr-24 17:51:39 [DEBUG] Evaluation on train
26-Apr-24 17:51:39 [DEBUG] 	silhouette = 0.587
26-Apr-24 17:51:39 [DEBUG] 	calinski_harabasz = 4398.598
26-Apr-24 17:51:39 [DEBUG] 	inertia = 237900920.42
26-Apr-24 17:51:39 [DEBUG] Evaluation on test
26-Apr-24 17:51:39 [DEBUG] 	silhouette = 0.584
26-Apr-24 17:51:39 [DEBUG] 	calinski_harabasz = 1716.836
26-Apr-24 17:51:39 [INFO] Step 2: Clustering done


In [48]:
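# NOTE: unused draft - the 0/1 label swap is handled inside `evaluate_classifier`
# (purpose="clustering_eval") instead of through this decorator.
# def swap_ones_zeros_decorator(func):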
#     def wrapper(*args, **kwargs):
#         # Run the function in the ordinary way
#         print(kwargs)
#         result = func(*args, **kwargs)
        
#         # Swap 1s and 0s in y_clust
#         kwargs["y_clust"] = [1 if label == 0 else 0 for label in kwargs['y_clust']]
#         swapped_result = func(*args, **kwargs)
        
#         # Return the best result (highest accuracy)
#         if swapped_result['accuracy'] > result['accuracy']:
#             return swapped_result
#         else:
#             return result
    
#     return wrapper


In [49]:

def evaluate_classifier(*, y_actual, y_clust, dataset="train", print_results=False,
                        purpose="clustering_eval"):
    """Score predicted binary labels against the actual labels.

    Args:
        y_actual: Ground-truth labels.
        y_clust: Predicted labels (for example cluster ids).
        dataset: Name of the split, used only in the log header.
        print_results: When true, the metrics are logged at DEBUG level.
        purpose: With "clustering_eval", predictions whose accuracy is below
            0.5 are inverted (0 becomes 1, everything else becomes 0) before
            the metrics are computed.

    Returns:
        dict with keys "accuracy", "f1" and "confusion_matrix".
    """
    accuracy = accuracy_score(y_actual, y_clust)

    # Below-chance accuracy in clustering mode: score the inverted labelling instead.
    must_invert = purpose == "clustering_eval" and accuracy < 0.5
    if must_invert:
        y_clust = [int(label == 0) for label in y_clust]
        accuracy = accuracy_score(y_actual, y_clust)

    f1 = f1_score(y_actual, y_clust)
    conf_matrix = confusion_matrix(y_actual, y_clust)
    metrics = {"accuracy": accuracy, "f1": f1, "confusion_matrix": conf_matrix}

    if not print_results:
        return metrics

    logging.debug(f"Evaluation on {dataset}")
    logging.debug(f"\tAccuracy:  {accuracy:.2f}")
    logging.debug(f"\tF1 Score: {f1:.2f}")
    logging.debug(f"\tConfusion Matrix: \n{conf_matrix}")
    return metrics

In [51]:
# Score the cluster assignments against the true labels for both splits.
train_classifier_metrics = evaluate_classifier(
    y_actual=y_train,
    y_clust=clustering_labels_train,
    dataset="train",
    print_results=True,
)
test_classifier_metrics = evaluate_classifier(
    y_actual=y_test,
    y_clust=clustering_labels_test,
    dataset="test",
    print_results=True,
)
# Last expression of the cell: display the test-split metrics.
test_classifier_metrics

26-Apr-24 18:08:52 [DEBUG] Evaluation on train
26-Apr-24 18:08:52 [DEBUG] 	Accuracy:  0.65
26-Apr-24 18:08:52 [DEBUG] 	F1 Score: 0.52
26-Apr-24 18:08:52 [DEBUG] 	Confusion Matrix: 
[[1222  227]
 [ 685  499]]
26-Apr-24 18:08:52 [DEBUG] Evaluation on test
26-Apr-24 18:08:52 [DEBUG] 	Accuracy:  0.68
26-Apr-24 18:08:52 [DEBUG] 	F1 Score: 0.54
26-Apr-24 18:08:52 [DEBUG] 	Confusion Matrix: 
[[551  79]
 [285 214]]


{'accuracy': 0.6775907883082374,
 'f1': 0.5404040404040404,
 'confusion_matrix': array([[551,  79],
        [285, 214]], dtype=int64)}