# Imports and variable setup

In [55]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import plotly.express as px
import pandas as pd
import numpy as np
import os

import datetime 

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from utils import filter_by_rule, detect_outliers_z_score, report_results
from config import *
import pickle


import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning, FutureWarning))

from DSClassifierMultiQ import DSClassifierMultiQ



In [5]:
import logging


if not os.path.exists("logs"):
    os.mkdir("logs")

log_file = os.path.join("logs", "api.log")

rfh = logging.handlers.RotatingFileHandler(
    filename=log_file,
    mode='a',
    maxBytes=LOGGING_MAX_SIZE_MB*1024*1024,
    backupCount=LOGGING_BACKUP_COUNT,
    encoding=None,
    delay=0
)

console_handler = logging.StreamHandler()

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%y-%m-%d %H:%M:%S",
    handlers=[
        rfh,
        console_handler
    ],
)

logger = logging.getLogger(__name__)

In [6]:
#  https://www.kaggle.com/datasets/jakeshbohaju/brain-tumor/data
DATASET_FOLDER = "datasets" 

assert os.path.exists(DATASET_FOLDER), "Dataset folder not found"

datasets = os.listdir(DATASET_FOLDER)
logging.info(f"Found {len(datasets)} datasets")

24-04-26 19:43:59 [INFO] Found 1 datasets


# Reading the data

## Load the dataset, make checks

In [7]:
dataset = datasets[0]
dataset_name = dataset.split(".")[0]

data = pd.read_csv(os.path.join(DATASET_FOLDER, dataset))

assert data.isna().sum().sum() == 0, "Dataset contains missing values"
assert "labels" in data.columns, "Dataset does not contain `labels` column"
assert data.labels.nunique() == 2, "Dataset labels are not binary"

label_ratio = data.labels.value_counts(normalize=True).iloc[0]
assert 0.4 < label_ratio < 0.6, "Label ratio is not balanced"

# move labels column to the end 
data = data[[col for col in data.columns if col != "labels"] + ["labels"]]

logging.info(f"------ Dataset: {dataset_name} | Shape: {data.shape} | Label ratio: {label_ratio:.2f} -------")

24-04-26 19:44:05 [INFO] ------ Dataset: Brain Tumor | Shape: (3762, 14) | Label ratio: 0.55 -------


## Train test split

In [8]:
data = data.sample(frac=1).reset_index(drop=True)
data = data.apply(pd.to_numeric)
cut = int(train_set_size*len(data))

train_data_df = data.iloc[:cut]
test_data_df = data.iloc[cut:]

X_train = data.iloc[:cut, :-1].values
y_train = data.iloc[:cut, -1].values
X_test = data.iloc[cut:, :-1].values
y_test = data.iloc[cut:, -1].values

logging.info(f"Step 0: Data split done | {len(X_train)} - {len(X_test)}")


24-04-26 19:44:08 [INFO] Step 0: Data split done | 2633 - 1129


## Scaling

In [9]:
st_scaler = StandardScaler().fit(train_data_df)
# break
scale = st_scaler.scale_
mean = st_scaler.mean_
var = st_scaler.var_ 

X_train_scaled = st_scaler.transform(train_data_df)
X_test_scaled = st_scaler.transform(test_data_df)  #! during inference we won't have this

logging.debug("Step 1: Standard scaling complete")

24-04-26 19:44:09 [DEBUG] Step 1: Standard scaling complete


## KMeans

In [10]:
logging.info("Step 2: Performing clustering")

CLUSTERING_ALG = "kmeans" # in future we'll add DBSCAN

clustering_model = KMeans(n_clusters=2, random_state=42, n_init="auto")      
clustering_model.fit(X_train)  

clustering_labels_train = clustering_model.predict(X_train)
clustering_labels_test = clustering_model.predict(X_test)

train_data_df["labels_clustering"] = clustering_labels_train
test_data_df["labels_clustering"] = clustering_labels_test

def evaluate_clustering(df, labels, model=None, alg="kmeans", round_digits=3, 
                        print_results=False, dataset="train"):
    silhouette = silhouette_score(df, labels).round(round_digits)
    calinski_harabasz = calinski_harabasz_score(df, labels).round(round_digits)
    
    if alg == "kmeans" and dataset=="train":
        inertia = round(model.inertia_,round_digits)
        
    
    if print_results:
        logging.debug(f"Evaluation on {dataset}")
        logging.debug(f"\t{silhouette = }")
        logging.debug(f"\t{calinski_harabasz = }")
        if alg == "kmeans" and dataset=="train":
            logging.debug(f"\t{inertia = }")
    
evaluate_clustering(X_train, clustering_labels_train, clustering_model, 
                    CLUSTERING_ALG, print_results=True)
evaluate_clustering(X_test, clustering_labels_test, clustering_model, 
                    CLUSTERING_ALG, print_results=True, dataset="test")

logging.info("Step 2: Clustering done")

24-04-26 19:44:11 [INFO] Step 2: Performing clustering
Exception in thread Thread-6 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\hayk_\.conda\envs\thesis\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\hayk_\.conda\envs\thesis\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\hayk_\.conda\envs\thesis\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\hayk_\.conda\envs\thesis\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x81 in position 3: invalid start byte
found 0 physical cores < 1
  File "c:\Users\hayk_\.conda\envs\thesis\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
24-04-26 19:44:12 [DEBUG] Evaluation o

## KMeans evaluation

In [11]:

def evaluate_classifier(*, y_actual, y_clust, dataset="train", print_results=False,
                        purpose="clustering_eval"):    
    accuracy = accuracy_score(y_actual, y_clust)
    if purpose == "clustering_eval":
        if accuracy < 0.5: # swap 1's and 0's
            y_clust = [1 if label == 0 else 0 for label in y_clust]
            accuracy = accuracy_score(y_actual, y_clust)

    f1 = f1_score(y_actual, y_clust)
    conf_matrix = confusion_matrix(y_actual, y_clust)
    
    if print_results:
        logging.debug(f"Evaluation on {dataset}")
        logging.debug(f"\tAccuracy:  {accuracy:.2f}")
        logging.debug(f"\tF1 Score: {f1:.2f}")
        logging.debug(f"\tConfusion Matrix: \n{conf_matrix}")
    
    return {"accuracy": accuracy, "f1": f1, "confusion_matrix": conf_matrix}

In [12]:
evaluate_classifier(y_actual=y_train, y_clust=clustering_labels_train, 
                    dataset="train", print_results=True)
evaluate_classifier(y_actual=y_test, y_clust=clustering_labels_test, 
                    dataset="test", print_results=True)

logger.info("Step 3: Clustering as a classifier, evaluation done")

24-04-26 19:44:13 [DEBUG] Evaluation on train
24-04-26 19:44:13 [DEBUG] 	Accuracy:  0.66
24-04-26 19:44:13 [DEBUG] 	F1 Score: 0.53
24-04-26 19:44:13 [DEBUG] 	Confusion Matrix: 
[[1203  254]
 [ 654  522]]
24-04-26 19:44:13 [DEBUG] Evaluation on test
24-04-26 19:44:13 [DEBUG] 	Accuracy:  0.67
24-04-26 19:44:13 [DEBUG] 	F1 Score: 0.55
24-04-26 19:44:13 [DEBUG] 	Confusion Matrix: 
[[535  87]
 [281 226]]
24-04-26 19:44:13 [INFO] Step 3: Clustering as a classifier, evaluation done


## Distance calculation

In [13]:
def get_distance(df, model, alg="kmeans"):#, remove_outliers=True, normalize=True):
    if alg == "kmeans":
        distances = np.min(
            np.linalg.norm(df[:, np.newaxis] - model.cluster_centers_, axis=2), axis=1) 
    else:
        raise NotImplementedError("Only KMeans is supported")    
    
    return distances
    

In [14]:
def remove_outliers_and_normalize(df, distance_column="distance", label_column="labels"):
    outliers = detect_outliers_z_score(df["distance"])
    df["outlier"] = df[distance_column].apply(lambda x: x in outliers)
    df_no_outliers = df[~df["outlier"]]
    
    # min max scale the df. dont use outliers
    min_val = df_no_outliers.groupby(label_column)[distance_column].apply('min')
    max_val = df_no_outliers.groupby(label_column)[distance_column].apply('max')
    
    def scale(x, label):
        return (x - min_val[label]) / (max_val[label] - min_val[label]) if max_val[label] > min_val[label] else 0
        
    dist_norm = df.apply(lambda row: scale(row[distance_column], row[label_column]), axis=1)
    
    return dist_norm

In [15]:
train_data_df["distance"] = get_distance(X_train, clustering_model)
test_data_df["distance"] = get_distance(X_test, clustering_model)

In [16]:
train_data_df["distance_norm"] = remove_outliers_and_normalize(train_data_df) 
test_data_df["distance_norm"] = remove_outliers_and_normalize(test_data_df)

assert train_data_df.isna().sum().sum() == 0, "Train data contains NaNs"
assert test_data_df.isna().sum().sum() == 0, "Train data contains NaNs"

logger.info(f"Step 4: Distance calculation done")

24-04-26 19:44:19 [INFO] Step 4: Distance calculation done


## DST

In [63]:
num_breaks = 3
mult_rules = False
debug_mode = True
print_final_model = True
num_workers = 0

RULE_FOLDER = "rules_saved_clean"

ignore_for_training = ["labels_clustering", "distance_norm"]
df_cols = [i for i in list(data.columns) if i not in ignore_for_training]

rows_use = 100000000

# for method in ["random", "kmeans"]:
for method in ["kmeans", "random"]:
    name = f"dataset={dataset_name}, breaks={num_breaks}, add_mult_rules={mult_rules}, maf_method={method}"
    logger.info(f"Step 5: Run DST ({name})")
    DSC = DSClassifierMultiQ(2, debug_mode=debug_mode, num_workers=num_workers, maf_method=method,
                            data=train_data_df.head(rows_use))
    logger.debug(f"\tModel init done")    
    res = DSC.fit(X_train[:rows_use], y_train[:rows_use], 
            add_single_rules=True, single_rules_breaks=num_breaks, add_mult_rules=mult_rules,
            column_names=df_cols, print_every_epochs=5, print_final_model=print_final_model)
    losses, epoch, dt = res
    logger.debug(f"\tModel fit done")

    DSC.model.save_rules_bin(os.path.join(RULE_FOLDER, f"{name}.dsb"))
#     most_important_rules = DSC.model.find_most_important_rules()
#     print(most_important_rules)
    DSC.model.print_most_important_rules()
    y_pred = DSC.predict(X_test)

    logger.info(f"Step 6: Inference done")

    report_results(y_test, y_pred, dataset=dataset_name, method=method,
                epoch=epoch, dt=dt, losses=losses, 
                save_results=True, name=name, print_results=True,
                breaks=num_breaks, mult_rules=mult_rules)
    
    logging.info("-"*30)
    
    

24-04-26 20:16:22 [INFO] Step 5: Run DST (dataset=Brain Tumor, breaks=3, add_mult_rules=False, maf_method=kmeans)
24-04-26 20:16:22 [DEBUG] 	Model init done


Optimization started
Processing epoch	11	0.4363	

In [28]:
experiments = pd.read_csv("experiments.csv")

experiments.tail(4)

Unnamed: 0,name,accuracy,f1,confusion_matrix,training_time,epochs,min_loss,datetime,MAF method,dataset,breaks,mult_rules
68,"dataset=Brain Tumor, breaks=3, add_mult_rules=...",0.983171,0.981132,[[616 6]\n [ 13 494]],56.823486,80,0.024735,26-04-2024 19:51:10,kmeans,Brain Tumor,3.0,False
69,"dataset=Brain Tumor, breaks=3, add_mult_rules=...",0.984057,0.982036,[[619 3]\n [ 15 492]],127.310002,137,0.029311,26-04-2024 19:53:18,random,Brain Tumor,3.0,False
70,"dataset=Brain Tumor, breaks=3, add_mult_rules=...",0.983171,0.981132,[[616 6]\n [ 13 494]],61.127786,80,0.024735,26-04-2024 19:55:26,kmeans,Brain Tumor,3.0,False
71,"dataset=Brain Tumor, breaks=3, add_mult_rules=...",0.984942,0.983051,[[619 3]\n [ 14 493]],86.98929,123,0.02834,26-04-2024 19:56:54,random,Brain Tumor,3.0,False


## Rules

In [30]:
RULE_FOLDER = "rules_saved_clean"

assert os.path.exists(RULE_FOLDER), f"Folder {RULE_FOLDER} does not exist"

rules = os.listdir(RULE_FOLDER)
logging.info(f"Found {len(rules)} rules")

24-04-26 20:00:24 [INFO] Found 2 rules


In [50]:
def get_pairs(rules):
    pairs = []
    for rule1 in rules:
        if 'maf_method' in rule1:
            for rule2 in rules:
                if rule1 == rule2 or (rule2, rule1) in pairs:
                    continue
                if rule2.startswith(rule1.split('maf_method')[0]):
                    pairs.append((rule1, rule2))
    return pairs

In [51]:
def extract_res(rules_info):
    rules = rules_info["preds"]
    mafs = rules_info["masses"]
    
    rule_names = [str(i) for i in rules]
    mass_first = [float(i[0]) for i in mafs]
    mass_second = [float(i[1]) for i in mafs]
    uncertainty = [float(i[2]) for i in mafs]
    
    res = {"rule": rule_names, "mass_first": mass_first, "mass_second": mass_second, "uncertainty": uncertainty}
    return res

In [58]:
pairs = get_pairs(rules)

for r1, r2 in pairs:
    r1_info = pickle.load(open(os.path.join(RULE_FOLDER, r1), "rb"))
    r2_info = pickle.load(open(os.path.join(RULE_FOLDER, r2), "rb"))
    
    r1_df = pd.DataFrame(extract_res(r1_info))
    r2_df = pd.DataFrame(extract_res(r2_info))
    
    merged = pd.merge(r1_df, r2_df, on="rule", suffixes=('_kmeans', '_random'))



In [59]:
merged

Unnamed: 0,rule,mass_first_kmeans,mass_second_kmeans,uncertainty_kmeans,mass_first_random,mass_second_random,uncertainty_random
0,Mean < 5.570,0.103486,0.541283,0.355231,0.039337,0.15054,0.810123
1,5.570 < Mean < 9.285,0.232696,0.418036,0.349268,0.039537,0.140639,0.819823
2,9.285 < Mean < 13.001,0.302733,0.363204,0.334063,0.139017,0.032521,0.828462
3,Mean > 13.001,0.095045,0.685722,0.219234,0.254088,0.0,0.745912
4,Variance < 398.858,0.121278,0.524622,0.3541,0.230529,0.0,0.769471
5,398.858 < Variance < 687.977,0.243402,0.403388,0.35321,0.141709,0.040688,0.817603
6,687.977 < Variance < 977.096,0.104279,0.539633,0.356087,0.045751,0.130867,0.823382
7,Variance > 977.096,0.0638,0.710347,0.225853,0.0,0.287257,0.712743
8,Standard Deviation < 19.156,0.100246,0.547191,0.352563,0.165505,0.004691,0.829804
9,19.156 < Standard Deviation < 24.839,0.256166,0.395774,0.348059,0.178942,0.011055,0.810002


In [60]:
px.bar(merged, x='rule', y=['uncertainty_kmeans', 'uncertainty_random'], 
       title='Uncertainty comparison between kmeans and random', # put bars next to each other
       barmode='group')