# Imports and variable setup

In [1]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import plotly.express as px
import pandas as pd
import numpy as np
import os

import datetime 

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from utils import filter_by_rule, detect_outliers_z_score, report_results
from config import *
import pickle


import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning, FutureWarning))

from DSClassifierMultiQ import DSClassifierMultiQ



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import logging


if not os.path.exists("logs"):
    os.mkdir("logs")

log_file = os.path.join("logs", "api.log")

rfh = logging.handlers.RotatingFileHandler(
    filename=log_file,
    mode='a',
    maxBytes=LOGGING_MAX_SIZE_MB*1024*1024,
    backupCount=LOGGING_BACKUP_COUNT,
    encoding=None,
    delay=0
)

console_handler = logging.StreamHandler()

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%y-%m-%d %H:%M:%S",
    handlers=[
        rfh,
        console_handler
    ],
)

logger = logging.getLogger(__name__)

In [3]:
#  https://www.kaggle.com/datasets/jakeshbohaju/brain-tumor/data
DATASET_FOLDER = "datasets" 

assert os.path.exists(DATASET_FOLDER), "Dataset folder not found"

datasets = os.listdir(DATASET_FOLDER)
logging.info(f"Found {len(datasets)} datasets")

24-04-26 20:47:26 [INFO] Found 1 datasets


# Reading the data

## Load the dataset, make checks

In [4]:
dataset = datasets[0]
dataset_name = dataset.split(".")[0]

data = pd.read_csv(os.path.join(DATASET_FOLDER, dataset))

assert data.isna().sum().sum() == 0, "Dataset contains missing values"
assert "labels" in data.columns, "Dataset does not contain `labels` column"
assert data.labels.nunique() == 2, "Dataset labels are not binary"

label_ratio = data.labels.value_counts(normalize=True).iloc[0]
assert 0.4 < label_ratio < 0.6, "Label ratio is not balanced"

# move labels column to the end 
data = data[[col for col in data.columns if col != "labels"] + ["labels"]]

logging.info(f"------ Dataset: {dataset_name} | Shape: {data.shape} | Label ratio: {label_ratio:.2f} -------")

24-04-26 20:47:26 [INFO] ------ Dataset: Brain Tumor | Shape: (3762, 14) | Label ratio: 0.55 -------


## Train test split

In [5]:
data = data.sample(frac=1).reset_index(drop=True)
data = data.apply(pd.to_numeric)
cut = int(train_set_size*len(data))

train_data_df = data.iloc[:cut]
test_data_df = data.iloc[cut:]

X_train = data.iloc[:cut, :-1].values
y_train = data.iloc[:cut, -1].values
X_test = data.iloc[cut:, :-1].values
y_test = data.iloc[cut:, -1].values

logging.info(f"Step 0: Data split done | {len(X_train)} - {len(X_test)}")


24-04-26 20:47:26 [INFO] Step 0: Data split done | 2633 - 1129


## Scaling

In [6]:
st_scaler = StandardScaler().fit(train_data_df)
# break
scale = st_scaler.scale_
mean = st_scaler.mean_
var = st_scaler.var_ 

X_train_scaled = st_scaler.transform(train_data_df)
X_test_scaled = st_scaler.transform(test_data_df)  #! during inference we won't have this

logging.debug("Step 1: Standard scaling complete")

24-04-26 20:47:26 [DEBUG] Step 1: Standard scaling complete


## KMeans

In [7]:
logging.info("Step 2: Performing clustering")

CLUSTERING_ALG = "kmeans" # in future we'll add DBSCAN

clustering_model = KMeans(n_clusters=2, random_state=42, n_init="auto")      
clustering_model.fit(X_train)  

clustering_labels_train = clustering_model.predict(X_train)
clustering_labels_test = clustering_model.predict(X_test)

train_data_df["labels_clustering"] = clustering_labels_train
test_data_df["labels_clustering"] = clustering_labels_test

def evaluate_clustering(df, labels, model=None, alg="kmeans", round_digits=3, 
                        print_results=False, dataset="train"):
    silhouette = silhouette_score(df, labels).round(round_digits)
    calinski_harabasz = calinski_harabasz_score(df, labels).round(round_digits)
    
    if alg == "kmeans" and dataset=="train":
        inertia = round(model.inertia_,round_digits)
        
    
    if print_results:
        logging.debug(f"Evaluation on {dataset}")
        logging.debug(f"\t{silhouette = }")
        logging.debug(f"\t{calinski_harabasz = }")
        if alg == "kmeans" and dataset=="train":
            logging.debug(f"\t{inertia = }")
    
evaluate_clustering(X_train, clustering_labels_train, clustering_model, 
                    CLUSTERING_ALG, print_results=True)
evaluate_clustering(X_test, clustering_labels_test, clustering_model, 
                    CLUSTERING_ALG, print_results=True, dataset="test")

logging.info("Step 2: Clustering done")

24-04-26 20:47:26 [INFO] Step 2: Performing clustering
Exception in thread Thread-7 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\hayk_\.conda\envs\thesis\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\hayk_\.conda\envs\thesis\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\hayk_\.conda\envs\thesis\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\hayk_\.conda\envs\thesis\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x81 in position 3: invalid start byte
found 0 physical cores < 1
  File "c:\Users\hayk_\.conda\envs\thesis\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
24-04-26 20:47:27 [DEBUG] Evaluation o

## KMeans evaluation

In [8]:

def evaluate_classifier(*, y_actual, y_clust, dataset="train", print_results=False,
                        purpose="clustering_eval"):    
    accuracy = accuracy_score(y_actual, y_clust)
    if purpose == "clustering_eval":
        if accuracy < 0.5: # swap 1's and 0's
            y_clust = [1 if label == 0 else 0 for label in y_clust]
            accuracy = accuracy_score(y_actual, y_clust)

    f1 = f1_score(y_actual, y_clust)
    conf_matrix = confusion_matrix(y_actual, y_clust)
    
    if print_results:
        logging.debug(f"Evaluation on {dataset}")
        logging.debug(f"\tAccuracy:  {accuracy:.2f}")
        logging.debug(f"\tF1 Score: {f1:.2f}")
        logging.debug(f"\tConfusion Matrix: \n{conf_matrix}")
    
    return {"accuracy": accuracy, "f1": f1, "confusion_matrix": conf_matrix}

In [9]:
evaluate_classifier(y_actual=y_train, y_clust=clustering_labels_train, 
                    dataset="train", print_results=True)
evaluate_classifier(y_actual=y_test, y_clust=clustering_labels_test, 
                    dataset="test", print_results=True)

logger.info("Step 3: Clustering as a classifier, evaluation done")

24-04-26 20:47:27 [DEBUG] Evaluation on train
24-04-26 20:47:27 [DEBUG] 	Accuracy:  0.66
24-04-26 20:47:27 [DEBUG] 	F1 Score: 0.54
24-04-26 20:47:27 [DEBUG] 	Confusion Matrix: 
[[1213  222]
 [ 667  531]]
24-04-26 20:47:27 [DEBUG] Evaluation on test
24-04-26 20:47:27 [DEBUG] 	Accuracy:  0.66
24-04-26 20:47:27 [DEBUG] 	F1 Score: 0.51
24-04-26 20:47:27 [DEBUG] 	Confusion Matrix: 
[[543 101]
 [287 198]]
24-04-26 20:47:27 [INFO] Step 3: Clustering as a classifier, evaluation done


## Distance calculation

In [10]:
def get_distance(df, model, alg="kmeans"):#, remove_outliers=True, normalize=True):
    if alg == "kmeans":
        distances = np.min(
            np.linalg.norm(df[:, np.newaxis] - model.cluster_centers_, axis=2), axis=1) 
    else:
        raise NotImplementedError("Only KMeans is supported")    
    
    return distances
    

In [11]:
def remove_outliers_and_normalize(df, distance_column="distance", label_column="labels"):
    outliers = detect_outliers_z_score(df["distance"])
    df["outlier"] = df[distance_column].apply(lambda x: x in outliers)
    df_no_outliers = df[~df["outlier"]]
    
    # min max scale the df. dont use outliers
    min_val = df_no_outliers.groupby(label_column)[distance_column].apply('min')
    max_val = df_no_outliers.groupby(label_column)[distance_column].apply('max')
    
    def scale(x, label):
        return (x - min_val[label]) / (max_val[label] - min_val[label]) if max_val[label] > min_val[label] else 0
        
    dist_norm = df.apply(lambda row: scale(row[distance_column], row[label_column]), axis=1)
    
    return dist_norm

In [12]:
train_data_df["distance"] = get_distance(X_train, clustering_model)
test_data_df["distance"] = get_distance(X_test, clustering_model)

In [13]:
train_data_df["distance_norm"] = remove_outliers_and_normalize(train_data_df) 
test_data_df["distance_norm"] = remove_outliers_and_normalize(test_data_df)

assert train_data_df.isna().sum().sum() == 0, "Train data contains NaNs"
assert test_data_df.isna().sum().sum() == 0, "Train data contains NaNs"

logger.info(f"Step 4: Distance calculation done")

24-04-26 20:47:28 [INFO] Step 4: Distance calculation done


## DST

In [14]:
# !pip install -U kaleido
print(1)

1


In [32]:
num_breaks = 3
mult_rules = False
debug_mode = True
print_final_model = True
num_workers = 0

RULE_FOLDER = "rules_saved_clean"

ignore_for_training = ["labels_clustering", "distance_norm"]
df_cols = [i for i in list(data.columns) if i not in ignore_for_training]

rows_use = None
if rows_use:
    train_data_df_use = train_data_df.head(rows_use)
    X_train_use = X_train[:rows_use]
    y_train_use = y_train[:rows_use]
else:
    train_data_df_use = train_data_df
    X_train_use = X_train
    y_train_use = y_train
logger.debug(f"Train: {len(X_train_use)}")
for method in ["kmeans", "random"]:
    name = f"dataset={dataset_name}, breaks={num_breaks}, add_mult_rules={mult_rules}, maf_method={method}"
    logger.info(f"Step 5: Run DST ({name})")
    DSC = DSClassifierMultiQ(2, debug_mode=debug_mode, num_workers=num_workers, maf_method=method,
                            data=train_data_df_use)#.head(rows_use))
    logger.debug(f"\tModel init done")    
    res = DSC.fit(X_train_use, y_train_use, 
            add_single_rules=True, single_rules_breaks=num_breaks, add_mult_rules=mult_rules,
            column_names=df_cols, print_every_epochs=5, print_final_model=print_final_model)
    losses, epoch, dt = res
    logger.debug(f"\tModel fit done")

    DSC.model.save_rules_bin(os.path.join(RULE_FOLDER, f"{name}.dsb"))
#     most_important_rules = DSC.model.find_most_important_rules()
#     print(most_important_rules)
    DSC.model.print_most_important_rules()
    y_pred = DSC.predict(X_test)

    logger.info(f"Step 6: Inference done")

    report_results(y_test, y_pred, dataset=dataset_name, method=method,
                epoch=epoch, dt=dt, losses=losses, 
                save_results=True, name=name, print_results=True,
                breaks=num_breaks, mult_rules=mult_rules)
    
    logging.info("-"*30)
    
    

24-04-26 21:19:03 [DEBUG] Train: 2633
24-04-26 21:19:03 [INFO] Step 5: Run DST (dataset=Brain Tumor, breaks=3, add_mult_rules=False, maf_method=kmeans)
24-04-26 21:19:03 [DEBUG] 	Model init done


Optimization started
Processing epoch	66	0.0382	

24-04-26 21:24:14 [DEBUG] 	Model fit done



Training time: 288.09s, epochs: 69

Least training loss reached: 0.033
DSModelMultiQ(
  DS Classifier using 49 rules
  
  Rule 1: Mean < 5.696
  	C1: 0.459	C2: 0.182	Unc: 0.359
  
  Rule 2: 5.696 < Mean < 9.586
  	C1: 0.381	C2: 0.266	Unc: 0.354
  
  Rule 3: 9.586 < Mean < 13.475
  	C1: 0.389	C2: 0.258	Unc: 0.354
  
  Rule 4: Mean > 13.475
  	C1: 0.878	C2: 0.008	Unc: 0.114
  
  Rule 5: Variance < 400.038
  	C1: 0.517	C2: 0.138	Unc: 0.346
  
  Rule 6: 400.038 < Variance < 718.473
  	C1: 0.385	C2: 0.261	Unc: 0.354
  
  Rule 7: 718.473 < Variance < 1036.907
  	C1: 0.430	C2: 0.210	Unc: 0.360
  
  Rule 8: Variance > 1036.907
  	C1: 0.596	C2: 0.124	Unc: 0.280
  
  Rule 9: Standard Deviation < 19.398
  	C1: 0.534	C2: 0.122	Unc: 0.344
  
  Rule 10: 19.398 < Standard Deviation < 25.324
  	C1: 0.367	C2: 0.287	Unc: 0.346
  
  Rule 11: 25.324 < Standard Deviation < 31.249
  	C1: 0.479	C2: 0.172	Unc: 0.349
  
  Rule 12: Standard Deviation > 31.249
  	C1: 0.599	C2: 0.118	Unc: 0.283
  
  Rule 13: Ent

24-04-26 21:24:14 [INFO] Step 6: Inference done
24-04-26 21:24:14 [DEBUG] Training Time: 288.09s
24-04-26 21:24:14 [DEBUG] Epochs: 69
24-04-26 21:24:14 [DEBUG] Min Loss: 0.033


24-04-26 21:24:14 [DEBUG] Accuracy:  0.98
24-04-26 21:24:14 [DEBUG] F1 Score: 0.97
24-04-26 21:24:14 [DEBUG] Confusion Matrix: 
[[635   9]
 [ 19 466]]
24-04-26 21:24:14 [INFO] ------------------------------
24-04-26 21:24:14 [INFO] Step 5: Run DST (dataset=Brain Tumor, breaks=3, add_mult_rules=False, maf_method=random)
24-04-26 21:24:14 [DEBUG] 	Model init done


Optimization started
Processing epoch	136	0.0266	

24-04-26 21:30:49 [DEBUG] 	Model fit done



Training time: 394.81s, epochs: 137

Least training loss reached: 0.026
DSModelMultiQ(
  DS Classifier using 49 rules
  
  Rule 1: Mean < 5.696
  	C1: 0.000	C2: 0.300	Unc: 0.700
  
  Rule 2: 5.696 < Mean < 9.586
  	C1: 0.000	C2: 0.278	Unc: 0.722
  
  Rule 3: 9.586 < Mean < 13.475
  	C1: 0.109	C2: 0.079	Unc: 0.812
  
  Rule 4: Mean > 13.475
  	C1: 0.320	C2: 0.000	Unc: 0.680
  
  Rule 5: Variance < 400.038
  	C1: 0.289	C2: 0.000	Unc: 0.711
  
  Rule 6: 400.038 < Variance < 718.473
  	C1: 0.176	C2: 0.019	Unc: 0.805
  
  Rule 7: 718.473 < Variance < 1036.907
  	C1: 0.125	C2: 0.061	Unc: 0.814
  
  Rule 8: Variance > 1036.907
  	C1: 0.000	C2: 0.307	Unc: 0.693
  
  Rule 9: Standard Deviation < 19.398
  	C1: 0.288	C2: 0.000	Unc: 0.712
  
  Rule 10: 19.398 < Standard Deviation < 25.324
  	C1: 0.296	C2: 0.000	Unc: 0.704
  
  Rule 11: 25.324 < Standard Deviation < 31.249
  	C1: 0.083	C2: 0.104	Unc: 0.813
  
  Rule 12: Standard Deviation > 31.249
  	C1: 0.000	C2: 0.339	Unc: 0.661
  
  Rule 13: En

24-04-26 21:30:50 [INFO] Step 6: Inference done
24-04-26 21:30:50 [DEBUG] Training Time: 394.81s
24-04-26 21:30:50 [DEBUG] Epochs: 137
24-04-26 21:30:50 [DEBUG] Min Loss: 0.026


24-04-26 21:30:50 [DEBUG] Accuracy:  0.98
24-04-26 21:30:50 [DEBUG] F1 Score: 0.98
24-04-26 21:30:50 [DEBUG] Confusion Matrix: 
[[635   9]
 [ 11 474]]
24-04-26 21:30:50 [INFO] ------------------------------


In [33]:
experiments = pd.read_csv("experiments.csv")

experiments.tail(2)

Unnamed: 0,name,accuracy,f1,confusion_matrix,training_time,epochs,min_loss,datetime,MAF method,dataset,breaks,mult_rules
84,"dataset=Brain Tumor, breaks=3, add_mult_rules=...",0.975199,0.970833,[[635 9]\n [ 19 466]],288.094734,69,0.033078,26-04-2024 21:24:14,kmeans,Brain Tumor,3.0,False
85,"dataset=Brain Tumor, breaks=3, add_mult_rules=...",0.982285,0.979339,[[635 9]\n [ 11 474]],394.811531,137,0.026403,26-04-2024 21:30:50,random,Brain Tumor,3.0,False


## Rules

In [34]:
RULE_FOLDER = "rules_saved_clean"

assert os.path.exists(RULE_FOLDER), f"Folder {RULE_FOLDER} does not exist"

rules = os.listdir(RULE_FOLDER)
logging.info(f"Found {len(rules)} rules")

24-04-26 21:41:45 [INFO] Found 4 rules


In [35]:
def get_pairs(rules):
    pairs = []
    for rule1 in rules:
        if 'maf_method' in rule1:
            for rule2 in rules:
                if rule1 == rule2 or (rule2, rule1) in pairs:
                    continue
                if rule2.startswith(rule1.split('maf_method')[0]):
                    pairs.append((rule1, rule2))
    return pairs

In [36]:
def extract_res(rules_info):
    rules = rules_info["preds"]
    mafs = rules_info["masses"]
    
    rule_names = [str(i) for i in rules]
    mass_first = [float(i[0]) for i in mafs]
    mass_second = [float(i[1]) for i in mafs]
    uncertainty = [float(i[2]) for i in mafs]
    
    res = {"rule": rule_names, "mass_first": mass_first, "mass_second": mass_second, "uncertainty": uncertainty}
    return res

In [39]:
pairs = get_pairs(rules)

dfs = {}

for r1, r2 in pairs:
    r1_info = pickle.load(open(os.path.join(RULE_FOLDER, r1), "rb"))
    r2_info = pickle.load(open(os.path.join(RULE_FOLDER, r2), "rb"))
    
    r1_df = pd.DataFrame(extract_res(r1_info))
    r2_df = pd.DataFrame(extract_res(r2_info))
    
    
    merged = pd.merge(r1_df, r2_df, on="rule", suffixes=('_kmeans', '_random'))
    merged.to_csv(f"rules_{r1}_{r2}.csv", index=False)

    print(f"Num rules {len(merged)}")
    fig = px.bar(merged, x='rule', y=['uncertainty_kmeans', 'uncertainty_random'], 
        title='Uncertainty comparison between kmeans and random', # put bars next to each other
        barmode='group')
    fig.write_html(f"{r1}_{r2}.html")
    dfs[r1] = merged 
    fig.show()

Num rules 49


Num rules 205


In [55]:
ks = list(dfs.keys())
print(ks)
a = dfs[ks[1]]


a["Uncertainty ratio"]  = a["uncertainty_random"] / a["uncertainty_kmeans"]

# remove inf
res = a["Uncertainty ratio"] 
res.replace([np.inf, -np.inf, ""], np.nan, inplace=True)

res = a["Uncertainty ratio"].dropna()

res.describe().round(3).to_clipboard()

['dataset=Brain Tumor, breaks=3, add_mult_rules=False, maf_method=kmeans.dsb', 'dataset=Brain Tumor, breaks=3, add_mult_rules=True, maf_method=kmeans.dsb']


In [47]:
a["Uncertainty ratio"].dropna()

0      1.950997
1      2.039998
2      2.296427
3      5.962318
4      2.055425
5      2.271974
6      2.257644
7      2.475599
8      2.068167
9      2.035401
10     2.328404
11     2.335880
12          inf
13     2.287148
14     1.489149
15     1.555324
16     1.407726
17     2.373192
18     2.151206
19     1.607796
20     0.800000
21     2.002392
22     2.090169
23     1.189022
24     2.078127
25     2.271453
26    11.785043
27     2.108319
28          inf
29    81.841992
30     1.546927
31     1.712679
32          inf
33     2.370462
34     1.966143
35     1.921688
36          inf
37     1.557658
38     1.915628
39     1.645202
40     2.094582
41     2.243877
42     2.615344
43     1.885207
44     2.361644
45     2.370196
46     1.955638
47     2.339852
48     5.545559
Name: Uncertainty ratio, dtype: float64

In [31]:

print(f"Num rules {len(merged)}")

Num rules 205


In [71]:
px.bar(merged, x='rule', y=['uncertainty_kmeans', 'uncertainty_random'], 
       title='Uncertainty comparison between kmeans and random', # put bars next to each other
       barmode='group')