# imports and config

In [1]:
!pip install nbformat>=4.2.0 entrypoints toolz

In [2]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
import plotly.express as px
import pandas as pd
import numpy as np
import os

import datetime 
import logging

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import train_test_split



In [3]:

logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s", 
                    datefmt="%d-%b-%y %H:%M:%S")

In [4]:
DATASET = "gaussian_df.csv"
PATH = os.path.join(os.getcwd(), "..", "data", DATASET)
LOWER_CONFIDENCE_BY_PROPORTION = True
OUTLIER_THRESHOLD_NUM_STD = 2

# Loading data

In [5]:

df = pd.read_csv(PATH)
df.head()

Unnamed: 0,x,y,labels
0,1.61458,-0.10042,0.0
1,1.019888,0.312567,0.0
2,1.493949,0.417392,0.0
3,0.985614,0.118068,0.0
4,0.684116,0.219825,0.0


In [6]:
data = df[["x", "y"]]
n_clusters = 2

In [7]:
# scatter plot
fig = px.scatter(df, x="x", y="y", color="labels")
fig.show()

# KMeans

In [8]:
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")

kmeans_labels = kmeans.fit_predict(data)
# # Calculate metrics
# kmeans_silhouette = (silhouette_score(data, kmeans_labels)
#                      if len(set(kmeans_labels)) > 2 else 0)


Exception in thread Thread-8 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\hayk_\.conda\envs\thesis\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\hayk_\.conda\envs\thesis\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\hayk_\.conda\envs\thesis\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\hayk_\.conda\envs\thesis\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x81 in position 3: invalid start byte

Could not find the number of physical cores for the following reason:
found 0 physical cores < 1

  File "c:\Users\hayk_\.conda\envs\thesis\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [9]:
kmeans.cluster_centers_

array([[1.19285911, 0.18145612],
       [0.1777886 , 0.90350888]])

In [10]:
# calculate each points distance to the centroids
df["distance_to_centroid"] = np.min(
    np.linalg.norm(data.values[:, np.newaxis] - kmeans.cluster_centers_, axis=2), axis=1)



In [11]:
# plot
df["labels_kmeans"] = kmeans_labels.astype(str)


fig = px.scatter(df, x="x", y="y", color="labels_kmeans", hover_data=["distance_to_centroid"])

def add_centroids(fig, kmeans):
    fig.add_scatter(x=kmeans.cluster_centers_[:, 0],
                    y=kmeans.cluster_centers_[:, 1],
                    mode="markers",
                    marker=dict(symbol="x", size=15, color="Green"))

    return fig

fig = add_centroids(fig, kmeans)

fig.show()

In [12]:
df.groupby("labels_kmeans").agg({"distance_to_centroid": "max"})

Unnamed: 0_level_0,distance_to_centroid
labels_kmeans,Unnamed: 1_level_1
0,0.704649
1,0.951743


# Outliers

In [13]:
def detect_outliers_z_score(data, threshold=OUTLIER_THRESHOLD_NUM_STD):
    outliers = []
    mean = np.mean(data)
    std_dev = np.std(data)
    
    for i in data:
        z_score = (i - mean) / std_dev 
        if np.abs(z_score) > threshold:
            outliers.append(i)
    return outliers

outliers = detect_outliers_z_score(df["distance_to_centroid"])

df["outlier"] = df["distance_to_centroid"].apply(lambda x: x in outliers)


In [14]:
# plot
fig = px.scatter(df, x="x", y="y", color="labels_kmeans", symbol="outlier", symbol_map={True: "cross", False: "circle"})

# add crosses for centroids
fig.add_scatter(x=kmeans.cluster_centers_[:, 0],
                y=kmeans.cluster_centers_[:, 1],
                mode="markers",
                marker=dict(symbol="x", size=15, color="Green"))

fig.show()

In [15]:
df_okay = df[~df["outlier"]]

# normalize distance to centroid for each cluster
df_okay["distance_to_centroid_norm"] = df_okay.groupby("labels_kmeans")["distance_to_centroid"].transform(
    lambda x: (x - x.min()) / (x.max() - x.min()))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Rule scoring

In [16]:
def filter_by_rule(df, rule_lambda, lower_confidence_by_proportion=LOWER_CONFIDENCE_BY_PROPORTION,
                   only_plot=False):
    """
    Filters a DataFrame based on a given rule lambda function and calculates the confidence score.

    Note:
        The confidence score is calculated as the average distance to the centroid of the most common cluster.
        If the data points belong to the same cluster, the confidence score is the average distance to the centroid.
        If the data points belong to different clusters, the confidence score is the average distance to the centroid of the most common cluster.
        If the confidence score is lowered by proportion, the confidence score is multiplied by the proportion of data points in the most common cluster.

        Since the closer the data points are to the centroid, the better, the confidence score is calculated as 1 - ... 
    Args:
        df (pd.DataFrame): The input DataFrame to filter.
        rule_lambda (function): A lambda function that defines the filtering rule.
        lower_confidence_by_proportion (bool, optional): Whether to lower the confidence score by proportion. 
            Defaults to True.
        only_plot (bool, optional): Whether to to only plot the data. Defaults to False.
        
    Returns:
        tuple: A tuple containing the filtered DataFrame and the confidence score.

    Example:
        filter_by_rule(df, lambda row: row["x"]>0.5 and row["y"]>0.5)
        

    """
    
    # example is lambda row: row["x"]>0.5 and row["y"]>0.5
    df["rule_applies"] = df.apply(rule_lambda, axis=1)
    
    df_rule = df[df["rule_applies"]]
    
    if df_rule.empty:
        logging.info("No data points left after filtering")
        return df
    
    if only_plot:
        fig = px.scatter(df, x="x", y="y", color="rule_applies")
        fig = add_centroids(fig, kmeans)
        fig.show()
        return fig 
    
    num_labels = df_rule["labels_kmeans"].nunique()
    
    logging.info(f"Number of data points left after filtering: {len(df_rule)}")
    logging.info(f"Number of clusters left after filtering: {num_labels}")  
    
    if df_rule["labels_kmeans"].nunique() == 1:
        logging.info("All data points belong to the same cluster")
        confidence = df_rule["distance_to_centroid_norm"].mean()
        
        logging.info(f"Confidence: {confidence}")
    else:
        logging.info("Data points belong to different clusters")
        # most common cluster
        most_common_cluster = df_rule["labels_kmeans"].mode().values[0]
        logging.info(f"Most common cluster: {most_common_cluster}")
        
        # confidence
        confidence = df_rule[df_rule["labels_kmeans"] == most_common_cluster]["distance_to_centroid_norm"].mean()
        logging.info(f"Confidence: {confidence}")
        
        if lower_confidence_by_proportion:
            # num of data points in most common cluster
            num_points = len(df_rule[df_rule["labels_kmeans"] == most_common_cluster])
            # proportion of data points in most common cluster
            proportion = num_points / len(df_rule)
            
            confidence = confidence * proportion
            logging.info(f"Confidence after lowering based on proportion: {confidence}")
            
    return df, 1 - confidence


In [17]:
d, conf = filter_by_rule(df_okay, lambda row: 0.2<row["x"]<0.3 and 0.5<row["y"]<1)

fig = px.scatter(d, x="x", y="y", color="rule_applies")

fig = add_centroids(fig, kmeans)

fig



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

23-Apr-24 19:10:07 [INFO] Number of data points left after filtering: 18
23-Apr-24 19:10:07 [INFO] Number of clusters left after filtering: 1
23-Apr-24 19:10:07 [INFO] All data points belong to the same cluster
23-Apr-24 19:10:07 [INFO] Confidence: 0.17051691869533558


In [18]:
d, conf = filter_by_rule(df_okay, lambda row: 0.5<row["x"]<1 and 0.5<row["y"]<1)

fig = px.scatter(d, x="x", y="y", color="rule_applies")

fig = add_centroids(fig, kmeans)

fig



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

23-Apr-24 19:10:08 [INFO] Number of data points left after filtering: 20
23-Apr-24 19:10:08 [INFO] Number of clusters left after filtering: 2
23-Apr-24 19:10:08 [INFO] Data points belong to different clusters
23-Apr-24 19:10:08 [INFO] Most common cluster: 1
23-Apr-24 19:10:08 [INFO] Confidence: 0.7030490123603427
23-Apr-24 19:10:08 [INFO] Confidence after lowering based on proportion: 0.45698185803422275


# Rules generators

## CN2

In [19]:
PATH 

'c:\\Users\\hayk_\\OneDrive\\Desktop\\Thesis\\CDSGD\\cdsgd\\..\\data\\gaussian_df.csv'

https://github.com/scikit-learn-contrib/skope-rules/issues/58

https://stackoverflow.com/questions/61867945/python-import-error-cannot-import-name-six-from-sklearn-externals

In [None]:
!pip install skope-rules





Collecting skope-rules
  Downloading skope_rules-1.0.1-py3-none-any.whl.metadata (463 bytes)
Downloading skope_rules-1.0.1-py3-none-any.whl (14 kB)
Installing collected packages: skope-rules
Successfully installed skope-rules-1.0.1


In [None]:
df_use = df[["x", "y", "labels_kmeans"]]

In [None]:
df_use 

Unnamed: 0,x,y,labels_kmeans
0,1.614580,-0.100420,0
1,1.019888,0.312567,0
2,1.493949,0.417392,0
3,0.985614,0.118068,0
4,0.684116,0.219825,0
...,...,...,...
495,0.427324,1.220816,1
496,-0.237872,1.111249,1
497,0.109251,0.857175,1
498,0.039282,1.253865,1


In [None]:
import six
import sys
sys.modules['sklearn.externals.six'] = six

from sklearn.datasets import load_iris
from skrules import SkopeRules

feature_names = df_use.columns
clf = SkopeRules(max_depth_duplication=3,
                 n_estimators=30,
                 precision_min=0.02,
                 recall_min=0.02,
                 feature_names=feature_names)

# for idx, species in enumerate(df_use):
#     print(idx, species)
X, y = df_use[["x", 'y']], df_use["labels_kmeans"].astype(int )

clf.fit(X, y)
rules = clf.rules_[0:3]
# print("Rules for iris", species)
for rule in rules:
    print(rule)
print()
print(20*'=')
print()


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.



('x <= 0.6895888149738312 and y > 0.35303783416748047', (1.0, 0.9849683977570663, 10))
('x <= 0.5822996497154236', (0.9948453608247423, 0.9368932038834952, 1))
('x > 0.5822996497154236 and y > 0.7978371977806091', (1.0, 0.05825242718446602, 1))




In [None]:
import Orange

# Sample data: a mix of categorical and numerical features
data = Orange.data.Table(PATH)  # using an in-built dataset for demonstration

# Initialize the CN2 learner
learner = Orange.classification.rules.CN2Learner()

# Fit the model
classifier = learner(data)

ValueError: Categorical class variable expected.

In [None]:
import Orange

data = Orange.data.Table("iris")
learner = Orange.classification.CN2UnorderedLearner()

# # consider up to 10 solution streams at one time
# learner.rule_finder.search_algorithm.beam_width = 10

# # continuous value space is constrained to reduce computation time
# learner.rule_finder.search_strategy.constrain_continuous = True

# # found rules must cover at least 15 examples
# learner.rule_finder.general_validator.min_covered_examples = 15

# # found rules may combine at most 2 selectors (conditions)
learner.rule_finder.general_validator.max_rule_length = 2

classifier = learner(data)

In [None]:
print(classifier.rule_list[0])


IF petal length<=3.0 AND sepal width>=2.9 THEN iris=Iris-setosa 


In [None]:
# Initialize the CN2 learner
learner = Orange.classification.rules.CN2Learner()

# Fit the model
classifier = learner(data)



ValueError: Categorical class variable expected.

In [None]:

# Output rules
print("Rules generated from CN2 algorithm:")

print(classifier.rule_list[0])
# for rule in classifier.rule_list:
#     print(Orange.classification.rules.to_string(rule))

Rules generated from CN2 algorithm:
IF chest pain==atypical ang AND ST by exercise>=1.8 THEN diameter narrowing=1 


# DS

In [21]:
def report_results(y_test, y_pred, epoch=None, dt=None, losses=None, 
                   name=None, save_results=False, save_path=None):
    if epoch and dt and losses:
        logging.info(f"Training Time: {dt:.2f}s")
        logging.info(f"Epochs: {epoch+1}")
        logging.info(f"Min Loss: {losses[-1]:.3f}")
        px.line(losses, markers=True).show()
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    logging.info(f"Accuracy:  {accuracy:.2f}")
    logging.info(f"F1 Score: {f1:.2f}")
    logging.info(f"Confusion Matrix: \n{conf_matrix}")
    
    if save_results:
        now = datetime.datetime.now().strftime("%d-%m-%Y %H:%M:%S")
        if save_path is None:
            save_path = f"experiments.csv"
        if name is None:
            name = "No name"
        res_row = {"name": name, "accuracy": accuracy, "f1": f1, 
                    "confusion_matrix": conf_matrix, 
                    "training_time": dt, "epochs": epoch+1,"min_loss": losses[-1], 
                    "datetime": now}
        
        res_df = pd.read_csv(save_path) if os.path.exists(save_path) else pd.DataFrame()
        res_df = pd.concat([res_df, pd.DataFrame([res_row])], ignore_index=True)
        res_df.to_csv(save_path, index=False)

## Data prep

In [22]:
# train test split

X = df_okay[["x", "y"]]
y = df_okay["labels_kmeans"]

data = df_okay[["x", 'y', 'labels_kmeans']]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
data = data.sample(frac=1).reset_index(drop=True)

data = data.apply(pd.to_numeric)
cut = int(0.7*len(data))

X_train = data.iloc[:cut, :-1].values
y_train = data.iloc[:cut, -1].values
X_test = data.iloc[cut:, :-1].values
y_test = data.iloc[cut:, -1].values


logging.info(f"Train: {len(X_train)} Test: {len(X_test)}")



23-Apr-24 19:11:36 [INFO] Train: 343 Test: 147


In [23]:

# data = pd.read_csv("../data/breast-cancer-wisconsin.csv")

# data = data.drop("id", axis=1)
# data["class"] = data["class"].map({2: 0, 4: 1})

# data = data.apply(pd.to_numeric, args=("coerce",))
# data = data.sample(frac=1).reset_index(drop=True)

# cut = int(0.7*len(data))

# X_train = data.iloc[:cut, :-1].values
# y_train = data.iloc[:cut, -1].values
# X_test = data.iloc[cut:, :-1].values
# y_test = data.iloc[cut:, -1].values


In [24]:
def uniform_rule(X, num_parts=5, col_names=None):
    # create a uniform rule
    rules = []
    
    for i in range(X.shape[1]):
        col = X[:, i]
        min_val = np.min(col)
        max_val = np.max(col)
        
        step = (max_val - min_val) / num_parts
        
        for j in range(num_parts):
            text = f"{min_val + j*step} <= x[{i}] <= {min_val + (j+1)*step}"
            print(text)
            # todo
            rules.append(DSRule(lambda x: min_val + j*step <= x[i] <= min_val + (j+1)*step))
            
    return rules

In [25]:
rules = uniform_rule(X_train, 3, col_names=["x", "y"])
rules

-0.360481450892285 <= x[0] <= 0.32273102453801616


NameError: name 'DSRule' is not defined

## Run Model

In [26]:
import sys
print(sys.path)

['c:\\Users\\hayk_\\OneDrive\\Desktop\\Thesis\\CDSGD\\cdsgd', 'c:\\Users\\hayk_\\.conda\\envs\\thesis\\python310.zip', 'c:\\Users\\hayk_\\.conda\\envs\\thesis\\DLLs', 'c:\\Users\\hayk_\\.conda\\envs\\thesis\\lib', 'c:\\Users\\hayk_\\.conda\\envs\\thesis', '', 'C:\\Users\\hayk_\\AppData\\Roaming\\Python\\Python310\\site-packages', 'C:\\Users\\hayk_\\AppData\\Roaming\\Python\\Python310\\site-packages\\win32', 'C:\\Users\\hayk_\\AppData\\Roaming\\Python\\Python310\\site-packages\\win32\\lib', 'C:\\Users\\hayk_\\AppData\\Roaming\\Python\\Python310\\site-packages\\Pythonwin', 'c:\\Users\\hayk_\\.conda\\envs\\thesis\\lib\\site-packages', 'c:\\Users\\hayk_\\.conda\\envs\\thesis\\lib\\site-packages\\win32', 'c:\\Users\\hayk_\\.conda\\envs\\thesis\\lib\\site-packages\\win32\\lib', 'c:\\Users\\hayk_\\.conda\\envs\\thesis\\lib\\site-packages\\Pythonwin']


In [27]:

from DSClassifierMultiQ import DSClassifierMultiQ
from DSRule import DSRule

# from importlib import reload
# reload(DSClassifierMultiQ)


In [28]:


logging.info("Training DSClassifierMultiQ")
DSC = DSClassifierMultiQ(2, debug_mode=True, num_workers=0)
# DSC = DSClassifierMultiQ(2, debug_mode=True, num_workers=0, maf_method="panir")

DSC.model.add_rule(DSRule(lambda x: x[0] > 18, "Patient is adult"))
# DSC.model.add_rule(DSRule(lambda x: x[0] < 18, "Patient is adult"))
# for i in range(len(rules)):
#     # print(rules[i])
#     DSC.model.add_rule(rules[i])


# for num_breaks in [2, 3, 4, 5]:
num_breaks = 3

name = f"gauusian, {num_breaks} single breaks, add_mult_rules=False"

res = DSC.fit(X_train, y_train, 
            add_single_rules=True, single_rules_breaks=num_breaks, add_mult_rules=False,
            column_names=["x", 'y'], print_every_epochs=1, print_final_model=True)

losses, epoch, dt = res

DSC.model.save_rules_bin(os.path.join("rules_saved", f"{name}.dsb"))


y_pred = DSC.predict(X_test)

    
report_results(y_test, y_pred, epoch=epoch, dt=dt, losses=losses, 
            save_results=True, name=name)

23-Apr-24 19:11:43 [INFO] Training DSClassifierMultiQ


Optimization started
Processing epoch	84	0.0723	

KeyboardInterrupt: 

In [None]:




# print("Explaining instance: ")
# print(X_test[0])
# pred, cls, rls, builder = DSC.predict_explain(X_test[0])
# print(builder)
# print(rls)

23-Apr-24 18:20:16 [INFO] Training Time: 25.26s
23-Apr-24 18:20:16 [INFO] Epochs: 200
23-Apr-24 18:20:16 [INFO] Min Loss: 0.027


23-Apr-24 18:20:16 [INFO] Accuracy:  0.98
23-Apr-24 18:20:16 [INFO] F1 Score: 0.98
23-Apr-24 18:20:16 [INFO] Confusion Matrix: 
[[67  2]
 [ 1 77]]


In [None]:

X_train_df = data.iloc[:cut, :-1].values
y_train = data.iloc[:cut, -1].values
X_test = data.iloc[cut:, :-1].values
y_test = data.iloc[cut:, -1].values

Unnamed: 0,x,y,labels_kmeans
0,1.245085,-0.067630,0
1,0.519859,1.021516,1
2,0.384995,1.313368,1
3,1.233272,0.415007,0
4,1.109102,0.135470,0
...,...,...,...
485,1.233437,0.302323,0
486,1.263470,-0.236248,0
487,1.188081,0.165184,0
488,1.577180,0.358378,0


In [None]:
DSC.find_most_important_rules(threshold=0.1)

{0: [(0.03163551673105758,
   5,
   '0.234 < y < 0.527',
   0.17786375890286807,
   array([0.15833704, 0.04146155, 0.8002014 ], dtype=float32)),
  (0.028245294758107775,
   4,
   'y < 0.234',
   0.16806336530638608,
   array([0.14274216, 0.05513415, 0.80212367], dtype=float32)),
  (0.024575292036107754,
   1,
   '0.329 < x < 0.708',
   0.15676508551366836,
   array([0.1290192 , 0.06145862, 0.8095222 ], dtype=float32)),
  (0.024048811874058273,
   3,
   'x > 1.086',
   0.1550767934736151,
   array([0.12398298, 0.06998566, 0.80603135], dtype=float32)),
  (0.021501795279943003,
   2,
   '0.708 < x < 1.086',
   0.14663490471215576,
   array([0.11201493, 0.07993972, 0.80804527], dtype=float32)),
  (0.021418596140573953,
   7,
   'y > 0.819',
   0.14635093488110676,
   array([0.11235696, 0.07827288, 0.8093701 ], dtype=float32)),
  (0.012596876743142804,
   6,
   '0.527 < y < 0.819',
   0.11223580864921322,
   array([0.06448371, 0.13086608, 0.80465025], dtype=float32))],
 1: [(0.0405167148707

In [None]:
filter_by_rule(df_okay, lambda row: 0.5<row["x"]<1 and 0.5<row["y"]<1, only_plot=True)

In [None]:
px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=y_train)