In [26]:
import os 
import numpy as np
import pandas as pd

import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from config import OUTLIER_THRESHOLD_NUM_STD

In [27]:
def detect_outliers_z_score(data, threshold=OUTLIER_THRESHOLD_NUM_STD):
    """Return the values of ``data`` whose absolute z-score exceeds ``threshold``.

    The z-score of a value is ``(value - mean) / std``, with the mean and the
    standard deviation (``np.std``, i.e. ``ddof=0``) taken over all of ``data``.

    Parameters
    ----------
    data : one-dimensional sequence of numbers
        For example a list, a NumPy array or a pandas Series.
    threshold : number, optional
        Number of standard deviations beyond which a value counts as an
        outlier. Defaults to ``OUTLIER_THRESHOLD_NUM_STD`` from ``config``.

    Returns
    -------
    list
        The outlying elements of ``data`` in their original order. The list is
        empty when ``data`` is empty or has no spread (standard deviation of
        zero or not finite), since no value can then be called an outlier.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        # Nothing to score; avoids NumPy's "mean of empty slice" warning.
        return []

    mean = np.mean(data)
    std_dev = np.std(data)
    if not np.isfinite(std_dev) or std_dev == 0:
        # Constant (or non-finite) data: every z-score would be 0/0 or x/0.
        # Return early instead of relying on NaN comparisons being False.
        return []

    is_outlier = np.abs((values - mean) / std_dev) > threshold
    # Iterate over `data` itself so the returned elements are exactly the
    # objects the caller supplied (same types as a plain loop would yield).
    return [value for value, flagged in zip(data, is_outlier) if flagged]

In [16]:
# Input CSVs are read from DATA_FOLDER; the clustered copies are written to
# OUTPUT_FOLDER. Both are relative to the notebook's working directory.
DATA_FOLDER = os.path.join("..", "data")
OUTPUT_FOLDER = os.path.join("..", "data_binary")

# File names (inside DATA_FOLDER) of the datasets to process, in order.
DATASETS = [
    "gaussian_df.csv",
    "rectangle_df.csv",
    "uniform_df.csv",
    "wine.csv",
    "breast-cancer-wisconsin.csv",
]

In [32]:
# Create the destination up front so that to_csv cannot fail on a missing folder.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

for path in DATASETS:
    print(path)
    df = pd.read_csv(os.path.join(DATA_FOLDER, path))

    # The last column is the label; every column before it is used as a feature.
    last_column = df.columns[-1]
    df_not_label = df.iloc[:, :-1]
    data_labels = df[last_column]

    assert data_labels.nunique() == 2, "Data is not binary"

    # NOTE(review): the features are clustered exactly as read (no scaling).
    # The recorded inertia values differ by many orders of magnitude between
    # datasets, which suggests a few large-magnitude columns may dominate the
    # distances for some files -- TODO confirm whether scaling is wanted.
    kmeans = KMeans(n_clusters=2, random_state=42, n_init="auto")
    kmeans_labels = kmeans.fit_predict(df_not_label)

    silhouette = silhouette_score(df_not_label, kmeans_labels)
    inertia = kmeans.inertia_
    print(f"Silhouette score for {path} is {silhouette}")
    print(f"Inertia for {path} is {inertia}")
    print()
    df["labels_kmeans"] = kmeans_labels

    # Euclidean distance from every point to every centroid
    # (shape: n_samples x n_clusters); keep the distance to the nearest one.
    distances_to_centers = np.linalg.norm(
        df_not_label.values[:, np.newaxis] - kmeans.cluster_centers_, axis=2)
    df["distance_to_centroid"] = distances_to_centers.min(axis=1)

    # Flag points whose distance is a z-score outlier over the whole dataset
    # (both clusters together). isin() does the membership test in one
    # vectorised pass instead of scanning the outlier list once per row.
    outliers = detect_outliers_z_score(df["distance_to_centroid"])
    df["outlier"] = df["distance_to_centroid"].isin(outliers)

    df_okay = df[~df["outlier"]]

    # Min-max normalise the distance within each cluster using only the
    # non-outlier rows. The assignment aligns on the index, so outlier rows
    # get NaN. A cluster whose kept distances are all equal also yields NaN
    # (0/0).
    df["distance_to_centroid_norm"] = (
        df_okay.groupby("labels_kmeans")["distance_to_centroid"].transform(
            lambda x: (x - x.min()) / (x.max() - x.min()))
    )

    # Drop the helper columns and move "labels_kmeans" to the last position.
    cols_to_drop = ["distance_to_centroid", "outlier"]
    kept_cols = [col for col in df.columns
                 if col != "labels_kmeans" and col not in cols_to_drop]
    df = df[kept_cols + ["labels_kmeans"]]

    df.to_csv(os.path.join(OUTPUT_FOLDER, f"kmeans_{path}"), index=False)
    


gaussian_df.csv
Silhouette score for gaussian_df.csv is 0.6435277436707083
Inertia for gaussian_df.csv is 62.05997560557416

rectangle_df.csv
Silhouette score for rectangle_df.csv is 0.439638954527144
Inertia for rectangle_df.csv is 501.1251039938792

uniform_df.csv


Silhouette score for uniform_df.csv is 0.6092631731974764
Inertia for uniform_df.csv is 1056.128462713776

wine.csv
Silhouette score for wine.csv is 0.5091099728827927
Inertia for wine.csv is 8595922.912948519

breast-cancer-wisconsin.csv
Silhouette score for breast-cancer-wisconsin.csv is 0.9681880608975368
Inertia for breast-cancer-wisconsin.csv is 71344784701932.75

