In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# For optional 3D plotting
from mpl_toolkits.mplot3d import Axes3D

import os   
from typing import Final

import evoml_client as ec
from evoml_client.trial_conf_models import BudgetMode

import evoml_client as ec
from dotenv import load_dotenv

In [5]:
def initialise_client(base_url: str) -> None:
    """Log the evoML client in against ``base_url``.

    Calls ``load_dotenv()`` and then reads the ``USER_NAME`` and ``PASSWORD``
    environment variables, forwarding them to ``ec.init``.

    Args:
        base_url: Address of the evoML deployment, passed through to
            ``ec.init`` unchanged.

    Raises:
        RuntimeError: If ``USER_NAME`` or ``PASSWORD`` is unset or empty.
            Failing here gives a clear message instead of handing ``None``
            credentials to ``ec.init``.
    """
    load_dotenv()

    username = os.getenv("USER_NAME")
    password = os.getenv("PASSWORD")

    # Collect every missing variable so the error names all of them at once.
    missing = [
        name
        for name, value in (("USER_NAME", username), ("PASSWORD", password))
        if not value
    ]
    if missing:
        raise RuntimeError(
            "Missing evoML credentials: set "
            + ", ".join(missing)
            + " in the environment or in a .env file."
        )

    ec.init(username=username, password=password, base_url=base_url)

In [6]:
# Address of the evoML deployment used by every client call in this notebook.
# The value ends with a trailing "/": strip it before appending a path that
# itself starts with "/", otherwise the resulting URL contains "//".
BASE_URL = "http://192.168.58.242/"

# Log in with the USER_NAME / PASSWORD values that initialise_client reads
# from the environment.
initialise_client(BASE_URL)

In [3]:
###############################################################################
# 1. Load synthetic data
###############################################################################
# The CSV location can be overridden with the SYNTHETIC_CREDIT_DATA_CSV
# environment variable so the notebook runs on machines other than the
# author's. When the variable is not set, the original path is used, so
# existing behaviour is unchanged.
synthetic_csv_path = os.getenv(
    "SYNTHETIC_CREDIT_DATA_CSV",
    "/home/manal/Workspace/evoml-usecases/src/synthetic_credit_data.csv",
)
df_synthetic = pd.read_csv(synthetic_csv_path)

###############################################################################
# 2. (Optionally) drop the binary targets if you only want pure unsupervised
#    segmentation. If you plan to interpret them, you can keep them.
###############################################################################
# errors="ignore" keeps this step working when either target column is absent.
df_for_clustering = df_synthetic.drop(
    columns=["HighIncome", "HighBalance"],
    errors="ignore"
)

# Identify categorical and numeric columns. Adjust names to match your data.
# Every column that is not listed as categorical is treated as numeric.
cat_cols = ["Gender", "Student", "Married", "Ethnicity"]
numeric_cols = [col for col in df_for_clustering.columns if col not in cat_cols]

###############################################################################
# 3. Turn the categorical columns into indicator columns
###############################################################################
# drop_first=True omits one level per category.
df_encoded = pd.get_dummies(
    df_for_clustering,
    columns=cat_cols,
    drop_first=True,
)

###############################################################################
# 4. Standardise every feature before clustering
###############################################################################
scaler = StandardScaler()
scaler.fit(df_encoded)
X_scaled = scaler.transform(df_encoded)

###############################################################################
# 5. Fit K-means on the standardised matrix
###############################################################################
k = 5  # Example: 5 clusters
# random_state is fixed so repeated runs give the same clustering.
kmeans = KMeans(n_clusters=k, random_state=92).fit(X_scaled)

###############################################################################
# 6. Get cluster labels. Optionally assign risk labels
###############################################################################
cluster_labels = kmeans.labels_

# NOTE(review): K-means numbers its clusters arbitrarily, so the names below
# are attached to ids 0..4 rather than to a ranking derived from the data.
# Inspect the per-cluster means before reading them as ordered risk levels.
risk_mapping = {
    0: "Low Risk",
    1: "Moderate Risk",
    2: "Medium Risk",
    3: "High Risk",
    4: "Very High Risk"
}
# Give every fitted cluster a name. Without this, raising the number of
# clusters above five makes the lookup below fail with KeyError.
for cluster_id in range(kmeans.n_clusters):
    risk_mapping.setdefault(cluster_id, f"Cluster {cluster_id}")

risk_labels = [risk_mapping[label] for label in cluster_labels]

# Attach cluster info to both the encoded DataFrame and the unencoded one.
# After this point df_encoded is no longer a pure feature matrix.
df_encoded["Cluster"] = cluster_labels
df_encoded["RiskLabel"] = risk_labels

df_for_clustering["Cluster"] = cluster_labels
df_for_clustering["RiskLabel"] = risk_labels


###############################################################################
# 7. Per-cluster profile in the original (unscaled) feature space
###############################################################################
# Average of each numeric column within every cluster.
cluster_means = df_for_clustering.groupby("Cluster")[numeric_cols].mean()
# Swap the integer cluster ids in the index for their risk labels.
cluster_means = cluster_means.rename(index=risk_mapping)

print("=== Cluster Means (Unscaled) ===", cluster_means, "", sep="\n")

# Number of rows that landed in each cluster, keyed by risk label instead of
# the numeric cluster id.
cluster_sizes = df_for_clustering["Cluster"].value_counts()
cluster_sizes.index = cluster_sizes.index.map(risk_mapping)

print("=== Cluster Sizes ===", cluster_sizes, "", sep="\n")

=== Cluster Means (Unscaled) ===
                      Limit      Rating     Cards        Age  Education
Cluster                                                                
Low Risk        5024.360137  375.907120  2.606999  30.850242  13.990420
Moderate Risk   3569.523380  275.913741  2.641659  30.838607  13.545058
Medium Risk     2891.097692  231.637729  2.669526  31.000817  13.688975
High Risk       6163.716952  450.767984  2.578358  30.878985  13.141159
Very High Risk  7552.433875  543.270641  2.670866  31.009080  12.921658

=== Cluster Sizes ===
Cluster
Medium Risk       31827
High Risk         26162
Moderate Risk     17888
Very High Risk    14207
Low Risk           9916
Name: count, dtype: int64



In [8]:

# Register the full synthetic frame (binary target columns included) with the
# evoML client under the name "synthetic data".
dataset = ec.Dataset.from_pandas(df_synthetic, name="synthetic data")
# NOTE(review): put() presumably uploads and wait() presumably blocks until
# the platform has processed the dataset; confirm against evoml_client docs.
dataset.put()
dataset.wait()

# BASE_URL may end with "/"; strip it so the link does not contain "//".
print(f"Dataset URL: {BASE_URL.rstrip('/')}/platform/datasets/view/{dataset.dataset_id}")

Dataset URL: http://192.168.58.242//platform/datasets/view/67c9b6bc508e1a68559616a9


In [9]:
# Show the loaded frame as the cell result (Jupyter renders a truncated
# head/tail view rather than every row).
df_synthetic

Unnamed: 0,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,HighBalance,HighIncome
0,3436.998892,272,2,35,9,Male,No,No,African American,True,True
1,3648.266365,264,3,29,15,Female,No,No,African American,True,True
2,2216.075540,163,2,37,11,Female,Yes,Yes,Caucasian,True,True
3,2342.170273,202,3,45,12,Male,No,Yes,African American,True,True
4,2299.162771,211,3,28,13,Female,No,Yes,African American,True,False
...,...,...,...,...,...,...,...,...,...,...,...
99995,6204.508693,476,4,28,15,Male,No,Yes,African American,True,True
99996,4167.398463,291,2,25,16,Female,No,Yes,African American,True,False
99997,5788.455796,421,3,34,16,Male,No,Yes,African American,True,True
99998,4344.742920,331,4,28,12,Male,Yes,Yes,Caucasian,True,True


In [None]:
# Start from evoML's default trial configuration for a classification task,
# using the "fast" budget mode and ROC AUC as the loss function, bound to the
# dataset uploaded above.
config = ec.TrialConfig.with_default(
    task=ec.MlTask.classification,
    budget_mode=BudgetMode.fast,
    loss_funcs=["ROC AUC"],
    dataset_id=dataset.dataset_id,
)

# Optional: try disabling hyperparameter tuning for faster trial execution
# config.enableBudgetTuning = False

# NOTE(review): `target_column` is not assigned in any cell above, so this
# cell raises NameError on a fresh kernel. Define it before this call; the
# binary columns dropped in the clustering cell ("HighIncome", "HighBalance")
# look like the candidate targets. Confirm which one is intended.
# The second element of the returned pair is discarded.
trial, _ = ec.Trial.from_dataset_id(
    dataset.dataset_id,
    target_col=target_column,
    trial_name="synthetic data",
    config=config,
)

# NOTE(review): the timeout is presumably in seconds (15 minutes); confirm
# against the evoml_client documentation.
trial.run(timeout=900)