In [42]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [43]:
# Load the mall-customer data. Printing the raw header makes it easy to confirm
# the exact column names that the rename below relies on.
df = pd.read_csv("./Mall_Customers.csv")
print(df.columns)

# Give the two clustering features identifier-friendly names
# (equivalent to R's rename).
df = df.rename(columns={
    'Annual Income (k$)': 'Annual_Income',
    'Spending Score (1-100)': 'Spending_score'
})

# Drop rows with any missing value (equivalent to R's na.omit).
df = df.dropna()

# Identify the numeric columns; these are the ones that get standardized.
numeric_columns = df.select_dtypes(include=[np.number]).columns
print(numeric_columns)

# Standardize on a copy so the unscaled values stay available in `df`.
df1 = df.copy()

# NOTE(review): `scaler` is created but never fitted or applied in this cell.
# scikit-learn's StandardScaler divides by the population standard deviation
# (ddof=0); the manual formula below uses the sample standard deviation
# (ddof=1) so the result matches the R standardization.
scaler = StandardScaler()
df1[numeric_columns] = df1[numeric_columns].apply(lambda x: (x - x.mean()) / x.std(ddof=1))

# Select the two clustering features by NAME rather than by position
# (R's columns 4 and 5). A positional slice would silently pick different
# columns if the CSV layout changed; a name lookup raises KeyError instead.
feature_columns = ['Annual_Income', 'Spending_score']
df1_selected = df1.loc[:, feature_columns]

# Display first few rows
print("First 6 rows of scaled data:")
print(df1_selected.head(6))
print()

Index(['CustomerID', 'Genre', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='object')
Index(['CustomerID', 'Age', 'Annual_Income', 'Spending_score'], dtype='object')
First 6 rows of scaled data:
   Annual_Income  Spending_score
0      -1.734646       -0.433713
1      -1.734646        1.192711
2      -1.696572       -1.711618
3      -1.696572        1.037814
4      -1.658498       -0.394989
5      -1.658498        0.999089



In [44]:
# Seed NumPy's legacy global generator (the R script used set.seed(1)).
np.random.seed(1)

# Model settings: five clusters, 25 random restarts (R's nstart=25) and a
# fixed random_state so the fit is repeatable.
kmeans_settings = dict(n_clusters=5, n_init=25, random_state=1)
kmeans = KMeans(**kmeans_settings)

# Fit on the two standardized features; `fit` holds the value returned by
# the estimator's fit() call and is what the reporting cells read from.
fit = kmeans.fit(df1_selected)

In [55]:
# Text report of the fitted model (counterpart of printing the fit in R).
# The header section is assembled as a list of lines and emitted in one call;
# the trailing empty string yields the blank separator line.
header_lines = [
    "K-Means Clustering Results:",
    "=" * 40,
    f"Number of clusters: {fit.n_clusters}",
    f"Algorithm used: {kmeans.algorithm}",
    f"Number of iterations: {fit.n_iter_}",
    f"Within-cluster sum of squares: {fit.inertia_:.4f}",
    "",
]
print("\n".join(header_lines))

# One line per cluster center: first coordinate is Annual_Income, second is
# Spending_score (both in standardized units).
print("Cluster Centers:")
print("           Annual_Income  Spending_score")
for cluster_idx, center_row in enumerate(fit.cluster_centers_):
    income_center = center_row[0]
    spending_center = center_row[1]
    print(f"Cluster {cluster_idx}: {income_center:8.4f}    {spending_center:8.4f}")
print()

# Label assigned to each observation, in the row order of the training data.
cluster_labels = fit.labels_

# Tally how many observations received each label.
print("Cluster Sizes:")
unique, counts = np.unique(cluster_labels, return_counts=True)
for label, size in zip(unique, counts):
    print(f"Cluster {label}: {size} observations")
print()

# The only sum-of-squares figure reported here is the total within-cluster
# value (inertia_), i.e. the same number shown in the header section.
print("Available sum of squares:")
print(f"Total within-cluster sum of squares: {fit.inertia_:.4f}")

K-Means Clustering Results:
Number of clusters: 5
Algorithm used: lloyd
Number of iterations: 4
Within-cluster sum of squares: 65.2406

Cluster Centers:
           Annual_Income  Spending_score
Cluster 0:   1.0524     -1.2812
Cluster 1:   0.9891      1.2364
Cluster 2:  -0.2004     -0.0264
Cluster 3:  -1.3262      1.1293
Cluster 4:  -1.3042     -1.1341

Cluster Sizes:
Cluster 0: 35 observations
Cluster 1: 39 observations
Cluster 2: 81 observations
Cluster 3: 22 observations
Cluster 4: 23 observations

Available sum of squares:
Total within-cluster sum of squares: 65.2406


In [56]:
# Show the cluster label predicted for every customer, in row order.
# The labels are printed exactly as scikit-learn returns them (0-based);
# add 1 if R-style 1-based cluster numbers are needed.
print('Cluster assignment for each customer subject: ')
predicted_labels = kmeans.predict(df1_selected)
print(predicted_labels)

Cluster assignment for each customer subject: 
[4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4
 3 4 3 4 3 4 2 4 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 1 0 1 2 1 0 1 0 1 2 1 0 1 0 1 0 1 0 1 2 1 0 1 0 1
 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1]


In [47]:
# Print the names the fitted estimator reports for its *output* features.
# As the captured output shows, these are one generated name per cluster
# ('kmeans0' ... 'kmeans4'), not the input column names.
print(kmeans.get_feature_names_out())

['kmeans0' 'kmeans1' 'kmeans2' 'kmeans3' 'kmeans4']


In [48]:
# Print the estimator's configuration as a dict, recording the settings
# (algorithm, init, n_init, random_state, ...) this clustering was run with.
print(kmeans.get_params())

{'algorithm': 'lloyd', 'copy_x': True, 'init': 'k-means++', 'max_iter': 300, 'n_clusters': 5, 'n_init': 25, 'random_state': 1, 'tol': 0.0001, 'verbose': 0}


In [5]:
# Cluster labels as a whitespace-separated string, one label per customer.
# NOTE(review): this looks like the printed prediction output pasted in by
# hand; it must be refreshed manually if the clustering is re-run.
l = '4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 2 4 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 0 1 2 1 0 1 0 1 2 1 0 1 0 1 0 1 0 1 2 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1'

# Tokenize on whitespace, then convert every token to an int.
l_tolist = l.split()
cluster_assignments = [int(token) for token in l_tolist]

# Echo the parsed labels and their count as a sanity check.
print(cluster_assignments)
print(len(cluster_assignments))

[4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
200


In [17]:
import numpy as np
import pandas as pd
from datetime import datetime

# Build the subject-level records in an ADaM BDS-style layout: one row per
# subject carrying that subject's k-means cluster assignment.
n_subjects = len(cluster_assignments)

# Every record shares one analysis date, stored as a plain date object.
analysis_date = datetime.strptime("2025-06-23", "%Y-%m-%d").date()

# 1-based running number, used for both the subject id and ASEQ.
subject_numbers = list(range(1, n_subjects + 1))

data = {
    "USUBJID": [f"SUBJ{number:03d}" for number in subject_numbers],
    "PARAMCD": ["CLUS" for _ in subject_numbers],
    "PARAM": ["K-Means Cluster Assignment" for _ in subject_numbers],
    # Numeric result: the cluster label itself.
    "AVAL": cluster_assignments,
    # Character result: the same label rendered as text.
    "AVALC": [f"Cluster {label}" for label in cluster_assignments],
    "ADT": [analysis_date for _ in subject_numbers],
    "ASEQ": subject_numbers,
    "ANL01FL": ["Y" for _ in subject_numbers],
}

adam_bds_df = pd.DataFrame(data)
print(adam_bds_df.head())

# Sanity check: number of labels the frame was built from.
print(len(cluster_assignments))


   USUBJID PARAMCD                       PARAM  AVAL      AVALC         ADT  \
0  SUBJ001    CLUS  K-Means Cluster Assignment     4  Cluster 4  2025-06-23   
1  SUBJ002    CLUS  K-Means Cluster Assignment     3  Cluster 3  2025-06-23   
2  SUBJ003    CLUS  K-Means Cluster Assignment     4  Cluster 4  2025-06-23   
3  SUBJ004    CLUS  K-Means Cluster Assignment     3  Cluster 3  2025-06-23   
4  SUBJ005    CLUS  K-Means Cluster Assignment     4  Cluster 4  2025-06-23   

   ASEQ ANL01FL  
0     1       Y  
1     2       Y  
2     3       Y  
3     4       Y  
4     5       Y  
200


In [19]:
import pandas as pd
from datetime import datetime

# Cluster-level summary: number of customers in each cluster and the cluster
# center in standardized units.
# NOTE(review): these figures appear to be transcribed by hand from the printed
# model summary; re-check them whenever the clustering is re-run.
cluster_info = {
    0: {"size": 35, "Annual_Income": 1.0524, "Spending_score": -1.2812},
    1: {"size": 39, "Annual_Income": 0.9891, "Spending_score": 1.2364},
    2: {"size": 81, "Annual_Income": -0.2004, "Spending_score": -0.0264},
    3: {"size": 22, "Annual_Income": -1.3262, "Spending_score": 1.1293},
    4: {"size": 23, "Annual_Income": -1.3042, "Spending_score": -1.1341},
}

# Total within-cluster sum of squares (same hand-transcribed source).
total_wcss = 65.2406

# Analysis date as a plain date, not a datetime. The subject-level records
# store ADT as a date too; mixing in datetimes would make the combined CSV
# contain both "2025-06-23" and "2025-06-23 00:00:00" in the ADT column.
adt = datetime.strptime("2025-06-23", "%Y-%m-%d").date()

# (PARAMCD, PARAM, key into cluster_info[...]) for each per-cluster record,
# in the order the records are emitted.
# NOTE(review): "CENINCOME" is 9 characters; ADaM conventionally limits PARAMCD
# to 8. Left unchanged here because downstream consumers may key on it.
cluster_params = [
    ("CLSIZE", "Cluster Size", "size"),
    ("CENINCOME", "Cluster Center: Annual Income", "Annual_Income"),
    ("CENSPEND", "Cluster Center: Spending Score", "Spending_score"),
]

# Collect the records as dicts and build the DataFrame once at the end.
rows = []
aseq = 1  # running sequence number across all records in this dataset

# Three records per cluster: its size and the two center coordinates.
for cluster_id, stats in cluster_info.items():
    usubjid = f"CLUST{cluster_id}"
    for paramcd, param, stat_key in cluster_params:
        rows.append({
            "USUBJID": usubjid,
            "PARAMCD": paramcd,
            "PARAM": param,
            "AVAL": stats[stat_key],
            "AVALC": "",
            "ADT": adt,
            "ASEQ": aseq,
            "ANL01FL": "Y"
        })
        aseq += 1

# One overall record: total within-cluster sum of squares.
rows.append({
    "USUBJID": "SUMMARY",
    "PARAMCD": "TOTWCSS",
    "PARAM": "Total Within-Cluster SS",
    "AVAL": total_wcss,
    "AVALC": "",
    "ADT": adt,
    "ASEQ": aseq,
    "ANL01FL": "Y"
})
aseq += 1

# Create DataFrame
bds_df = pd.DataFrame(rows)
print(bds_df.head(50))


    USUBJID    PARAMCD                           PARAM     AVAL AVALC  \
0    CLUST0     CLSIZE                    Cluster Size  35.0000         
1    CLUST0  CENINCOME   Cluster Center: Annual Income   1.0524         
2    CLUST0   CENSPEND  Cluster Center: Spending Score  -1.2812         
3    CLUST1     CLSIZE                    Cluster Size  39.0000         
4    CLUST1  CENINCOME   Cluster Center: Annual Income   0.9891         
5    CLUST1   CENSPEND  Cluster Center: Spending Score   1.2364         
6    CLUST2     CLSIZE                    Cluster Size  81.0000         
7    CLUST2  CENINCOME   Cluster Center: Annual Income  -0.2004         
8    CLUST2   CENSPEND  Cluster Center: Spending Score  -0.0264         
9    CLUST3     CLSIZE                    Cluster Size  22.0000         
10   CLUST3  CENINCOME   Cluster Center: Annual Income  -1.3262         
11   CLUST3   CENSPEND  Cluster Center: Spending Score   1.1293         
12   CLUST4     CLSIZE                    Cluster S

In [21]:
# Stack the subject-level and cluster-level records into one long dataset.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two inputs repeat, which makes label-based lookups
# on `df` ambiguous.
df = pd.concat([adam_bds_df, bds_df], ignore_index=True)
# Save to CSV; the index is not written, so the file holds only data columns.
df.to_csv("python_kmeans_bds.csv", index=False)
print("CSV file 'python_kmeans_bds.csv' created.")

CSV file 'python_kmeans_bds.csv' created.


In [4]:
import pandas as pd

# Load the exported dataset and turn every record into a plain Python list
# with explicit types, ready for JSON serialization.
df = pd.read_csv("./python_kmeans_bds.csv")

rows = []
for _, row in df.iterrows():
    aval = row["AVAL"]
    avalc = row["AVALC"]
    rows.append([
        str(row["USUBJID"]),
        str(row["PARAMCD"]),
        str(row["PARAM"]),
        # Missing numeric / character results are stored as None.
        None if pd.isna(aval) else float(aval),
        None if pd.isna(avalc) else str(avalc),
        str(row["ADT"]),
        int(row["ASEQ"]),
        str(row["ANL01FL"])
    ])

# Print the converted records, one per line.
for record in rows:
    print(record)

['SUBJ001', 'CLUS', 'K-Means Cluster Assignment', 4.0, 'Cluster 4', '2025-06-23', 1, 'Y']
['SUBJ002', 'CLUS', 'K-Means Cluster Assignment', 3.0, 'Cluster 3', '2025-06-23', 2, 'Y']
['SUBJ003', 'CLUS', 'K-Means Cluster Assignment', 4.0, 'Cluster 4', '2025-06-23', 3, 'Y']
['SUBJ004', 'CLUS', 'K-Means Cluster Assignment', 3.0, 'Cluster 3', '2025-06-23', 4, 'Y']
['SUBJ005', 'CLUS', 'K-Means Cluster Assignment', 4.0, 'Cluster 4', '2025-06-23', 5, 'Y']
['SUBJ006', 'CLUS', 'K-Means Cluster Assignment', 3.0, 'Cluster 3', '2025-06-23', 6, 'Y']
['SUBJ007', 'CLUS', 'K-Means Cluster Assignment', 4.0, 'Cluster 4', '2025-06-23', 7, 'Y']
['SUBJ008', 'CLUS', 'K-Means Cluster Assignment', 3.0, 'Cluster 3', '2025-06-23', 8, 'Y']
['SUBJ009', 'CLUS', 'K-Means Cluster Assignment', 4.0, 'Cluster 4', '2025-06-23', 9, 'Y']
['SUBJ010', 'CLUS', 'K-Means Cluster Assignment', 3.0, 'Cluster 3', '2025-06-23', 10, 'Y']
['SUBJ011', 'CLUS', 'K-Means Cluster Assignment', 4.0, 'Cluster 4', '2025-06-23', 11, 'Y']
['SUBJ01

In [6]:
import json
# Serialize the record lists as pretty-printed JSON (4-space indent) and
# write the text to disk.
with open("python_kmeans_bds_2.json", "w") as out_file:
    out_file.write(json.dumps(rows, indent=4))