In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score# Loading datafile
data= pd.read_csv('wdbc.csv')

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Loading datafile
data= pd.read_csv('wdbc.csv')

In [3]:
with open('wdbc_names.csv', 'r') as file:
    names = file.read()


In [4]:
# List of base features
base_features = [
    "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave points", "symmetry", "fractal dimension"
]

# Construct full column names list
data.columns = ["ID", "Diagnosis"] + \
               [f"{feature} (mean)" for feature in base_features] + \
               [f"{feature} (SE)" for feature in base_features] + \
               [f"{feature} (Worst)" for feature in base_features]


In [5]:
data.columns

Index(['ID', 'Diagnosis', 'radius (mean)', 'texture (mean)',
       'perimeter (mean)', 'area (mean)', 'smoothness (mean)',
       'compactness (mean)', 'concavity (mean)', 'concave points (mean)',
       'symmetry (mean)', 'fractal dimension (mean)', 'radius (SE)',
       'texture (SE)', 'perimeter (SE)', 'area (SE)', 'smoothness (SE)',
       'compactness (SE)', 'concavity (SE)', 'concave points (SE)',
       'symmetry (SE)', 'fractal dimension (SE)', 'radius (Worst)',
       'texture (Worst)', 'perimeter (Worst)', 'area (Worst)',
       'smoothness (Worst)', 'compactness (Worst)', 'concavity (Worst)',
       'concave points (Worst)', 'symmetry (Worst)',
       'fractal dimension (Worst)'],
      dtype='object')

In [6]:
#split features
# Drop non-numeric columns
features= data.drop(['ID','Diagnosis'], axis = 1)
true_labels = data['Diagnosis'].map({'B': 0, 'M': 1})  # Convert 'Diagnosis' to numeric labels

In [7]:
features.head()

Unnamed: 0,radius (mean),texture (mean),perimeter (mean),area (mean),smoothness (mean),compactness (mean),concavity (mean),concave points (mean),symmetry (mean),fractal dimension (mean),...,radius (Worst),texture (Worst),perimeter (Worst),area (Worst),smoothness (Worst),compactness (Worst),concavity (Worst),concave points (Worst),symmetry (Worst),fractal dimension (Worst)
0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [8]:
from sklearn.preprocessing import StandardScaler

# Scale and standardized the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)


In [9]:
from sklearn.cluster import KMeans

# Initialize the KMeans algorithm
kmeans = KMeans(n_clusters=2, init='k-means++', random_state=42)

# Fit the KMeans algorithm to the scaled data
kmeans.fit(scaled_features)

# Add the cluster labels to the original DataFrame
data['cluster'] = kmeans.labels_

In [10]:
# Display the cluster centers
cluster_centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=features.columns)

In [11]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from scipy.optimize import linear_sum_assignment 
def calculate_clustering_accuracy(true_labels, cluster_labels):
    contingency_matrix = pd.crosstab(true_labels, cluster_labels)
    row_ind, col_ind = linear_sum_assignment(-contingency_matrix.values)
    return contingency_matrix.values[row_ind, col_ind].sum() / len(true_labels)


In [12]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Initialize variables
init_method = 'random'
n_clusters = 2
n_init = 10

# Initialize KMeans
kmeans = KMeans(n_clusters=n_clusters, init=init_method, n_init=n_init, random_state=42)

# Fit the KMeans algorithm to the scaled data (assuming scaled_features is already defined)
kmeans.fit(scaled_features)

# Add the cluster labels to the original DataFrame (assuming 'data' is your DataFrame)
data['cluster'] = kmeans.labels_

# Retrieve centroids
centroids = kmeans.cluster_centers_

# Calculate accuracy (assuming 'true_labels' and 'calculate_clustering_accuracy' are defined)
accuracy = calculate_clustering_accuracy(true_labels, kmeans.labels_)

# Calculate ARI and NMI
ari = adjusted_rand_score(true_labels, kmeans.labels_)
nmi = normalized_mutual_info_score(true_labels, kmeans.labels_)

# Print the results
print(f"Initialization: {init_method}, Clusters: {n_clusters}, Init Attempts: {n_init}")
print(f"Clustering Accuracy: {accuracy}")
print(f"Centroids:\n{centroids}")
print(f"Adjusted Rand Index (ARI): {ari}")
print(f"Normalized Mutual Information (NMI): {nmi}")


Initialization: random, Clusters: 2, Init Attempts: 10
Clustering Accuracy: 0.9066901408450704
Centroids:
[[-0.48540642 -0.2425197  -0.5026054  -0.48022292 -0.30816009 -0.52716774
  -0.57968706 -0.58803674 -0.30617181 -0.14419503 -0.42567434 -0.01911206
  -0.42854955 -0.40001236 -0.02164743 -0.36763445 -0.33388796 -0.39652985
  -0.07067017 -0.23444793 -0.51936742 -0.25314083 -0.53378469 -0.50043199
  -0.31410694 -0.49170292 -0.5382171  -0.58138837 -0.30081656 -0.33478169]
 [ 0.93578351  0.46753798  0.9689403   0.92579057  0.59408182  1.01629245
   1.11754104  1.13363784  0.59024875  0.27798423  0.8206299   0.0368449
   0.82617284  0.77115785  0.04173268  0.70873858  0.64368091  0.76444415
   0.13624042  0.45197693  1.00125472  0.48801377  1.02904884  0.96475033
   0.60554637  0.94792213  1.0375938   1.12082088  0.57992471  0.64540388]]
Adjusted Rand Index (ARI): 0.6588181173986278
Normalized Mutual Information (NMI): 0.5371187064967642


In [13]:
import numpy as np

# Input data (replace this with actual patient data)
# Example input data (your specific patient data after removing non-numeric columns)
input_data = [17.99, 10.38, 122.8, 1001, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871, 
              1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003, 0.006193, 
              25.38, 17.33, 184.6, 2019, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189]

# Convert the input data into a NumPy array
input_data_as_numpy = np.asarray(input_data)

# Reshape the array for a single instance prediction
input_data_reshape = input_data_as_numpy.reshape(1, -1)

# Scale the input data (assuming 'scaler' is already fitted on training data)
input_data_scaled = scaler.transform(input_data_reshape)

# Make prediction using the trained clustering model
prediction = kmeans.predict(input_data_scaled)

# Map the cluster to a meaningful label (you should determine this based on your clusters)
if prediction[0] == 0:
    print('non cancerous')
else:
    print('cancerous')


cancerous


In [14]:
import pickle

In [15]:
# Assuming 'kmeans' is your trained KMeans model
#filename = 'trained_kmeans_model.sav'

# Save the trained KMeans model to a file
#pickle.dump(kmeans, open(filename, 'wb'))


In [16]:
import pickle

# Assuming 'kmeans' is your trained KMeans model and 'scaler' is your trained scaler (e.g., StandardScaler)
kmeans_filename = 'trained_kmeans_model.sav'
scaler_filename = 'scaler.sav'

# Save the trained KMeans model to a file
with open(kmeans_filename, 'wb') as kmeans_file:
    pickle.dump(kmeans, kmeans_file)

# Save the trained scaler to a file
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [17]:
# Load the saved scaler
scaler = pickle.load(open('scaler.sav', 'rb'))

# Load the saved KMeans model
kmeans = pickle.load(open('trained_kmeans_model.sav', 'rb'))

In [18]:
#loaded_model = pickle.load(open('trained_kmeans_model.sav', 'rb'))

In [19]:
# Input data (replace this with actual patient data)
# Example input data (your specific patient data after removing non-numeric columns)
input_data = [17.99, 10.38, 122.8, 1001, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871, 
              1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003, 0.006193, 
              25.38, 17.33, 184.6, 2019, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189]

# Convert the input data into a NumPy array
input_data_as_numpy = np.asarray(input_data)

# Reshape the array for a single instance prediction
input_data_reshape = input_data_as_numpy.reshape(1, -1)

# Scale the input data (assuming 'scaler' is already fitted on training data)
input_data_scaled = scaler.transform(input_data_reshape)

# Make prediction using the trained clustering model
prediction = kmeans.predict(input_data_scaled)

# Map the cluster to a meaningful label (you should determine this based on your clusters)
if prediction[0] == 0:
    print('non cancerous')
else:
    print('cancerous')


cancerous


In [20]:
for column in features.columns:
    print(column)

radius (mean)
texture (mean)
perimeter (mean)
area (mean)
smoothness (mean)
compactness (mean)
concavity (mean)
concave points (mean)
symmetry (mean)
fractal dimension (mean)
radius (SE)
texture (SE)
perimeter (SE)
area (SE)
smoothness (SE)
compactness (SE)
concavity (SE)
concave points (SE)
symmetry (SE)
fractal dimension (SE)
radius (Worst)
texture (Worst)
perimeter (Worst)
area (Worst)
smoothness (Worst)
compactness (Worst)
concavity (Worst)
concave points (Worst)
symmetry (Worst)
fractal dimension (Worst)
