In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import silhouette_score

# Load the CSV files
ptbxl_data = pd.read_csv("ptbxl_database.csv")
scp_statements = pd.read_csv("scp_statements.csv")

# Extract relevant features and labels
# (Adjust features based on your analysis and domain knowledge)
features = ptbxl_data[['age', 'sex', 'height', 'weight']]
label_mapping = {
    'NORM': 0,
    'MI': 1,
    'STTC': 2,
    'CD': 3,
    'HYP': 4
}
def extract_labels(codes):
    labels = []
    for code in codes:
        code_dict = eval(code)
        found_label = -1 # Representing 'Other' or unknown
        for key in code_dict.keys():
            if key in label_mapping:
                found_label = label_mapping[key]
                break
        labels.append(found_label)
    return labels

ptbxl_data['label'] = extract_labels(ptbxl_data['scp_codes'])
labels = ptbxl_data['label']

# Handle missing values
features = features.fillna(features.mean())

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

silhouette_scores = []
for n_clusters in range(2, 11):  # Adjust the range as needed
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_features)
    silhouette_avg = silhouette_score(scaled_features, cluster_labels)
    silhouette_scores.append(silhouette_avg)

optimal_clusters = np.argmax(silhouette_scores) + 2  # Adjust index for correct cluster number
print(f"Optimal number of clusters: {optimal_clusters}")

kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Optimal number of clusters: 6


  super()._check_params_vs_input(X, default_n_init=10)


In [39]:
silhouette_avg = silhouette_score(scaled_features, clusters)
print(f"Silhouette Coefficient: {silhouette_avg}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, labels, test_size=0.2, random_state=42
)

# Train KMeans on training data
kmeans_train = KMeans(n_clusters=optimal_clusters, random_state=42)
train_clusters = kmeans_train.fit_predict(X_train)

# Predict clusters for test data
test_clusters = kmeans_train.predict(X_test)

from collections import Counter
def map_clusters_to_labels(true_labels, clusters):
    mapping = {}
    for i in range(optimal_clusters): # Use optimal_clusters instead of n_clusters
        cluster_labels = true_labels[clusters == i]
        if cluster_labels.size > 0: # Check if the cluster has any labels
            most_common_label = Counter(cluster_labels).most_common(1)[0][0]
        else:
            most_common_label = -1 # Handle the case of empty cluster
        mapping[i] = most_common_label
    return mapping

cluster_to_label_map = map_clusters_to_labels(y_train, train_clusters)

predicted_labels = [cluster_to_label_map[c] for c in test_clusters]

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy}")


Silhouette Coefficient: 0.5302698327106786


  super()._check_params_vs_input(X, default_n_init=10)


Accuracy: 0.5697247706422018


# New Section

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import silhouette_score

# Load the CSV files
ptbxl_data = pd.read_csv("ptbxl_database.csv")
scp_statements = pd.read_csv("scp_statements.csv")

# Create DBSCAN object
dbscan = DBSCAN(eps=0.5, min_samples=5)  # Adjust eps and min_samples as needed

# Fit DBSCAN to the scaled features
clusters = dbscan.fit_predict(scaled_features)


In [20]:
silhouette_avg = silhouette_score(scaled_features, clusters)
print(f"Silhouette Coefficient: {silhouette_avg}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, labels, test_size=0.2, random_state=42
)

# Train DBSCAN on training data
dbscan_train = DBSCAN(eps=0.5, min_samples=5)  # Adjust parameters if needed
train_clusters = dbscan_train.fit_predict(X_train)

# Similar mapping function as before, but consider noise points (-1)
def map_clusters_to_labels(true_labels, clusters):
    mapping = {}
    for i in set(clusters):
        if i == -1:  # Noise points
            mapping[i] = -1
        else:
            most_common_label = Counter(true_labels[clusters == i]).most_common(1)[0][0]
            mapping[i] = most_common_label
    return mapping

cluster_to_label_map = map_clusters_to_labels(labels, clusters)
predicted_labels = [cluster_to_label_map[c] for c in clusters]

Silhouette Coefficient: 0.4218188314088124


In [23]:
# Handle potential new clusters in test data
for c in set(test_clusters):
    if c not in cluster_to_label_map:
        # Assign a default label or handle it based on your use case
        cluster_to_label_map[c] = -1  # For example, treat it as a noise point

predicted_labels = [cluster_to_label_map[c] for c in test_clusters]

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5559633027522936


# New Section

In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import silhouette_score

# Load the CSV files
ptbxl_data = pd.read_csv("ptbxl_database.csv")
scp_statements = pd.read_csv("scp_statements.csv")

In [31]:
features = ptbxl_data[['age', 'sex', 'height', 'weight']]  # Adjust features as needed
label_mapping = {'NORM': 0, 'MI': 1, 'STTC': 2, 'CD': 3, 'HYP': 4}

def extract_labels(codes):
    labels = [] # Start of the indented code block within the function
    for code_string in codes:
        diagnosis_codes = code_string.split(',')
        mapped_labels = [label_mapping.get(code.strip(), -1) for code in diagnosis_codes]
        # Use the most severe label (lower number is more severe)
        if mapped_labels:
            labels.append(min(mapped_labels))
        else:
            labels.append(-1)  # Handle cases with no diagnostic codes
    return labels # End of the indented code block within the function

ptbxl_data['label'] = extract_labels(ptbxl_data['scp_codes'])
labels = ptbxl_data['label']

In [32]:
features = features.fillna(features.mean())  # Handle missing values
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [34]:
# Create AgglomerativeClustering object
n_clusters = 5  # Adjust as needed
hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')  # Choose linkage method

# Fit hierarchical clustering to the scaled features
clusters = hierarchical.fit_predict(scaled_features)

silhouette_avg = silhouette_score(scaled_features, clusters)
print(f"Silhouette Coefficient: {silhouette_avg}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, labels, test_size=0.2, random_state=42
)

# Train hierarchical clustering on training data
hierarchical_train = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')  # Adjust parameters if needed
train_clusters = hierarchical_train.fit_predict(X_train)

# Predict clusters for test data
test_clusters = hierarchical_train.fit_predict(X_test)  # Note: fit_predict is used for prediction in hierarchical clustering


Silhouette Coefficient: 0.46744440684618693


In [35]:
from collections import Counter
def map_clusters_to_labels(true_labels, clusters):
    # Count the true labels within each cluster
    cluster_counts = {}
    for i, cluster in enumerate(clusters):
        if cluster not in cluster_counts:
            cluster_counts[cluster] = Counter()
        cluster_counts[cluster][true_labels[i]] += 1

    # Map each cluster to the most frequent true label within it
    cluster_to_label_map = {cluster: counts.most_common(1)[0][0]
                           for cluster, counts in cluster_counts.items()}
    return cluster_to_label_map

cluster_to_label_map = map_clusters_to_labels(labels, clusters)
predicted_labels = [cluster_to_label_map[c] for c in clusters]

In [36]:
# ... (Implement a mapping function similar to the one in your K-Means code)

predicted_labels = [cluster_to_label_map[c] for c in test_clusters]

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0
