In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the CSV files (Assuming you've already downloaded them)
ptbxl_data = pd.read_csv("ptbxl_database.csv")
scp_statements = pd.read_csv("scp_statements.csv")

# Extract relevant features and labels
# (Adjust features based on your analysis and domain knowledge)
features = ptbxl_data[['age', 'sex', 'height', 'weight']]
label_mapping = {
    'NORM': 0,
    'MI': 1,
    'STTC': 2,
    'CD': 3,
    'HYP': 4
}

def extract_labels(codes):
    labels = []
    for code in codes:
        code_dict = eval(code)
        found_label = -1 # Representing 'Other' or unknown
        for key in code_dict.keys():
            if key in label_mapping:
                found_label = label_mapping[key]
                break
        labels.append(found_label)
    return labels

ptbxl_data['label'] = extract_labels(ptbxl_data['scp_codes'])
labels = ptbxl_data['label']

# Handle missing values
features = features.fillna(features.mean())

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [None]:
# Determine optimal number of clusters (using elbow method or other techniques)
# For this example, let's assume 5 clusters (matching our label types)
n_clusters = 5

# Apply K-Means clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, labels, test_size=0.2, random_state=42
)

# Train K-Means on training data
kmeans_train = KMeans(n_clusters=n_clusters, random_state=42)
train_clusters = kmeans_train.fit_predict(X_train)

# Predict clusters for test data
test_clusters = kmeans_train.predict(X_test)

# Map cluster labels to original labels (this is a simplification)
# You might need more sophisticated mapping based on cluster characteristics
from collections import Counter
def map_clusters_to_labels(true_labels, clusters):
    mapping = {}
    for i in range(n_clusters):
        most_common_label = Counter(true_labels[clusters == i]).most_common(1)[0][0]
        mapping[i] = most_common_label
    return mapping

cluster_to_label_map = map_clusters_to_labels(y_train, train_clusters)
predicted_labels = [cluster_to_label_map[c] for c in test_clusters]

# Calculate accuracy (this is a pseudo-accuracy for demonstration)
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Pseudo-Accuracy: {accuracy}")

  super()._check_params_vs_input(X, default_n_init=10)


Pseudo-Accuracy: 0.5685779816513762


In [None]:
from sklearn.cluster import DBSCAN

# Experiment with eps and min_samples parameters
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(scaled_features)

In [None]:
# Similar mapping function as before, but consider noise points (-1)
def map_clusters_to_labels(true_labels, clusters):
    mapping = {}
    for i in set(clusters):
        if i == -1:  # Noise points
            mapping[i] = -1
        else:
            most_common_label = Counter(true_labels[clusters == i]).most_common(1)[0][0]
            mapping[i] = most_common_label
    return mapping

cluster_to_label_map = map_clusters_to_labels(labels, clusters)
predicted_labels = [cluster_to_label_map[c] for c in clusters]

In [None]:
# Assuming you've split data and fit DBSCAN on training data
test_clusters = dbscan.fit_predict(X_test)

#Handle unknown clusters
predicted_test_labels = [cluster_to_label_map.get(c, -1) for c in test_clusters] # Map clusters to labels, handling unknown clusters

accuracy = accuracy_score(y_test, predicted_test_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5559633027522936


# New Section

In [None]:
import pandas as pd
ptbxl_data = pd.read_csv("ptbxl_database.csv")
scp_statements = pd.read_csv("scp_statements.csv")

In [None]:
features = ptbxl_data[['age', 'sex', 'height', 'weight']]  # Adjust features as needed
label_mapping = {'NORM': 0, 'MI': 1, 'STTC': 2, 'CD': 3, 'HYP': 4}

def extract_labels(codes):
    labels = [] # Start of the indented code block within the function
    for code_string in codes:
        diagnosis_codes = code_string.split(',')
        mapped_labels = [label_mapping.get(code.strip(), -1) for code in diagnosis_codes]
        # Use the most severe label (lower number is more severe)
        if mapped_labels:
            labels.append(min(mapped_labels))
        else:
            labels.append(-1)  # Handle cases with no diagnostic codes
    return labels # End of the indented code block within the function

ptbxl_data['label'] = extract_labels(ptbxl_data['scp_codes'])
labels = ptbxl_data['label']

In [None]:
features = features.fillna(features.mean())  # Handle missing values
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [None]:
from sklearn.cluster import AgglomerativeClustering
n_clusters = 5  # Match the number of disease labels
clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
clusters = clustering.fit_predict(scaled_features)

In [None]:
from collections import Counter
def map_clusters_to_labels(true_labels, clusters):
    # Count the true labels within each cluster
    cluster_counts = {}
    for i, cluster in enumerate(clusters):
        if cluster not in cluster_counts:
            cluster_counts[cluster] = Counter()
        cluster_counts[cluster][true_labels[i]] += 1

    # Map each cluster to the most frequent true label within it
    cluster_to_label_map = {cluster: counts.most_common(1)[0][0]
                           for cluster, counts in cluster_counts.items()}
    return cluster_to_label_map

cluster_to_label_map = map_clusters_to_labels(labels, clusters)
predicted_labels = [cluster_to_label_map[c] for c in clusters]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(scaled_features, labels, test_size=0.2, random_state=42)

# Fit clustering on training data and predict on test data
# ...

accuracy = accuracy_score(y_test, predicted_test_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9979357798165137


# New Section

In [None]:
import pandas as pd
ptbxl_data = pd.read_csv("ptbxl_database.csv")
scp_statements = pd.read_csv("scp_statements.csv")

In [None]:
features = ptbxl_data[['age', 'sex', 'height', 'weight']]  # Adjust features as needed
label_mapping = {'NORM': 0, 'MI': 1, 'STTC': 2, 'CD': 3, 'HYP': 4}

def extract_labels(codes):
    # BEGIN_SOLUTION
    labels = []
    for code_string in codes:
        extracted_labels = []
        for code in code_string.split(','):
            if code in label_mapping:
                extracted_labels.append(label_mapping[code])
        # Assuming you want to take the first label if multiple are present
        labels.append(extracted_labels[0] if extracted_labels else -1)
    return labels
    # END_SOLUTION

ptbxl_data['label'] = extract_labels(ptbxl_data['scp_codes'])
labels = ptbxl_data['label']

In [None]:
features = features.fillna(features.mean())  # Handle missing values
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [1]:
from sklearn.decomposition import PCA
n_components = 2  # Adjust as needed
pca = PCA(n_components=n_components)
pca_features = pca.fit_transform(scaled_features)

NameError: name 'scaled_features' is not defined