In [4]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the cluster assignment file: tab-separated, no header row, two columns
# that are labelled PDB_ID and Cluster on load.
# The clustering name is stored in `cluster_method` (not `type`) so the builtin
# `type()` is not shadowed for the rest of the kernel session.
cluster_method = 'kmeans'
df = pd.read_csv(
    f'pdbbind_cluster_{cluster_method}.txt',
    sep='\t',
    header=None,
    names=['PDB_ID', 'Cluster'],
)

# Calculate basic statistics
total_samples = len(df)
unique_clusters = df['Cluster'].nunique()
# One entry per cluster label holding its row count, ordered largest-first
cluster_sizes = df['Cluster'].value_counts()

print(f"Total samples: {total_samples}")
print(f"Number of clusters: {unique_clusters}")
print(f"Largest cluster size: {cluster_sizes.max()}")
print(f"Smallest cluster size: {cluster_sizes.min()}")
print(f"Average cluster size: {cluster_sizes.mean():.2f}")
print(f"Median cluster size: {cluster_sizes.median()}")

# Plot histogram of cluster size distribution.
# An explicit figure/axes pair is used so that saving and closing act on this
# figure only, regardless of any other figure that may be open in the kernel.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(cluster_sizes, bins=50)
ax.set_title(f'Cluster Size Distribution with {cluster_method} Clustering')
ax.set_xlabel('Cluster Size')
ax.set_ylabel('Frequency')
fig.savefig(f'{cluster_method}_cluster_size_distribution.png')
# The figure is written to disk rather than shown inline; close it to free memory
plt.close(fig)

# Show the ten biggest clusters; head(10) takes the first ten entries of
# cluster_sizes, which value_counts() has already ordered largest-first.
print("\nTop 10 largest clusters:")
print(cluster_sizes.head(10))

# Singletons: clusters that contain exactly one sample
is_singleton = cluster_sizes == 1
single_sample_clusters = int(is_singleton.sum())
print(f"\nNumber of single-sample clusters: {single_sample_clusters}")

# Tally how many clusters fall into each inclusive [start, end] size bucket;
# the final bucket has no upper bound.
size_ranges = [(1, 1), (2, 5), (6, 10), (11, 20), (21, 50), (51, 100), (101, float('inf'))]
range_counts = {
    f"{start}-{end}": int(cluster_sizes.between(start, end).sum())
    for start, end in size_ranges
}

print("\nNumber of clusters in different size ranges:")
for range_name, count in range_counts.items():
    print(f"{range_name}: {count}")


Total samples: 19443
Number of clusters: 1000
Largest cluster size: 151
Smallest cluster size: 3
Average cluster size: 19.44
Median cluster size: 16.0

Top 10 largest clusters:
Cluster
10     151
499    104
94      82
289     81
180     81
24      79
139     58
714     58
112     56
176     56
Name: count, dtype: int64

Number of single-sample clusters: 0

Number of clusters in different size ranges:
1-1: 0
2-5: 18
6-10: 195
11-20: 449
21-50: 321
51-100: 15
101-inf: 2
