# In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

# Constants
LUMINANCE_RANGE = (0, 120)  # Luminescence span used for binning and x-axis limits; adjusted based on the actual data range
N_CLUSTERS = 5  # Number of clusters requested from AgglomerativeClustering; adjust as needed

# Step 1: Load data
data = pd.read_csv('classification_results.csv')

# Convert 'Correct Prediction' to boolean.
# Series.map leaves NaN for any label that is not exactly 'Yes' or 'No'; a NaN
# in the mask would make the boolean indexing below fail with an opaque error,
# so report the offending labels explicitly instead.
_prediction_flags = data['Correct Prediction'].map({'Yes': True, 'No': False})
if _prediction_flags.isna().any():
    _unexpected = sorted(
        data.loc[_prediction_flags.isna(), 'Correct Prediction'].astype(str).unique()
    )
    raise ValueError(
        "'Correct Prediction' must contain only 'Yes' or 'No'; "
        f"found unexpected value(s): {_unexpected}"
    )
# Force a real bool dtype so that masking and `~` behave even for an empty file.
data['Correct Prediction'] = _prediction_flags.astype(bool)

# Split the samples by whether the classifier got them right.
correct = data[data['Correct Prediction']]
incorrect = data[~data['Correct Prediction']]

# Step 2: Agglomerative Clustering
# Only the misclassified samples are clustered, using their (standardized)
# luminescence as the single feature.
X = incorrect[['Luminescence']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize and fit Agglomerative Clustering
ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
# `incorrect` was produced by boolean-mask indexing of `data`; adding a column
# to such a slice triggers pandas' SettingWithCopyWarning. Take an explicit
# copy so the new column is unambiguously written to an independent frame.
incorrect = incorrect.copy()
incorrect['cluster'] = ac.fit_predict(X_scaled)

print(f'Number of clusters: {N_CLUSTERS}')

# Calculate cluster centers, ranges, and misclassification rates
# (one entry per cluster id, in cluster-id order).
cluster_centers = []
cluster_ranges = []
cluster_misclassification_rates = []

all_luminescence = data['Luminescence']  # loop-invariant, looked up once

for cluster_id in range(N_CLUSTERS):
    cluster_data = incorrect[incorrect['cluster'] == cluster_id]
    cluster_luminescence = cluster_data['Luminescence']
    cluster_center = cluster_luminescence.mean()
    cluster_range = (cluster_luminescence.min(), cluster_luminescence.max())

    # Rate = misclassified samples in this cluster divided by ALL samples
    # (correct or not) whose luminescence lies inside the cluster's
    # inclusive [min, max] span.
    in_range = (all_luminescence >= cluster_range[0]) & (all_luminescence <= cluster_range[1])
    total_in_range = int(in_range.sum())
    misclassified_in_range = len(cluster_data)
    misclassification_rate = misclassified_in_range / total_in_range if total_in_range > 0 else 0

    cluster_centers.append(cluster_center)
    cluster_ranges.append(cluster_range)
    cluster_misclassification_rates.append(misclassification_rate)

# Agglomerative Clustering plot
# Top row (y = 0): misclassified samples coloured by cluster.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=incorrect, x='Luminescence', y=[0]*len(incorrect), hue='cluster', palette='viridis')
plt.title('Luminescence Clusters (Agglomerative Clustering)')

# Bottom row (y = -0.1): every sample, marked by whether it was classified correctly.
sns.scatterplot(data=correct, x='Luminescence', y=[-0.1]*len(correct), color='green', marker='o', label='Correctly Classified')
sns.scatterplot(data=incorrect, x='Luminescence', y=[-0.1]*len(incorrect), color='red', marker='x', label='Misclassified')

# Calculate positions for text annotations and avoid overlap:
# boxes are staggered vertically in order of increasing cluster center.
cluster_centers_sorted = sorted(cluster_centers)
y_positions = np.linspace(0.2, 1.2, len(cluster_centers_sorted))

plt.ylim(-0.5, 2)

# Cluster ids returned by the clustering are NOT ordered by center, so walk
# the clusters in center order and anchor each box at that cluster's own
# center. (Pairing cluster id i with the i-th sorted center would put the
# statistics of one cluster above a different cluster.)
clusters_by_center = sorted(range(N_CLUSTERS), key=lambda cid: cluster_centers[cid])

for rank, cluster_id in enumerate(clusters_by_center):
    cluster_data = incorrect[incorrect['cluster'] == cluster_id]
    cluster_center = cluster_centers[cluster_id]

    # Counts over the cluster's inclusive [min, max] luminescence span.
    range_low = cluster_data['Luminescence'].min()
    range_high = cluster_data['Luminescence'].max()
    misclassified_count = len(cluster_data)
    total_count = len(data[(data['Luminescence'] >= range_low) & (data['Luminescence'] <= range_high)])
    correctly_classified_count = total_count - misclassified_count
    misclassification_rate = misclassified_count / total_count if total_count > 0 else 0

    y_position = y_positions[rank]

    plt.text(
        x=cluster_center,
        y=y_position,
        s=(f"Cluster {cluster_id}\n"
           f"Misclassification Rate: {misclassification_rate:.2f}\n"
           f"Correctly Classified: {correctly_classified_count}\n"
           f"Misclassified: {misclassified_count}"),
        horizontalalignment='center',
        fontsize=10,
        bbox=dict(facecolor='white', alpha=0.8)
    )

plt.legend()
plt.show()

# Function to create binned analysis
def create_binned_analysis(bin_size, correct_values=None, incorrect_values=None, value_range=None):
    """Bin luminescence values and compute the misclassification rate per bin.

    Args:
        bin_size: Width of each bin; must be positive.
        correct_values: Luminescence values of the correctly classified
            samples. Defaults to ``correct['Luminescence']`` from module scope.
        incorrect_values: Luminescence values of the misclassified samples.
            Defaults to ``incorrect['Luminescence']`` from module scope.
        value_range: ``(lower, upper)`` span the bins must cover. Defaults to
            the module-level ``LUMINANCE_RANGE``.

    Returns:
        A tuple ``(bin_centers, correct_counts, incorrect_counts,
        total_counts, misclassification_rates)`` of equal-length arrays, one
        entry per bin. The rate is 0.0 for bins that contain no samples.
        Values outside the bin edges are ignored by ``np.histogram``.

    Raises:
        ValueError: If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError(f"bin_size must be positive, got {bin_size!r}")

    # Fall back to the script's module-level data when no explicit inputs are given.
    if correct_values is None:
        correct_values = correct['Luminescence']
    if incorrect_values is None:
        incorrect_values = incorrect['Luminescence']
    if value_range is None:
        value_range = LUMINANCE_RANGE
    lower, upper = value_range

    # Build the edges from an explicit bin count rather than np.arange with a
    # step: arange can gain or lose an edge through floating-point rounding
    # when bin_size is fractional. The last edge may exceed `upper` when
    # bin_size does not divide the span, so the whole span is always covered.
    n_bins = max(1, int(np.ceil(np.round((upper - lower) / bin_size, 9))))
    bins = lower + bin_size * np.arange(n_bins + 1)
    bin_centers = (bins[:-1] + bins[1:]) / 2

    correct_counts, _ = np.histogram(correct_values, bins=bins)
    incorrect_counts, _ = np.histogram(incorrect_values, bins=bins)
    total_counts = correct_counts + incorrect_counts

    # Divide only where a bin is non-empty; empty bins keep the 0.0 from `out`.
    misclassification_rates = np.divide(
        incorrect_counts,
        total_counts,
        out=np.zeros_like(incorrect_counts, dtype=float),
        where=total_counts != 0,
    )

    return bin_centers, correct_counts, incorrect_counts, total_counts, misclassification_rates

# Create plots for different bin sizes: one stacked-bar panel per bin size,
# with the per-bin misclassification rate overlaid on a secondary y-axis.
bin_sizes = [5, 7, 10]
fig, axes = plt.subplots(len(bin_sizes), 1, figsize=(14, 6*len(bin_sizes)), sharex=True)
# plt.subplots returns a bare Axes (not an array) when a single panel is
# requested; normalize so indexing works for any number of bin sizes.
axes = np.atleast_1d(axes)
fig.suptitle('Luminescence Analysis with Different Bin Sizes', fontsize=16)

for i, bin_size in enumerate(bin_sizes):
    bin_centers, correct_counts, incorrect_counts, total_counts, misclassification_rates = create_binned_analysis(bin_size)

    ax = axes[i]
    bar_width = bin_size  # bars span the full bin so adjacent bins touch
    ax.bar(bin_centers, correct_counts, width=bar_width, color='blue', label='Correctly Classified')
    # Stack the misclassified counts on top of the correctly classified ones.
    ax.bar(bin_centers, incorrect_counts, width=bar_width, bottom=correct_counts, color='red', label='Misclassified')

    # Secondary y-axis for the rate, which lives on a 0..1 scale.
    ax2 = ax.twinx()
    ax2.plot(bin_centers, misclassification_rates, color='green', marker='o', linestyle='-', label='Misclassification Rate')

    ax.set_xlabel('Luminescence')
    ax.set_ylabel('Number of Images')
    ax2.set_ylabel('Misclassification Rate')
    ax.set_title(f'Bin Size: {bin_size}')
    ax.legend(loc='upper left')
    ax2.legend(loc='upper right')
    ax.grid(True)
    ax.set_xlim(LUMINANCE_RANGE)
    ax2.set_ylim(0, 1)

plt.tight_layout()
plt.show()

# Print cluster statistics: one row per cluster id with its size, luminescence
# range, mean luminescence and misclassification rate.
print("\nCluster Statistics:")
print(f"{'Cluster':<10} {'Size':<10} {'Range':<30} {'Center':<15} {'Misclassification Rate':<25}")
for i, (center, range_, rate) in enumerate(zip(cluster_centers, cluster_ranges, cluster_misclassification_rates)):
    cluster_size = len(incorrect[incorrect['cluster'] == i])
    # Format the bounds explicitly: str() of a tuple uses each element's repr,
    # which for NumPy scalars under NumPy 2 is e.g. "np.float64(12.5)" and
    # overflows the 30-character column.
    range_text = f"({float(range_[0]):.2f}, {float(range_[1]):.2f})"
    print(f"{i:<10} {cluster_size:<10} {range_text:<30} {center:<15.2f} {rate:.2f}")

# Output of the cell above when seaborn is not installed:
# ModuleNotFoundError: No module named 'seaborn'