In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
# Load the structure table from a plain-text file into a 2-D float array
# (the recorded output of the next cell shows 120000 rows x 8 columns).
structures = np.loadtxt("structures_120k.txt")

In [3]:
# Display the loaded array's dimensions as a quick sanity check.
structures.shape

(120000, 8)

In [4]:
# Load the spectra from a plain-text file; the array is still in its on-disk
# layout here and is reshaped to (samples, 61, 4) in the next cell.
spectra = np.loadtxt("spectra_120k.dat")

In [5]:
# Fold the flat spectra into (samples, 61, 4). Using -1 lets numpy infer the sample
# count instead of hard-coding 120000, so the cell also works for other file sizes.
# NOTE(review): the meaning of the 61 and 4 axes is not visible here - TODO confirm.
spectra = spectra.reshape(-1, 61, 4)
# The hard-coded count used to act as an implicit size check; keep that check explicit.
assert spectra.shape[0] == structures.shape[0], "spectra and structures row counts differ"

In [6]:
# Display the reshaped array's dimensions as a quick sanity check.
spectra.shape

(120000, 61, 4)

In [35]:
def plot_elbow_method(data, max_clusters=500, step=1):
    """Plot KMeans inertia against the number of clusters (elbow method).

    One KMeans model (``random_state=42``) is fitted for every candidate k in
    ``range(2, max_clusters + 1, step)`` and the resulting inertia values are
    drawn as a line plot so the "elbow" can be read off by eye.

    Args:
        data: Samples to cluster, one row per sample; passed to ``KMeans.fit``.
        max_clusters: Largest number of clusters to try (inclusive).
        step: Stride between candidate cluster counts. The default of 1 tries
            every k; a larger stride reduces the number of full KMeans fits,
            which is the dominant cost of this function.

    Raises:
        ValueError: If ``step`` is smaller than 1, or if the largest candidate
            k exceeds the number of samples. KMeans would reject that k anyway,
            but only after all the smaller fits had already been paid for.
    """
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")

    cluster_counts = range(2, max_clusters + 1, step)

    # Fail fast: check feasibility of the whole sweep before the first fit.
    n_samples = np.shape(data)[0]
    if cluster_counts and cluster_counts[-1] > n_samples:
        raise ValueError(
            f"n_samples={n_samples} should be >= n_clusters={cluster_counts[-1]}"
        )

    # KMeans.fit returns the fitted estimator, so inertia_ can be read directly.
    inertia = [
        KMeans(n_clusters=k, random_state=42).fit(data).inertia_
        for k in cluster_counts
    ]

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.plot(cluster_counts, inertia, marker='o')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')
    ax.set_title('Elbow Method for Optimal Clusters')
    ax.grid(True)
    plt.show()



In [36]:
# NOTE(review): with max_clusters=500 this fits one KMeans model for every k in
# 2..500 (499 fits) on the full `structures` array. The recorded run below ended in
# a KeyboardInterrupt, so no elbow plot was produced by this cell.
plot_elbow_method(structures, max_clusters=500)  # Adjust max_clusters as needed


KeyboardInterrupt: 

In [26]:
# The cluster count is typed in by hand, so the value behind the recorded loss is
# not stored anywhere in the notebook; a non-integer reply raises ValueError in int().
# NOTE(review): consider a named constant so Restart & Run All works unattended.
optimal_clusters = int(input("Enter the optimal number of clusters (from the elbow plot): "))


In [27]:
# Cluster the structures into `optimal_clusters` groups; random_state=42 seeds the
# estimator's random number generation.
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
kmeans.fit(structures)
# Cluster id assigned to each row of `structures` by the fit above.
labels = kmeans.labels_


In [28]:
def select_equal_points_from_clusters(data, labels, spectra, points_per_cluster):
    """Pick up to ``points_per_cluster`` sample indices from every cluster.

    Clusters are visited in increasing label order (0 .. max label) and, within
    a cluster, the members with the lowest indices are taken. A cluster with
    fewer than ``points_per_cluster`` members contributes all of its members,
    so the result can hold fewer than ``n_clusters * points_per_cluster`` indices.

    Args:
        data: Unused; kept so existing call sites keep working.
        labels: 1-D array-like of integer cluster ids, one per sample.
        spectra: Unused; kept so existing call sites keep working.
        points_per_cluster: Maximum number of indices to take from each cluster.

    Returns:
        1-D integer ndarray of selected sample indices. It is always an integer
        array, even when empty, so it can be used directly for indexing.

    Raises:
        ValueError: If ``points_per_cluster`` is negative (a negative slice bound
            would silently drop the tail of each cluster instead of limiting it).
    """
    if points_per_cluster < 0:
        raise ValueError(
            f"points_per_cluster must be >= 0, got {points_per_cluster}"
        )

    labels = np.asarray(labels)
    if labels.size == 0:
        # np.max would raise on an empty array; there is simply nothing to select.
        return np.empty(0, dtype=np.intp)

    per_cluster = [
        np.where(labels == cluster)[0][:points_per_cluster]
        for cluster in range(int(labels.max()) + 1)
    ]
    if not per_cluster:
        # Only negative labels present: no cluster id in 0..max to iterate over.
        return np.empty(0, dtype=np.intp)

    # Concatenating the integer index arrays keeps an integer dtype even when
    # every slice is empty (np.array([]) would have produced float64).
    return np.concatenate(per_cluster)


In [29]:
# Aim for about 10000 samples split evenly across clusters. Floor division means the
# total falls short of 10000 when optimal_clusters does not divide it, and
# optimal_clusters == 0 raises ZeroDivisionError.
points_per_cluster = 10000 // optimal_clusters
# `structures` and `spectra` are passed along, but the helper only reads `labels`
# and `points_per_cluster`.
selected_indices = select_equal_points_from_clusters(structures, labels, spectra, points_per_cluster)


In [30]:
# Index both arrays with the same row indices.
# NOTE(review): assumes row i of `structures` corresponds to row i of `spectra` - TODO confirm.
selected_structures = structures[selected_indices]
selected_spectra = spectra[selected_indices]


In [31]:
# Load a previously saved Keras model from disk; nothing is trained in this notebook.
model = tf.keras.models.load_model("finalmodel_15_10_2024.h5")


In [32]:
# Run the loaded model on the sampled structures only.
# NOTE(review): the prediction's shape is not visible here; the loss computation
# assumes it is compatible with selected_spectra - TODO confirm.
predicted_spectra = model.predict(selected_structures)




In [33]:
# Mean squared error between the reference spectra (y_true) and the model output
# (y_pred); .numpy() converts the resulting tensor to a plain NumPy value.
mse = tf.keras.losses.MeanSquaredError()
loss = mse(selected_spectra, predicted_spectra).numpy()


In [34]:
# NOTE(review): the message hard-codes "10000" and "120k"; the number of points
# actually evaluated is len(selected_indices), which can be smaller - confirm before quoting.
print(f"The loss for 10000 clustering sampling data from 120k data is {loss}")

The loss for 10000 clustering sampling data from 120k data is 0.5076581239700317
