In [1]:
import numpy as np
import tensorflow as tf
import random

In [2]:
structures = np.loadtxt("structures_120k.txt")

In [3]:
spectra = np.loadtxt("spectra_120k.dat")
spectra = spectra.reshape(120000,61,4)

In [4]:
spectra.shape

(120000, 61, 4)

In [5]:
def euclidean_distance(a,b):
    return np.linalg.norm(a-b)

In [6]:
def greedy_selection(data, num_samples, start_index):
    selected_indices = [start_index]
    remaining_indices = list(range(len(data)))
    remaining_indices.remove(start_index)

    while len(selected_indices) < num_samples:
        last_selected = data[selected_indices[-1]]
        distances = np.array([euclidean_distance(last_selected, data[i]) for i in remaining_indices])
        max_distance_index = np.argmax(distances)
        selected_indices.append(remaining_indices[max_distance_index])
        del remaining_indices[max_distance_index]

    return np.array(selected_indices)


In [7]:
random_start = True  # Set to True for a random starting index, False for deterministic
if random_start:
    starting_index = random.randint(0, len(structures) - 1)
else:
    starting_index = 0  # Fixed starting index for deterministic results


In [8]:
print(starting_index)

111177


In [9]:
selected_indices = greedy_selection(structures, 100, starting_index)
selected_structures = structures[selected_indices]
selected_spectra = spectra[selected_indices]


In [10]:
selected_indices

array([111177,  99590, 108710,  43098,  48672,  66575,  60457,  12572,
        59791, 107711,  64713, 100208,  15981,   3664,  19060,  47878,
        96328, 102870,  44404,  42554,   5204,  58447,     81,  18983,
        26704,  35494, 115378,  84950,  18303,  12939,  16346,    756,
        51677,  23050,    309,  33419,  96497,  24848,   5074, 118123,
        77125,  43911, 102095,  33423,   3177,  90700, 119076,  72688,
        98256,  75945,    542,  46827,  99819,  80103,  92616,  93492,
        65463,  12898, 102587,  16670,  60281,  69339,  88151,  98438,
        22547,  41572,  82479,  75427,   2022,  35271,  11303,  77183,
        57934, 100403,  23263,  48666,   6308,   1864,  14538,   9225,
        51685,  27764,  94698,  81446,  47166,  32923,  91622,  50194,
       117670,  23900,  27065, 116669,  50556,  69467,  77065,  16961,
        78360,  60615,  53490,  20026])

In [11]:
model = tf.keras.models.load_model("finalmodel_15_10_2024.h5")

In [12]:
predicted_spectra = model.predict(selected_structures)



In [13]:
mse = tf.keras.losses.MeanSquaredError()
loss = mse(selected_spectra,predicted_spectra).numpy()

In [14]:
print(f"The loss for 10000 greedy sampling data from 120k data is {loss}")

The loss for 10000 greedy sampling data from 120k data is 0.5109357833862305
