In [1]:
# Imports
import pandas as pd
from tensorflow.keras import models, layers
import matplotlib.pyplot as plt
from lmfit.models import GaussianModel, VoigtModel, LinearModel, ConstantModel, SkewedGaussianModel
import numpy as np
from sklearn.preprocessing import StandardScaler
from statistics import mean, median
from scipy.stats import skewnorm, gamma
from tqdm import tqdm
import random

In [2]:
# Load the dataset
df = pd.read_csv('train_dataset_RAW.csv')

In [3]:
# Parse the dataset
cr = df['Cr']
clean_df=df.T
clean_df.drop(clean_df.tail(8).index, inplace=True)
clean_df.index = clean_df.index.to_series().astype(str).str.replace('X','',regex=True).astype(float)
clean_df.index = clean_df.index.astype(float)

In [4]:
clean_df.shape

(40002, 2100)

In [5]:
# For each target, generate spectra as random combinations of availible spectra
x = []
y = np.zeros(42)
for i in tqdm(range(42)):
    inflated_df = clean_df.loc[:,(i*50):(i*50+50)]
    for j in range(150):
        new_x = inflated_df.sample(n=30,axis='columns').sum(axis='columns').divide(30)
        inflated_df = pd.concat([inflated_df, new_x], axis='columns')
    x.append(inflated_df)
    y[i] = cr[i*50]
    

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [05:19<00:00,  7.60s/it]


In [6]:
# Select training and testing data
train_ids = np.random.choice(range(30), 300)
test_ids = np.random.choice(range(30,42), 300)


x_train = np.zeros([9000,40002]) # 300 by 40002*30
y_train = np.zeros(9000)

x_test = np.zeros([50,1200060]) # 50 by 40002*30
y_test = np.zeros(50)

for i in tqdm(range(300)):
    df_to_reformat = x[train_ids[i]]
    #randomly select 30 columns and append them as 1 row to x_train[i]
    a = df_to_reformat.sample(n=30, axis='columns').to_numpy().flatten()
    x_train[i,:] = a
    y_train[i] = y[train_ids[i]]
    
for i in tqdm(range(50)):
    df_to_reformat = x[test_ids[i]]
    #randomly select 30 columns and append them as 1 row to x_train[i]
    a = df_to_reformat.sample(n=30, axis='columns').to_numpy().flatten()
    x_test[i,:] = a
    y_test[i] = y[test_ids[i]]

    

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:06<00:00, 44.98it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 18.95it/s]


In [7]:
np.shape(x_train), np.shape(y_train), np.shape(x_test), np.shape(y_test)
np.save('multiplied_data_x_train',x_train)
np.save('multiplied_data_y_train',y_train)
np.save('multiplied_data_x_test',x_test)
np.save('multiplied_data_y_test',y_test)

In [8]:
np.shape(x_train), np.shape(y_train), np.shape(x_test), np.shape(y_test)

((300, 1200060), (300,), (50, 1200060), (50,))

In [9]:
# Scaling
scaler = StandardScaler()
scaler.fit(X=x_train,y=y_train)
scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)
#scaled_y = scaler.transform(y_train.T)

In [10]:
# Model architecture
model = models.Sequential(
    [
        #layers.Dropout(0.2, input_shape=(1200060,))
        layers.Conv1D(10,100, activation='relu'),
        layers.MaxPooling1D(),
        layers.Flatten(),
        layers.Dense(50, activation="relu"),

    ]
)

In [11]:
model.compile(loss='MSE', optimizer='adam', metrics=['mse','mae'])
history = model.fit(x_train,y_train, epochs=1000, batch_size=10)

Epoch 1/1000


ValueError: in user code:

    File "C:\Users\tdvorak\Desktop\bakalarka_lul\.venv\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\tdvorak\Desktop\bakalarka_lul\.venv\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\tdvorak\Desktop\bakalarka_lul\.venv\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\tdvorak\Desktop\bakalarka_lul\.venv\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\tdvorak\Desktop\bakalarka_lul\.venv\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\tdvorak\Desktop\bakalarka_lul\.venv\lib\site-packages\keras\engine\input_spec.py", line 250, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer "sequential" "                 f"(type Sequential).
    
    Input 0 of layer "conv1d" is incompatible with the layer: expected min_ndim=3, found ndim=2. Full shape received: (10, 1200060)
    
    Call arguments received by layer "sequential" "                 f"(type Sequential):
      • inputs=tf.Tensor(shape=(10, 1200060), dtype=float32)
      • training=True
      • mask=None


In [None]:
y_pred = model.predict(x_test)
y_pred.shape

In [None]:
plt.scatter(x=range(50), y=y_test, marker='o')
plt.scatter(x=range(50), y=y_pred, marker='*')

In [None]:
np.shape(x_test), np.shape(y_test)

In [None]:
mse = ((y_test - y_pred)**2).mean()
mse

In [None]:
model.save('18_10_data_multiplier_first_decent_try.h5')