In [None]:
%load_ext autoreload 
%autoreload 2 

In [3]:
import random
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec # for subplots
import pandas as pd
import numpy as np
import seaborn as sns
import seaborn.objects as so
import tensorflow as tf
import json

from src.data_processing.pipelines.LSTMPipe import LSTMPipe
from src.models.experimental_dropout_StackedLSTM import StackedLSTM




In [None]:
# Report how many GPU devices TensorFlow lists on this machine.
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

In [None]:
# Remote (/projects) locations of the dataset and of the saved model.
DATA_PATH = '/projects/p31961/gaby_data/aggregated_data/raw_data/datasets/raw_data_raw_data.parquet.gzip'
# MODEL_PATH has to be assigned: the `tf.keras.models.load_model(MODEL_PATH)` cell further down reads it.
# NOTE(review): this directory is named `dopamine_full_sequence_stacked_lstm_01`, while subjects.json is
# read from `full_sequence_stacked_lstm_01` -- confirm both point at the intended experiment.
MODEL_PATH = "/projects/p31961/ENIGMA/results/experiments/dopamine_full_sequence_stacked_lstm_01/models/dopamine_full_sequence_stacked_lstm_01"

# Local alternatives (uncomment both lines to work from a local copy).
# DATA_PATH = '/Users/mds8301/iterm_data_storage/raw_data_raw_data.parquet.gzip'
# MODEL_PATH = '/Users/mds8301/Development/enigma/results/experiments/full_sequence_stacked_lstm_01/models/full_sequence_stacked_lstm_01'

In [None]:
# Read the raw dataset, then keep only every `row_stride`-th row so the
# rest of the notebook works on a much smaller frame.
row_stride = 1000
processor = LSTMPipe(DATA_PATH)
processor.read_raw_data()
processor.raw_data = processor.raw_data[::row_stride]
processor.raw_data

In [None]:
# Load the saved subject assignment; the next cell reads its
# 'training', 'dev' and 'test' entries.
subjects_json_path = '/projects/p31961/ENIGMA/results/experiments/full_sequence_stacked_lstm_01/subjects.json'
with open(subjects_json_path, 'r') as subjects_file:
    subjects = json.load(subjects_file)
subjects

In [None]:
# Pull the three subject lists out of the loaded JSON.
training_subjects, dev_subjects, test_subjects = (
    subjects[split_key] for split_key in ('training', 'dev', 'test')
)

# One OR-ed `mouse_id==<id>` query string per split.
training_query, dev_query, test_query = (
    ' or '.join([f"mouse_id=={subject}" for subject in split_subjects])
    for split_subjects in (training_subjects, dev_subjects, test_subjects)
)

def split_by_subjects_query(subjects, data=None):
    """Select the rows belonging to `subjects` and split them into features and target.

    Parameters
    ----------
    subjects : iterable
        Mouse ids. Each one is interpolated into a ``mouse_id==<id>`` clause
        and the clauses are OR-ed together into a ``DataFrame.query`` expression.
    data : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``processor.raw_data``
        is used, which keeps the original one-argument call working unchanged.

    Returns
    -------
    tuple
        ``(x, y)``: the matching rows without the "signal" column, and the
        "signal" column of those same rows.

    Raises
    ------
    ValueError
        If `subjects` is empty (an empty query expression cannot be evaluated).
    """
    subjects = list(subjects)
    if not subjects:
        raise ValueError("subjects must contain at least one mouse id")
    if data is None:
        data = processor.raw_data
    query = ' or '.join(f"mouse_id=={subject}" for subject in subjects)
    # Filter once and derive both outputs from the same selection.
    selected = data.query(query)
    return selected.drop(columns="signal"), selected['signal']

# Attach the subject-wise train / dev / test splits to the pipeline object
# as X_<split> (features) and y_<split> (signal).
for split_name, split_subjects in (
    ("train", training_subjects),
    ("dev", dev_subjects),
    ("test", test_subjects),
):
    split_x, split_y = split_by_subjects_query(split_subjects)
    setattr(processor, f"X_{split_name}", split_x)
    setattr(processor, f"y_{split_name}", split_y)


In [None]:
# Run the pipeline's transform step on the splits attached above, then show
# the training features for inspection.
# NOTE(review): `transorm_data` is spelled as called here; presumably it matches
# the method name defined on LSTMPipe -- confirm.
processor.transorm_data()
processor.X_train

In [None]:
def lr_schedular(epoch, lr):
    """Learning-rate schedule: unchanged for the first 10 epochs, then ``lr * e / epoch``.

    Parameters
    ----------
    epoch : int
        Epoch index.
    lr : float
        Learning rate passed in for this epoch.

    Returns
    -------
    float
        The learning rate to use. A plain Python float is returned (rather than
        a TensorFlow tensor) because Keras' ``LearningRateScheduler`` documents
        a float return value for its schedule function.
    """
    if epoch < 10:
        return lr
    # NumPy keeps the arithmetic on the host, so the result converts cleanly to float.
    return float(lr * np.exp(1.) / epoch)
# Preview the schedule: evaluate it at a fixed base rate of 0.001 for
# epochs 1..399 and plot learning rate against epoch.
epochs = range(1, 400)
lrs = np.array([lr_schedular(epoch_index, 0.001) for epoch_index in epochs])
plt.plot(epochs, lrs)



In [None]:
# tf tensor
# Convert the training features and target to TensorFlow tensors.
X_train, y_train = map(
    tf.convert_to_tensor,
    (processor.X_train, processor.y_train),
)

In [None]:
# Size of the second axis of the training tensor; the model cell below passes
# the same axis of `processor.X_train` as `num_features`.
X_train.shape[1]

In [None]:
# Build the stacked LSTM: sequence length is the number of distinct `time`
# values, and the LSTM width is twice the feature count.
n_features = processor.X_train.shape[1]
model = StackedLSTM(
    sequence_length=processor.raw_data['time'].nunique(),
    num_features=n_features,
    lstm_units=n_features * 2,
)

# MSE is the training loss; the remaining entries are tracked as metrics only.
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae', 'mse', 'mape', 'cosine_similarity'],
)

# No learning-rate callback is attached in this run; a later cell repeats
# the fit with a LearningRateScheduler.
model.fit(
    processor.X_train,
    processor.y_train,
    epochs=5,
    validation_data=(processor.X_dev, processor.y_dev),
)


In [None]:
# Load a previously saved model from MODEL_PATH. This rebinds `model`, so the
# network fitted in the preceding training cell is no longer referenced.
# NOTE(review): MODEL_PATH must be assigned in the paths cell before this runs.
model = tf.keras.models.load_model(MODEL_PATH)

# model.compile(optimizer='adam', loss='mse', metrics=[
#         'mae', 'mse', 'mape', 'cosine_similarity'])

In [None]:
# Print the architecture of whichever model is currently bound to `model`.
model.summary()

In [None]:
# Predict the signal for the held-out test features.
predicted_signal = model.predict(processor.X_test)

In [None]:
# Re-select the untransformed test rows and put the true and predicted
# signal next to them as two new columns.
# NOTE(review): assumes `predicted_signal` lines up row-for-row with these
# rows -- confirm its shape and ordering.
x_test_raw, y_test_raw = split_by_subjects_query(test_subjects)
full_test_set = x_test_raw.assign(
    true_signal=y_test_raw,
    predicted_signal=predicted_signal,
)


In [None]:
# Display the assembled test frame (features + true and predicted signal).
full_test_set

In [None]:
# Count missing values per column, e.g. to spot misaligned assignments.
full_test_set.isnull().sum()

In [None]:
# Display the test frame again (same expression as two cells earlier).
full_test_set

In [None]:

# Build a second pipeline that reads the raw data in sorted order.
# `read_raw_data` is chained here, so it is assumed to return the pipeline object.
processor_pipe = (
    LSTMPipe(DATA_PATH)
    .read_raw_data(sort_by=['mouse_id', 'sensor', 'event', 'trial_count'])
)

# Subsample the pipeline's own frame before splitting. Previously the slice was
# written to `processor.raw_data`, so `processor_pipe` split the full frame and
# an unrelated object was overwritten.
# NOTE(review): the training cell that follows still reads from `processor`,
# not `processor_pipe` -- confirm which object is meant to feed the model.
processor_pipe.raw_data = processor_pipe.raw_data[::10000]

# Subject-wise split on `mouse_id`, stratified by `sex`, followed by the
# pipeline's transform step.
(
    processor_pipe
    .split_data(
        processed_data=False,
        test_size=0.3,
        test_dev_size=0.5,
        split_group="mouse_id",
        stratify_group="sex",
        target='signal',
        save_subject_ids=False,
    )
    .transorm_data()
)

In [None]:
from src.models.experimental_dropout_StackedLSTM import StackedLSTM
def lr_schedular(epoch, lr):
    """Learning-rate schedule: unchanged for the first 10 epochs, then scaled by ``exp(-0.1)``.

    This rebinds the `lr_schedular` name defined in the earlier
    schedule-preview cell.

    Parameters
    ----------
    epoch : int
        Epoch index.
    lr : float
        Learning rate passed in for this epoch.

    Returns
    -------
    float
        The learning rate to use. A plain Python float is returned (rather than
        a TensorFlow tensor) because Keras' ``LearningRateScheduler`` documents
        a float return value for its schedule function.
    """
    if epoch < 10:
        return lr
    return float(lr * np.exp(-0.1))


# Rebuild the stacked LSTM with the same sizing rule as before: sequence
# length = number of distinct `time` values, LSTM width = 2 x feature count.
n_features = processor.X_train.shape[1]
model = StackedLSTM(
    sequence_length=processor.raw_data['time'].nunique(),
    num_features=n_features,
    lstm_units=n_features * 2,
)

model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae', 'mse', 'mape', 'cosine_similarity'],
)

# Callback: apply `lr_schedular` at the start of every epoch and log the rate.
learning_rate_callback = tf.keras.callbacks.LearningRateScheduler(
    lr_schedular,
    verbose=1,
)

model.fit(
    processor.X_train,
    processor.y_train,
    epochs=5,
    validation_data=(processor.X_dev, processor.y_dev),
    callbacks=[learning_rate_callback],
)

# Score the fitted model on the held-out test split.
model.evaluate(processor.X_test, processor.y_test)





In [None]:
# Plot signal over time, one line per `signal_type`.
# NOTE(review): `avoid` is not assigned in any cell visible in this notebook;
# presumably it is the test-set counterpart of `train_avoid` built further
# down -- confirm and define it before this cell.
sns.lineplot(data = avoid, x = 'time', y = 'signal', hue = 'signal_type')
# sns.lineplot(data = query, x = 'time', y = 'predicted_signal')

In [None]:
# Grid of signal-vs-time plots: one row per `day`, one column per
# `learning_phase`, with true and predicted signal drawn in a fixed hue order.
facet = sns.FacetGrid(avoid, row='day', col="learning_phase")
facet.map_dataframe(
    sns.lineplot,
    x='time',
    y='signal',
    hue='signal_type',
    hue_order=["true_signal", "predicted_signal"],
)

In [None]:
# Restrict to rows flagged `mouse_id_3 == 1` with trial_count below 10.
# NOTE(review): relies on `avoid`, which no visible cell assigns.
mouse_3 = avoid.query("mouse_id_3==1 & trial_count < 10")
mouse_3

In [None]:
# Signal over time for trial 9 only, one line per `signal_type`.
sns.lineplot(data = mouse_3.query("trial_count==9"), x = 'time', y = 'signal', hue = 'signal_type')

In [None]:
# Evaluate the model and print each returned value next to its metric name.
# NOTE(review): `X_test` / `y_test` are not assigned in any visible cell (the
# splits live on `processor.X_test` / `processor.y_test`) -- confirm which
# inputs are intended.
evalulation = model.evaluate(X_test, y_test)
for name, value in zip(model.metrics_names, evalulation):
    print(f'{name}: {value}')

## Analysis on training set

In [None]:
def query_testing_subjects(subjects, df):
    """Return the rows of `df` where at least one of the given indicator columns equals 1.

    Parameters
    ----------
    subjects : iterable of str
        Column names; each contributes a ``<name> == 1`` clause, and the
        clauses are OR-ed together into a ``DataFrame.query`` expression.
    df : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` matching the combined expression.

    Raises
    ------
    ValueError
        If `subjects` is empty (an empty query expression cannot be evaluated).
    """
    subjects = list(subjects)
    if not subjects:
        raise ValueError("subjects must contain at least one column name")
    full_query = " or ".join(f"{mouse} == 1" for mouse in subjects)
    return df.query(full_query)

# Select the training subjects' rows and split them into features / target.
# NOTE(review): `subjects_by_category` and `train_processor` are not assigned in
# any visible cell -- confirm where they come from.
# NOTE(review): this rebinds `X_train` / `y_train`, which an earlier cell bound
# to TensorFlow tensors.
train_set = query_testing_subjects(subjects_by_category['training'], train_processor.data)
X_train, y_train = train_set.drop(columns = 'signal'), train_set['signal']

In [None]:
# Predict on the training features. This rebinds `predicted_signal`, which
# earlier held the test-set predictions.
predicted_signal = model.predict(X_train)

In [None]:
# Training features with the true and predicted signal added as columns.
full_train_set = X_train.assign(
    true_signal=y_train,
    predicted_signal=predicted_signal,
)

In [None]:
# Keep rows with action_avoid == 1 and event_cue == 1, then reshape to long
# form: one row per (observation, signal_type), where signal_type is either
# 'predicted_signal' or 'true_signal' and the value lands in `signal`.
# The id columns are taken from `full_train_set` itself (the frame being
# melted) rather than from `full_test_set`, so this cell no longer depends on
# the test frame having identical columns.
train_avoid = (
    full_train_set
    .query("action_avoid==1 & event_cue==1")
    .melt(
        id_vars=full_train_set.drop(columns=['predicted_signal', 'true_signal']).columns,
        value_vars=['predicted_signal', 'true_signal'],
        value_name="signal",
        var_name='signal_type',
    )
)
train_avoid

In [None]:
# Signal over time on the training rows, one line per `signal_type`.
sns.lineplot(data = train_avoid, x = 'time', y = 'signal', hue = 'signal_type')

In [None]:
# Training-set grid: one row per `day`, one column per `learning_phase`,
# with true and predicted signal drawn in a fixed hue order.
facet = sns.FacetGrid(train_avoid, row='day', col="learning_phase")
facet.map_dataframe(
    sns.lineplot,
    x='time',
    y='signal',
    hue='signal_type',
    hue_order=["true_signal", "predicted_signal"],
)

In [None]:
# Restrict to rows flagged `mouse_id_1 == 1` with trial_count below 20, then
# plot trial 17 only, one line per `signal_type`.
mouse_id_1_train  = train_avoid.query("mouse_id_1==1 & trial_count < 20")
sns.lineplot(data = mouse_id_1_train.query("trial_count==17"), x = 'time', y = 'signal', hue = 'signal_type')

# Summary
Given the training performance data, I think this model is still overfitting. It looks a little better, but validation loss is still increasing. I am going to expand on this and include a standard learning rate scheduler in experiment 3.
