In [51]:
%load_ext autoreload 
%autoreload 2 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
import random
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec # for subplots
import pandas as pd
import numpy as np
import seaborn as sns
import seaborn.objects as so
import tensorflow as tf


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tensorflow.keras.preprocessing.sequence import pad_sequences




from src.utilities.pandas_helpers import get_features


# Matplotlib rendering settings for large line plots: path simplification
# threshold and Agg backend chunk size.
plt.rcParams.update({
    'path.simplify_threshold': 1.0,
    'agg.path.chunksize': 2000,
})


### Objectives to restructure data:
* one hot encode mice (done)
* encode cyclic time : time_sin and time_cos
* Dim reduction

In [53]:
# Location of the full dataset (gzip-compressed parquet).
# Set the FULL_DATASET_PATH environment variable to point somewhere else without
# editing this cell, e.g. on the cluster:
#   /projects/p31961/gaby_data/aggregated_data/data_pipeline_full_dataset/datasets/full_dataset.parquet.gzip
# When the variable is unset, the local copy below is used.
DATA_PATH = os.environ.get(
    "FULL_DATASET_PATH",
    "/Users/michaelschaid/interm_data_transfer/full_dataset.parquet.gzip",  # local
)

In [54]:
# Read the whole parquet file at DATA_PATH into a DataFrame.
data = pd.read_parquet(path=DATA_PATH)
# Optional, for faster iteration: keep every 1000th row.
# data = data[::1000].reset_index(drop=True)

In [None]:
### Encoding cyclic time
def cyclic_time(df):
    """Add sine/cosine encodings of the ``time`` column.

    ``time`` is divided by its maximum over the whole frame and scaled by
    2*pi to give an angle (``time_max_norm``); ``time_cos`` and ``time_sin``
    are the cosine and sine of that angle.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time`` and ``trial_count`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three added columns, sorted by ``time`` then
        ``trial_count`` and given a fresh 0..n-1 index. ``df`` is not modified.
    """
    max_time = df["time"].max()
    # Angle of each sample; the maximum time maps to 2*pi.
    angle = (df["time"] / max_time) * (2 * np.pi)
    encoded = df.assign(
        time_max_norm=angle,
        time_cos=np.cos(angle),
        time_sin=np.sin(angle),
    )
    ordered = encoded.sort_values(by=['time', 'trial_count'])
    return ordered.reset_index(drop=True)
# Add the cyclic time features to the full dataset.
update_df = cyclic_time(data)

# Rows with mouse_id_7 == 1 and sensor_DA == 1, ordered by time then trial_count.
sig = (
    update_df
    .query("mouse_id_7==1 & sensor_DA == 1")
    .sort_values(by=['time', 'trial_count'])
)
sig.shape

In [None]:

# Pairwise correlation between the columns of `data`.
# The matrix is plotted directly: wrapping it in np.corrcoef would show the
# correlation *of the correlation matrix rows*, not the feature correlations.
corr = data.corr()

# Draw on an explicit Axes so the requested figure size is actually used
# (plt.matshow opens a new figure and would leave a 40x40 figure empty).
fig, ax = plt.subplots(figsize=(40, 40))
im = ax.matshow(corr.to_numpy(), cmap='plasma')

# Label ticks from the correlation matrix itself so the labels always line up
# with the plotted rows/columns, even if it has fewer columns than `data`.
ticks = range(len(corr.columns))
ax.set_xticks(ticks)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(ticks)
ax.set_yticklabels(corr.columns)
fig.colorbar(im, ax=ax)
plt.show()

# Concept for data structure 
<span style="color:red">**batch size**</span> since we are training for trial_count, we consider each subject (mouse) a batch. 

<span style="color:red">**Sequence length**</span> This is the number of time steps; in this case we treat 1 time step as 1 unit of sequence length. Since the trials are different lengths, we will need to pad the shorter sequences. We will pad every sequence to a length of 1000 with zeros, so our model can learn to ignore them. *We might be able to get away with a lambda layer.*

<span style="color:red">**Number of features**</span> everything else, including time (seconds)


# Preprocessing for LSTM input 


## Segmentation 
*  Split data into separate dataframes for each subject
*  Also need to restructure so the signal for each event is separated, and we will predict all 4 signals from the given data. Might have issues with missing data, but hopefully padding corrects that

## Drop Unnecessary columns

* Since we are segmenting by subject, we can drop mouse_id columns
* we need to track this data externally, we can do so in pandas and save it
  
## Pad sequences 
* to control for varying length 
``` {python}
from tensorflow.keras.preprocessing.sequence import pad_sequences
```

## Reshape data for LSTM
``` LSTM(num_seq, sequence_length, num_features)```

* num_seq: number of subjects

* sequence_length: number of trials

* num_features: features minus trial_count and subject identifiers

In [None]:
# Separate events into separate signal columns: cue, shock, escape, avoid
def seperate_events(df):
    """Spread ``signal`` into one column per event type.

    For each of cue, shock, escape and avoid, a column of that name is added
    holding ``signal`` on rows where the matching ``event_<name>`` flag equals 1
    and NaN on all other rows (the filtered values are aligned back on the index).
    The result is then restricted to rows with ``sensor_DA == 1``, the original
    signal / event / sensor / ``learning_phase`` / ``latency`` columns are
    dropped, and rows without a ``cue`` value are removed.

    NOTE(review): because rows lacking a ``cue`` value are dropped at the end,
    ``shock``, ``escape`` and ``avoid`` can only be non-NaN on rows that also
    have ``event_cue == 1`` - confirm this is the intended output.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``signal`` plus every column named in the drop list below.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` is not modified.
    """
    event_names = ("cue", "shock", "escape", "avoid")

    def _signal_for(event):
        # Callable for DataFrame.assign: signal values of rows flagged for `event`.
        return lambda frame: frame.query(f"event_{event}==1").signal

    per_event_signal = {event: _signal_for(event) for event in event_names}
    unused_columns = [
        'signal',
        'event_cue', 'event_shock', 'event_escape', 'event_avoid',
        'sensor_DA', 'sensor_D1', 'sensor_D2',
        'learning_phase', 'latency',
    ]

    with_event_signals = df.assign(**per_event_signal)
    da_only = with_event_signals.query("sensor_DA == 1")
    trimmed = da_only.drop(columns=unused_columns)
    return trimmed.dropna(subset=['cue'])
# Build the per-event signal table from the full dataset.
seperated_signal = seperate_events(data)

# Preview the first 50 rows in time / trial_count order.
(
    seperated_signal
    .sort_values(by=['time', 'trial_count'])
    .head(50)
)

In [None]:
# Number of distinct non-null values in each column.
seperated_signal.nunique(axis=0, dropna=True)

In [None]:
def get_subject_columns(df, prefix):
    """Return the column labels of ``df`` that start with ``prefix``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are searched.
    prefix : str
        Prefix to match, e.g. ``"mouse_id"``.

    Returns
    -------
    list of str
        Matching column names, in their original column order.
    """
    # Column labels are not guaranteed to be strings (ints, tuples, ...);
    # skip those instead of failing on a missing `.startswith`.
    return [
        col
        for col in df.columns
        if isinstance(col, str) and col.startswith(prefix)
    ]


In [None]:
# Rows with mouse_id_7 == 1 and trial == 1, ordered by trial_count then time.
# NOTE(review): this filters on a `trial` column while other cells use
# `trial_count` - confirm `trial` exists in the dataset.
(
    data
    .sort_values(by=['trial_count', 'time'])
    .reset_index(drop=True)
    .query("mouse_id_7==1 & trial ==1")
)

In [None]:
# Preview the first 50 rows of the raw dataset.
data.head(n=50)