In [76]:
%load_ext autoreload 
%autoreload 2 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
import random
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec # for subplots
import pandas as pd
import numpy as np
import seaborn as sns
import seaborn.objects as so
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences

from src.data_processing.processors.SequenceProcessor import SequenceProcessor
from src.utilities.pandas_helpers import get_features

# Matplotlib path-rendering settings, applied globally for every figure in this notebook.
# NOTE(review): presumably chosen to speed up drawing of very long signal traces — confirm
# against the Matplotlib performance guide before changing the values.
plt.rcParams['path.simplify_threshold'] = 1.0
plt.rcParams['agg.path.chunksize'] = 2000 


### Objectives for restructuring the data:
* one hot encode mice (done)
* encode cyclic time : time_sin and time_cos

In [4]:
# Location of the aggregated dataset (gzip-compressed parquet).
# Defaults to the cluster copy. To use another copy (for example a local transfer such as
# "/Users/michaelschaid/interm_data_transfer/full_dataset.parquet.gzip"), set the
# DA_DATASET_PATH environment variable instead of editing this cell.
DATA_PATH = os.environ.get(
    "DA_DATASET_PATH",
    '/projects/p31961/gaby_data/aggregated_data/data_pipeline_full_dataset/datasets/full_dataset.parquet.gzip',
)

In [7]:
# Load the full dataset from the parquet file at DATA_PATH.
data = pd.read_parquet(DATA_PATH)
# data = data[::1000].reset_index(drop=True) # subsample for speed
# Names of every column whose label contains "sensor_".
sensor_cols = [col for col in data.columns if "sensor_" in col]
# Keep only rows flagged sensor_DA == 1, then drop all sensor_* columns and renumber rows from 0.
da_data = data.query("sensor_DA==1").drop(sensor_cols, axis=1).reset_index(drop=True)
da_data

Unnamed: 0,time,sex_M,day,trial,trial_count,learning_phase,event_cue,event_escape,event_avoid,latency,...,mouse_id_4,mouse_id_5,mouse_id_6,mouse_id_7,mouse_id_8,mouse_id_9,mouse_id_10,mouse_id_11,mouse_id_12,mouse_id_13
0,-25.000000,1,5,0,110,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,-24.901531,1,5,0,110,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,-24.803064,1,5,0,110,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,-24.704596,1,5,0,110,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,-24.606127,1,5,0,110,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935368,19.606127,1,4,11,98,1,0,1,0,5.5,...,0,1,0,0,0,0,0,0,0,0
2935369,19.704596,1,4,11,98,1,0,1,0,5.5,...,0,1,0,0,0,0,0,0,0,0
2935370,19.803064,1,4,11,98,1,0,1,0,5.5,...,0,1,0,0,0,0,0,0,0,0
2935371,19.901531,1,4,11,98,1,0,1,0,5.5,...,0,1,0,0,0,0,0,0,0,0


# Concept for data structure 
<span style="color:red">**batch size**</span> since we are training for trial_count, we consider each subject (mouse) a batch. 

<span style="color:red">**Sequence length**</span> This is the number of time steps; in this case we treat one time step as one element of the sequence. Since the trials have different lengths, we will need to pad the shorter sequences. We will pad with zeros up to a length of 1000, so our model will learn to ignore the padding. *We might be able to get away with a lambda layer.* 

<span style="color:red">**Number of features**</span> everything else, including time (seconds)


In [62]:
### Encoding cyclic time
def cyclic_time(df):
    """Add cyclic (sine/cosine) encodings of the ``time`` column.

    The time values are divided by the column maximum and multiplied by 2*pi
    to form an angle, stored as ``time_max_norm``. Its cosine and sine are
    stored as ``time_cos`` and ``time_sin``. Scaling uses the maximum only,
    so negative times map to negative angles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``time`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three new columns appended and a fresh
        0..n-1 index.
    """
    peak_time = df.time.max()
    angle = (df.time / peak_time) * (2 * np.pi)
    encoded = df.assign(
        time_max_norm=angle,
        time_cos=np.cos(angle),
        time_sin=np.sin(angle),
    )
    return encoded.reset_index(drop=True)
    

# update_df = cyclic_time(da_data)
# update_df
def batch_by_mouse(df, prefix="mouse_id_"):
    """Split a one-hot-encoded frame into one frame per subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing one indicator column per subject. The subject
        columns are those whose label contains ``prefix``.
    prefix : str, default "mouse_id_"
        Substring that identifies the subject indicator columns.

    Returns
    -------
    tuple[list, list[pandas.DataFrame]]
        The subject column labels, and a parallel list of frames. Each frame
        holds the rows where that subject's indicator equals 1, with all
        subject columns dropped and the index reset to 0..n-1.
    """
    mouse_ids = [col for col in df.columns if prefix in col]

    # A boolean mask is used instead of df.query(f"{col}==1") so that column
    # labels that are not valid Python identifiers (spaces, dashes, ...) work.
    batches = [
        df[df[mouse] == 1].drop(columns=mouse_ids).reset_index(drop=True)
        for mouse in mouse_ids
    ]
    return mouse_ids, batches
# Split the DA-only frame into one frame per mouse. `mouse_id` holds the mouse_id_* column labels
# and is later read as a global by split_training_val_test_batches.
mouse_id, batches = batch_by_mouse(da_data)

def split_training_val_test_batches(batches, target, train_ratio=0.7, val=0.15, subject_ids=None):
    """Split per-subject frames into training, validation and test groups.

    Subjects are assigned in list order (no shuffling): the first
    ``int(n * train_ratio)`` go to training, the next ``int(n * val)`` to
    validation, and the remainder to testing.

    Parameters
    ----------
    batches : list[pandas.DataFrame]
        One frame per subject.
    target : str
        Name of the target column. It is removed from the feature frames and
        returned separately.
    train_ratio : float, default 0.7
        Fraction of subjects used for training.
    val : float, default 0.15
        Fraction of subjects used for validation.
    subject_ids : sequence, optional
        Subject labels parallel to ``batches``. When omitted, the
        notebook-level ``mouse_id`` variable is used, which keeps the
        original behavior.

    Returns
    -------
    tuple
        ``(train_X, train_y, train_ids, val_X, val_y, val_ids,
        test_X, test_y, test_ids)``. The X entries are lists of feature
        frames, the y entries are lists of target Series, and the id entries
        are the matching slices of the subject labels.

    Raises
    ------
    ValueError
        If ``subject_ids`` is given and its length differs from ``batches``.
    """
    if subject_ids is None:
        # Backward-compatible fallback to the notebook global.
        subject_ids = mouse_id
    elif len(subject_ids) != len(batches):
        raise ValueError(
            f"subject_ids has {len(subject_ids)} entries but batches has {len(batches)}"
        )

    # Slice boundaries; int() truncates, so any remainder goes to the test group.
    samples = len(batches)
    num_train = int(samples * train_ratio)
    num_val = int(samples * val)
    print(num_train, num_val)

    training_idx = num_train
    val_idx = num_train + num_val
    print(training_idx, val_idx)

    # Separate features and target once per subject.
    features = [df.drop(columns=[target]) for df in batches]
    targets = [df[target] for df in batches]

    # training
    training_batches_X = features[:training_idx]
    training_batches_y = targets[:training_idx]
    training_mice = subject_ids[:training_idx]

    # validation
    val_batches_X = features[training_idx:val_idx]
    val_batches_y = targets[training_idx:val_idx]
    val_mice = subject_ids[training_idx:val_idx]

    # testing
    test_batches_X = features[val_idx:]
    test_batches_y = targets[val_idx:]
    test_mice = subject_ids[val_idx:]

    return (training_batches_X, training_batches_y, training_mice,
            val_batches_X, val_batches_y, val_mice,
            test_batches_X, test_batches_y, test_mice)
    
# Split the per-mouse frames into train / validation / test groups, predicting the 'signal' column.
(
    training_batches_X, training_batches_y, training_mice,
    val_batches_X, val_batches_y, val_mice,
    test_batches_X, test_batches_y, test_mice,
) = split_training_val_test_batches(batches, target='signal')

9 2
9 11


In [78]:
# Wrap the DA-only frame in the project's SequenceProcessor (imported from src.data_processing).
dopamine_seq_processor = SequenceProcessor(data = da_data)

In [79]:
# NOTE(review): the recorded run of this cell raised
# AttributeError: 'SequenceProcessor' object has no attribute 'batch_by_subject'.
# Confirm the method exists in the current SequenceProcessor before relying on this cell.
dopamine_seq_processor.batch_by_subject(subject_prefix = 'mouse_id_')

AttributeError: 'SequenceProcessor' object has no attribute 'batch_by_subject'

In [33]:
# Half of the number of subject columns, rounded to an integer.
round((len(mouse_id)*.5))

7

[              time  sex_M  day  trial  trial_count  learning_phase  event_cue  \
 1374    -25.000000      0    5      0          116               0          0   
 1375    -24.901531      0    5      0          116               0          0   
 1376    -24.803064      0    5      0          116               0          0   
 1377    -24.704596      0    5      0          116               0          0   
 1378    -24.606127      0    5      0          116               0          0   
 ...            ...    ...  ...    ...          ...             ...        ...   
 2856134  19.606127      0    4     11           98               1          0   
 2856135  19.704596      0    4     11           98               1          0   
 2856136  19.803064      0    4     11           98               1          0   
 2856137  19.901531      0    4     11           98               1          0   
 2856138  20.000000      0    4     11           98               1          0   
 
          even

In [None]:

# Correlation heat map over the columns of `data`.
corr = data.corr()
# NOTE(review): np.corrcoef is applied on top of data.corr(), i.e. this plots the correlation
# between rows of the correlation matrix. Kept as in the original — confirm whether plotting
# `corr` directly was intended.
corr_matrix = np.corrcoef(corr)

fig = plt.figure(figsize=(40, 40))
# plt.matshow opens a brand-new figure unless `fignum` is given, which left the 40x40 figure
# empty; passing our figure number draws into it so the requested size is used.
plt.matshow(corr_matrix, cmap='plasma', fignum=fig.number)
# Label ticks from the correlation matrix's own columns so labels always match the plotted
# rows and columns (corr() may not include every column of `data`).
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.show()

# Preprocessing for LSTM input 


## Segmentation 
*  Split data into separate dataframes for each subject
*  Also need to restructure so the signal for each event is separated, and we will predict all 4 signals from the given data. Might have issues with missing data, but hopefully padding corrects that

## Drop unnecessary columns

* Since we are segmenting by subject, we can drop mouse_id columns
* we need to track this data externally, we can do so in pandas and save it
  
## Pad sequences 
* to control for varying length 
``` {python}
from tensorflow.keras.preprocessing.sequence import pad_sequences
```

## Reshape data for LSTM
``` LSTM(num_seq, sequence_length, num_features)```

* num_seq: number of subjects

* sequence_length: number of trials

* num_features: features minus trial_count and subject identifiers

In [None]:
# separate events into separate signal columns: cue, shock, escape, avoid
def seperate_events(df):
    """Spread ``signal`` into one column per event type for DA-sensor rows.

    For each of cue, shock, escape and avoid, a new column holds ``signal``
    on the rows where the matching ``event_*`` flag equals 1 and NaN
    elsewhere. The frame is then restricted to rows with ``sensor_DA == 1``,
    the original signal, event, sensor, ``learning_phase`` and ``latency``
    columns are dropped, and rows with no ``cue`` value are removed.
    Only rows whose ``event_cue`` flag is 1 therefore remain in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``signal``, the four ``event_*`` flags, the three
        ``sensor_*`` flags, ``learning_phase`` and ``latency``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``cue``, ``shock``, ``escape`` and ``avoid`` columns.
        The original row index is kept.
    """
    event_filters = {
        "cue": "event_cue==1",
        "shock": "event_shock==1",
        "escape": "event_escape==1",
        "avoid": "event_avoid==1",
    }
    # Each Series covers only the matching rows; assign() aligns on the index
    # and leaves NaN on every other row.
    per_event_signal = {
        name: df.query(expr).signal for name, expr in event_filters.items()
    }
    widened = df.assign(**per_event_signal)

    da_rows = widened.query("sensor_DA == 1")
    no_longer_needed = ['signal', 'event_cue', 'event_shock', 'event_escape', 'event_avoid',
                        'sensor_DA', 'sensor_D1', 'sensor_D2', 'learning_phase', 'latency']
    trimmed = da_rows.drop(columns=no_longer_needed)
    return trimmed.dropna(subset=['cue'])
# Build the per-event signal table from the full dataset, then preview the first 50 rows
# ordered by time and trial_count.
seperated_signal = seperate_events(data)
seperated_signal.sort_values(by=['time', 'trial_count']).head(50)

In [None]:
# Number of distinct values in each column of the per-event signal table.
seperated_signal.nunique()

In [None]:
def get_subject_columns(df, prefix):
    """Return, in their original order, the column labels of ``df`` that start with ``prefix``."""
    matching = []
    for label in df.columns:
        if label.startswith(prefix):
            matching.append(label)
    return matching


In [None]:
# Rows for mouse_id_7, trial 1, taken from the full dataset ordered by trial_count then time.
data.sort_values(by = ['trial_count', 'time']).reset_index(drop=True).query("mouse_id_7==1 & trial ==1")

In [None]:
# Preview the first 50 rows of the full dataset.
data.head(50)