In [1]:
%load_ext autoreload 
%autoreload 2

In [2]:
import logging
import numpy as np
import os
import pandas as pd


from src.utilities.os_helpers import create_new_directory, create_directories
from src.utilities.pandas_helpers import get_features

from src.data_processing.processors.Preprocessor import Preprocessor


In [134]:
lat_path = "/projects/p31961/gaby_all_raw_data/AA_Latencies.xlsx"
sex_path = "/projects/p31961/gaby_all_raw_data/AA_ListofSex.xlsx"


lat_data = pd.read_excel(lat_path)
sex_data = pd.read_excel(sex_path)
data = pd.read_parquet('/projects/p31961/gaby_data/aggregated_data/downsampled_aggregated_data.parquet.gzp')

mouse_id    category
day            int64
event         object
sensor        object
time         float32
trial          int32
signal       float32
dtype: object

In [136]:
def tweak_sex_data(df):

    return (
        df
        .rename(columns = lambda c: c.replace(' ', '_').lower())
        .assign(mouse_id = lambda df: df['mouse_id'].str.replace("-", "_").astype('category')
        )
    )
sex_df = tweak_sex_data(sex_data)

In [135]:
lat_data

Unnamed: 0,Mouse,Day,Trial,Latency,Event
0,309-910,1,1,5.69,Escape
1,309-910,1,2,6.07,Escape
2,309-910,1,3,9.65,Escape
3,309-910,1,4,12.62,Escape
4,309-910,1,5,6.70,Escape
...,...,...,...,...,...
2935,152-071,7,26,2.78,Avoid
2936,152-071,7,27,4.80,Avoid
2937,152-071,7,28,1.89,Avoid
2938,152-071,7,29,3.64,Avoid


In [128]:
def tweak_lat_data(df):
    return (df.
            rename(columns = lambda c: c.replace(' ', '_').lower())
            .rename(columns = {'mouse_': 'mouse_id'})
            .assign(mouse_id = lambda df_: df_['mouse_id'].str.replace("-", "_").astype('category'), 
                    event = lambda df_: df_['event'].str.lower())
    )
lat_df = tweak_lat_data(lat_data)

Unnamed: 0,mouse_id,day,trial,latency,event
0,309_910,1,1,5.69,escape
1,309_910,1,2,6.07,escape
2,309_910,1,3,9.65,escape
3,309_910,1,4,12.62,escape
4,309_910,1,5,6.70,escape
...,...,...,...,...,...
2935,152_071,7,26,2.78,avoid
2936,152_071,7,27,4.80,avoid
2937,152_071,7,28,1.89,avoid
2938,152_071,7,29,3.64,avoid


In [149]:
all_data = (data
            .dropna()
            .merge(lat_df, on = ['mouse_id', 'day', 'trial', 'event'], how = 'left')
            .assign(latency = lambda df: df['latency'].fillna(0))
            .merge(sex_df, on = 'mouse_id', how = 'left')
)
all_data



Unnamed: 0,mouse_id,day,event,sensor,time,trial,signal,latency,sex
0,312_257,5,cue,D2,-25.000000,0,-0.155359,0.00,F
1,312_257,5,cue,D2,-24.901531,0,-0.420553,0.00,F
2,312_257,5,cue,D2,-24.803064,0,-1.592294,0.00,F
3,312_257,5,cue,D2,-24.704596,0,-1.268734,0.00,F
4,312_257,5,cue,D2,-24.606127,0,-0.210176,0.00,F
...,...,...,...,...,...,...,...,...,...
5858795,142_238,4,escape,D1,19.606127,1,0.299603,7.54,F
5858796,142_238,4,escape,D1,19.704596,1,-0.249408,7.54,F
5858797,142_238,4,escape,D1,19.803064,1,-0.486369,7.54,F
5858798,142_238,4,escape,D1,19.901531,1,-0.146374,7.54,F


In [3]:
DATA_PATH = '/projects/p31961/gaby_data/aggregated_data/data_pipeline_full_dataset/datasets/full_dataset.parquet.gzip'


def split_data(data, features, target, day_cut_off, feature_to_drop=None):

    # drops feature prior to splitting data
    if feature_to_drop is not None:
        data = data.drop(columns=feature_to_drop)

    day_cut_off = day_cut_off

    training_set = data.query('day < @day_cut_off')
    valdidation_set = data.query('day == @day_cut_off')
    testing_set = data.query('day > @day_cut_off')

    X_train, y_train = training_set[features], training_set[target]
    X_val, y_val = valdidation_set[features], valdidation_set[target]
    X_test, y_test = testing_set[features], testing_set[target]

    return X_train, y_train, X_val, y_val, X_test, y_test


data = pd.read_parquet(DATA_PATH)
target = 'signal'
control_features = get_features(data, target)
no_bin_features = control_features.copy()
no_bin_features.remove('learning_phase')

# split data for control
logging.info('Splitting data into training, validation and testing sets')
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    data, control_features, target, day_cut_off=6, feature_to_drop=None)

# split data for dropped binned trials
X_train_dropped, X_val_dropped, X_test_dropped, y_train_dropped, y_val_dropped, y_test_dropped = split_data(
    data, no_bin_features, target, day_cut_off=6, feature_to_drop='learning_phase')

In [4]:
X_train

Unnamed: 0,time,mouse_id,sex_M,day,trial,learning_phase,event_cue,event_escape,event_avoid,latency,event_shock,sensor_D1,sensor_D2,sensor_DA
0,-25.000000,12,0,5,0,0,1,0,0,0.00,0,0,1,0
1,-24.901531,12,0,5,0,0,1,0,0,0.00,0,0,1,0
2,-24.803064,12,0,5,0,0,1,0,0,0.00,0,0,1,0
3,-24.704596,12,0,5,0,0,1,0,0,0.00,0,0,1,0
4,-24.606127,12,0,5,0,0,1,0,0,0.00,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5858795,19.606127,1,0,4,1,0,0,1,0,7.54,0,1,0,0
5858796,19.704596,1,0,4,1,0,0,1,0,7.54,0,1,0,0
5858797,19.803064,1,0,4,1,0,0,1,0,7.54,0,1,0,0
5858798,19.901531,1,0,4,1,0,0,1,0,7.54,0,1,0,0


In [5]:
data

Unnamed: 0,time,mouse_id,sex_M,day,trial,learning_phase,event_cue,event_escape,event_avoid,latency,event_shock,sensor_D1,sensor_D2,sensor_DA,signal
0,-25.000000,12,0,5,0,0,1,0,0,0.00,0,0,1,0,-0.155359
1,-24.901531,12,0,5,0,0,1,0,0,0.00,0,0,1,0,-0.420553
2,-24.803064,12,0,5,0,0,1,0,0,0.00,0,0,1,0,-1.592294
3,-24.704596,12,0,5,0,0,1,0,0,0.00,0,0,1,0,-1.268734
4,-24.606127,12,0,5,0,0,1,0,0,0.00,0,0,1,0,-0.210176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5858795,19.606127,1,0,4,1,0,0,1,0,7.54,0,1,0,0,0.299603
5858796,19.704596,1,0,4,1,0,0,1,0,7.54,0,1,0,0,-0.249408
5858797,19.803064,1,0,4,1,0,0,1,0,7.54,0,1,0,0,-0.486369
5858798,19.901531,1,0,4,1,0,0,1,0,7.54,0,1,0,0,-0.146374


In [7]:
y_train.shape

(790050,)

In [8]:
data[target]

0         -0.155359
1         -0.420553
2         -1.592294
3         -1.268734
4         -0.210176
             ...   
5858795    0.299603
5858796   -0.249408
5858797   -0.486369
5858798   -0.146374
5858799    0.068169
Name: signal, Length: 5858800, dtype: float32