In [2]:
import numpy as np
import pandas as pd

# Add background and intro to the dataset (Drosophila sleep) and why the HMM is a useful tool here

# Loading and curating the dataset

## Data can come in all formats, but a csv file is one of the most common due to its simplicity and integration with spreadsheets. The data we will be using for this tutorial is real, raw data from the Gilestro lab, where we track and record the movement of fruit flies using machine vision. The tracking can discern small movements and robustly records each fly several times per second, giving a multitude of variables to work with.

In [5]:
# df = pd.read_csv('/home/lab/Desktop/ReCoDE-HMMs-for-the-discovery-of-behavioural-states/data/training_data_small.zip', index_col = 'id')

In [3]:
# Load the training data archive; the 'id' column becomes the dataframe index.
# NOTE(review): this is an absolute, machine-specific path - point it at your own copy of the repository's data folder.
df = pd.read_csv(
    '/home/lab/Desktop/ReCoDE-HMMs-for-the-discovery-of-behavioural-states/data/training_data_30.zip',
    index_col='id',
)

#### Let's have a look at how the data is structured

In [4]:
# A bare expression on the last line of a cell is displayed as the cell's output
df

Unnamed: 0_level_0,t,x,y,w,h,max_velocity,mean_velocity,moving,micro,walk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-04-04_17-39-22_033aee|01,31140,0.269116,0.069594,0.038829,0.020012,75.662162,25.713480,True,False,True
2016-04-04_17-39-22_033aee|01,31170,0.606590,0.068019,0.048224,0.020609,27.471271,9.145901,True,False,True
2016-04-04_17-39-22_033aee|01,31200,0.398307,0.070464,0.049073,0.020628,19.718721,5.478951,True,False,True
2016-04-04_17-39-22_033aee|01,31230,0.469571,0.066383,0.046558,0.020423,20.224544,7.475374,True,False,True
2016-04-04_17-39-22_033aee|01,31260,0.260085,0.073667,0.047548,0.020133,34.824007,6.163203,True,False,True
...,...,...,...,...,...,...,...,...,...,...
2016-09-27_10-56-35_053c6b|19,606420,0.537450,0.047642,0.052998,0.023141,7.428117,2.037493,True,False,True
2016-09-27_10-56-35_053c6b|19,606450,0.211436,0.063828,0.048854,0.024929,21.177698,4.470726,True,False,True
2016-09-27_10-56-35_053c6b|19,606480,0.131377,0.065893,0.041694,0.025711,10.986990,3.057987,True,False,True
2016-09-27_10-56-35_053c6b|19,606510,0.512140,0.064421,0.054938,0.021951,29.166126,6.249765,True,False,True


#### In the index column we have the ID, which is unique per fly and will allow us to filter and apply methods to just one fly at a time. The next most important variable is 't' or time; as we are working with time series data we must ensure this is structured properly, i.e. in sequential order and at regular intervals (the latter we will go over later). The rest are various variables recorded at each timestamp; for this tutorial we'll only be interested in 'moving', 'micro', and 'walk'.

#### Most real datasets will not be perfectly populated, with tracking dropping out over the course of an experiment. In a dataframe or an array where data is missing at a timepoint or index, this is represented by a NaN value, which lets methods and functions know there is no data rather than a zero value. However, analysis packages will often throw an error if you feed them NaN values, so it's good practice to check for them first and either remove them or replace them with an approximation.

In [7]:
# Let's filter our dataframe for NaN values
# With pandas you can filter the dataframe by its columns
# To filter or slice the dataframe, put square brackets after the dataframe and place the column condition inside them
# For finding NaN values we have to call a method; for other regular filtering you just use ==, <, > and so on

df[df['moving'].isnull()]

Unnamed: 0_level_0,t,x,y,w,h,phi,max_velocity,mean_velocity,distance,moving,micro,walk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-04-04_17-39-22_033aee|01,43260,,,,,,,,,False,,
2016-04-04_17-39-22_033aee|01,43320,,,,,,,,,False,,
2016-04-04_17-39-22_033aee|01,43380,,,,,,,,,False,,
2016-04-04_17-39-22_033aee|01,43440,,,,,,,,,False,,
2016-04-04_17-39-22_033aee|01,52860,0.25742,0.079236,0.051246,0.018465,154.47191,1.179991,0.650638,0.17372,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
2016-04-04_17-39-05_009aee|20,132900,,,,,,,,,False,,
2016-04-04_17-39-05_009aee|20,132960,,,,,,,,,False,,
2016-04-04_17-39-05_009aee|20,133020,,,,,,,,,False,,
2016-04-04_17-39-05_009aee|20,393480,,,,,,,,,False,,


In [8]:
# To break down what's happening we can call just what's inside the brackets; you can see that it is an array (or Series in pandas terms) with False or True per row.
# This array then dictates which rows get returned from the whole dataframe, i.e. only the ones that fulfil the condition and are True

df['moving'].isnull()

id
2016-04-04_17-39-22_033aee|01    False
2016-04-04_17-39-22_033aee|01    False
2016-04-04_17-39-22_033aee|01    False
2016-04-04_17-39-22_033aee|01    False
2016-04-04_17-39-22_033aee|01    False
                                 ...  
2016-04-04_17-39-05_009aee|20    False
2016-04-04_17-39-05_009aee|20    False
2016-04-04_17-39-05_009aee|20    False
2016-04-04_17-39-05_009aee|20    False
2016-04-04_17-39-05_009aee|20    False
Name: moving, Length: 327031, dtype: bool

In [9]:
# However, we are not just looking at one column.
# Luckily with pandas you can filter by multiple columns: put each filter condition in round brackets and then separate them with an & ("and") or | ("or") logical operator

df[(df['moving'].isnull()) | (df['micro'].isnull()) | (df['walk'].isnull())]

Unnamed: 0_level_0,t,x,y,w,h,phi,max_velocity,mean_velocity,distance,moving,micro,walk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-04-04_17-39-22_033aee|01,43260,,,,,,,,,False,,
2016-04-04_17-39-22_033aee|01,43320,,,,,,,,,False,,
2016-04-04_17-39-22_033aee|01,43380,,,,,,,,,False,,
2016-04-04_17-39-22_033aee|01,43440,,,,,,,,,False,,
2016-04-04_17-39-22_033aee|01,52860,0.25742,0.079236,0.051246,0.018465,154.47191,1.179991,0.650638,0.17372,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
2016-04-04_17-39-05_009aee|20,132900,,,,,,,,,False,,
2016-04-04_17-39-05_009aee|20,132960,,,,,,,,,False,,
2016-04-04_17-39-05_009aee|20,133020,,,,,,,,,False,,
2016-04-04_17-39-05_009aee|20,393480,,,,,,,,,False,,


### Extra Task: If you're new to pandas (or just want some practice) have a play around with the filtering (such as df[df['mean_velocity'] > 5]). It is a quick and easy way to filter your data, and if you're doing the same thing repeatedly you can create a function to do it instantly.

### There are multiple tutorials online that will go further on this topic, **add link**

# **TO DO**

##  Binning data to a higher number 
### Outcome - Why you would want to do it, the benefits and downsides

In [6]:
# bin to a higher time interval    

def bin_data(data, column, bin_column, function, bin_secs):
    """Aggregate one column of a single fly's data into coarser time bins.

    Every value in ``bin_column`` is floored to the start of the
    ``bin_secs``-wide bin it falls in, the rows are grouped by that bin and
    ``column`` is reduced with ``function``. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows belonging to one fly. The fly id is read from an 'id' column if
        there is one, otherwise from the first index label (the layout that
        ``pd.read_csv(..., index_col='id')`` produces).
    column : str
        Name of the column to aggregate.
    bin_column : str
        Name of the column to bin on, e.g. 't'.
    function : str
        Aggregation name understood by ``DataFrame.agg``, e.g. 'mean' or 'max'.
    bin_secs : int or float
        Bin width, in the same units as ``bin_column``.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'id' (the fly id repeated on every row) with the columns
        ``f'{bin_column}_bin'`` and ``f'{column}_{function}'``.
    """
    # The id can be a regular column or the index, depending on how the frame was loaded
    if 'id' in data.columns:
        index_name = data['id'].iloc[0]
    else:
        index_name = data.index[0]

    # Vectorised floor to the start of each bin; np.floor also tolerates NaN values
    binned = np.floor(data[bin_column] / bin_secs) * bin_secs
    if pd.api.types.is_integer_dtype(data[bin_column]) and isinstance(bin_secs, (int, np.integer)):
        # Integer timestamps with an integer bin width stay integer
        binned = binned.astype(data[bin_column].dtype)

    output_parse_name = f'{column}_{function}'  # name of the aggregated column
    bin_parse_name = f'{bin_column}_bin'        # name of the bin-start column

    # assign() works on a copy, so the caller's frame keeps its original bin_column values
    bout_gb = (
        data.assign(**{bin_column: binned})
        .groupby(bin_column)
        .agg(**{output_parse_name: (column, function)})
        .rename_axis(bin_parse_name)
        .reset_index()
    )

    # Restore the fly id as the index so the result lines up with the input layout
    bout_gb.index = pd.Index([index_name] * len(bout_gb), name='id')

    return bout_gb

# **TO DO**

##  Checking for gaps in the data
### Outcome - Why the data needs to be continuous with no gaps. Downsides to interpolation

In [7]:
# Check whether any of the time series skip time points (and interpolate where they do)

def check_points():
    """Placeholder for the check that a time series has no skipped time points.

    TODO: implement. Until then the function raises, so the cell still parses
    and an accidental call fails loudly rather than silently doing nothing.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError('check_points has not been implemented yet')

def _wrapped_interpolate(data, var, step, t_col = 't'):

    id = data['id'].iloc[0]
    sample_seq = np.arange(min(data[t_col]), np.nanmax(data[t_col]), step)
    if len(sample_seq) < 3:
        return None
    f  = interp1d(data[t_col].to_numpy(), data[var].to_numpy())
    return  pd.DataFrame(data = {'id' : id, t_col : sample_seq, var : f(sample_seq)})

# **TO DO**

##  Creating a new column that is a categorical summary of the other 3
### Outcome - How to do the above

# **Extra Tasks**

## 1. Split the data by Male and Female into separate dataframes 
## 2. Convert a continuous float column to a discrete categorical column