# Set up
This section requires your input. Set `data_path` to the folder that contains the OpenBARR data. This folder needs to be formatted beforehand - take a look at the **README** or see `/sample_data/original_data` for an example. Set `output_path` to the folder where the graphs, and the data used to make them, should be saved.

Here, you will also specify the experimental conditions in `conditions` and the days in `days`. These lists tell the program which sub-folders to look in so that it can batch process all of the files.


In [1]:
# connect to google drive so the data folders below are reachable from Colab
from google.colab import drive
drive.mount('/content/drive')

# import packages we need for analysis and graphing
import os
import numpy as np
import pandas as pd

# specify where the data is - this should be changed
# data_path is the root folder of the raw OpenBARR files; the loading code
# looks for files under <data_path>/<condition>/<day>/
data_path = '/content/drive/MyDrive/0 Revamp/sample_data/original_data'
# output_path is the folder for the graphs and the data used to make them
output_path = '/content/drive/MyDrive/0 Revamp/sample_data/output'

# specify experimental conditions - this should be changed
# each entry of `conditions` and `days` is used as a folder name under
# data_path, so the spelling must match the folder names exactly
conditions = ['3EtOH', '25EtOH', '50EtOH', '75EtOH']
days = ['D1', 'D2', 'D3']

Mounted at /content/drive


# Preprocess the data
We read in and format the OpenBARR data to be more understandable and remove the first second of data in case of initial mistracking.

In [None]:
def read_raw(file_path):
  """Load one raw OpenBARR recording into a DataFrame.

  The file is parsed as tab-separated text without a header row, and its
  columns are labeled, in order, ``time``, ``x``, ``y``, ``in``, ``entry``
  and ``exit``.

  Parameters
  ----------
  file_path : str
    Location of the OpenBARR file to load.

  Returns
  -------
  pandas.DataFrame
    One row per line of the file, using the column labels listed above.
  """
  column_names = ['time', 'x', 'y', 'in', 'entry', 'exit']
  raw = pd.read_csv(file_path, sep='\t', header=None, names=column_names)
  return raw

def adjust_y(df, default_border=235):
  """Adjust the y-coordinates of an OpenBARR dataframe in place.

  Re-expresses the y-coordinates relative to the border of ROSA and RONSA,
  so that y >= 0 indicates how far the animal is in ROSA while y < 0
  indicates how far the animal is in RONSA. This adjustment makes the data
  more intuitive and easier to analyze.

  The border is taken as the largest raw y-coordinate recorded on a frame
  where ``entry == 1``. When no such frame exists, ``default_border`` is used
  instead.

  Parameters
  ----------
  df : pandas.DataFrame
    A DataFrame containing the data from the OpenBARR file. Its ``y`` column
    is overwritten.
  default_border : int or float, optional
    Raw y-coordinate to treat as the border when the fly never entered ROSA.
    Defaults to 235, the upper-bound estimate for the original setup; change
    it if your camera geometry differs.

  Returns
  -------
  None
    ``df`` is modified in place.

  Notes
  -----
  This works under the assumption, in short, that the OpenBARR is set up in the
  exact same way. Common problems could be flipped ROSA/RONSA causing RONSA to
  be >= 0 and ROSA < 0, and camera height differences could change the
  arbitrary coordinates, affecting the adjustment for a fly never entering
  ROSA (see ``default_border``).
  """
  # raw y-coords on the frames where the fly enters ROSA
  border = df.loc[df['entry'] == 1, 'y']
  if border.empty:  # the fly never entered ROSA
    reference = default_border  # fall back to the estimate - see notes
  else:
    reference = border.max()
  df.loc[:, 'y'] = reference - df['y']  # the adjustment

def preprocess(file_path):
  """Preprocess an OpenBARR file.

  Reads an OpenBARR file, removes the first second of data (rows with
  ``time < 1``), and adjusts the y-coordinates relative to the border of ROSA
  and RONSA.

  Parameters
  ----------
  file_path : str
    The path to the OpenBARR file.

  Returns
  -------
  pandas.DataFrame
    A DataFrame containing the preprocessed data from the OpenBARR file,
    re-indexed from 0.
  """
  df = read_raw(file_path)
  # Take an explicit copy of the filtered rows: adjust_y writes to the frame
  # in place, and writing to a boolean-mask selection of another frame is what
  # triggers pandas' SettingWithCopyWarning.
  df = df[df['time'] >= 1].copy()
  adjust_y(df)
  return df.reset_index(drop=True)

def get_bouts(df):
  """Retrieve bout data for an OpenBARR dataframe.

  Extracts bout data to make some analyses easier. A bout is defined as a
  continuous portion of time in ROSA or RONSA. Bout boundaries are placed at
  the first frame, at every frame where ``entry == 1`` or ``exit == 1``, and
  after the last frame. Each bout covers the half-open row range
  ``[start, end)``, so the boundary frame belongs to the bout it starts.
  Bouts spanning a single frame are dropped.

  Parameters
  ----------
  df : pandas.DataFrame
    A DataFrame containing the preprocessed data from the OpenBARR file.

  Returns
  -------
  pandas.DataFrame
    A DataFrame containing bout data, one row per kept bout, with columns:
    ``bout`` (position of the bout among all boundary intervals, so numbers
    of dropped bouts are skipped), ``in`` (value of ``in`` on the bout's
    first frame), ``start`` / ``end`` (``time`` of the first / last frame),
    ``max depth`` (largest ``|y|`` within the bout) and ``distance`` (summed
    frame-to-frame Euclidean distance in the x-y plane within the bout).

  """
  # determine where bouts start/end and sort for easy iteration "bucketing"
  indices = np.sort(np.concatenate([0, np.where(df['entry'] == 1)[0], np.where(df['exit'] == 1)[0], df.shape[0]], axis=None))

  # computed once; sliced by position for every bout below
  abs_y = df['y'].abs()
  xy = df[['x', 'y']].to_numpy()

  bouts = pd.DataFrame(columns=['bout', 'in', 'start', 'end', 'max depth', 'distance'])
  for i in range(indices.size-1):
    start, end = indices[i], indices[i+1]
    if end - start <= 1:  # make sure this bout is greater than 1 frame
      continue

    # get max depth for bout; positional slicing keeps row `end` (the first
    # frame of the NEXT bout) out, unlike label-based .loc[start:end]
    depth = abs_y.iloc[start:end].max()

    # compute total distance traveled in a bout: sum of step lengths between
    # consecutive frames inside the bout
    steps = xy[start+1:end] - xy[start:end-1]
    dist = np.sum(np.linalg.norm(steps, axis=1))

    first, last = df.iloc[start], df.iloc[end-1]
    bouts.loc[bouts.shape[0]] = [i, first['in'], first['time'], last['time'], depth, dist]
  return bouts

# read in all the data and put them into dataframes for analysis
# Per-file frames are collected in lists and concatenated once at the end;
# concatenating inside the loop re-copies everything read so far on each file.
combined_frames, bout_frames = [], []
for condition in conditions:
  for day in days:
    temp_path = os.path.join(data_path, condition, day)
    print('Processing files in', temp_path)

    # list the folder once, in a reproducible order, and skip anything that is
    # not a regular visible file (e.g. hidden checkpoint folders or .DS_Store)
    file_names = sorted(
        name for name in os.listdir(temp_path)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(temp_path, name)))

    for a_file in file_names:
      # take preprocessed file, label, and queue for the big preprocessed df
      temp_df = preprocess(os.path.join(temp_path, a_file))
      temp_df = temp_df.assign(condition=condition, day=day, id=a_file)
      combined_frames.append(temp_df)

      # take preprocessed file, get bouts, label, and queue for the bouts df
      temp_bouts = get_bouts(temp_df)
      temp_bouts = temp_bouts.assign(condition=condition, day=day, id=a_file)
      bout_frames.append(temp_bouts)

    print(len(file_names), 'files processed\n')

# both stay None when no file was found, as before
combined_df = pd.concat(combined_frames, ignore_index=True) if combined_frames else None
bouts_df = pd.concat(bout_frames, ignore_index=True) if bout_frames else None

# Graphing Time