# Set up
Edit this section before running the rest of the notebook. Set `data_path` to the folder that contains the OpenBARR data. This folder needs to be formatted beforehand - take a look at the **README** or see `/sample_data/original_data` for an example. Set `output_path` to the folder where the graphs, and the data used to make them, should be saved.

Here, you will also specify the experimental conditions in `conditions` and days in `days`. This will help the program navigate the files and batch process.


In [11]:
# connect to google drive (Colab only); data_path and output_path below live on the mounted drive
from google.colab import drive
drive.mount('/content/drive')

# import packages we need for analysis and graphing
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# embed fonts as Type 42 (TrueType) in saved PDF/PS figures
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# specify where the data is
# data_path is walked as <data_path>/<condition>/<day>/<file> by the batch-processing cell;
# output_path receives the figures (pdf) and the data behind them (csv)
data_path = '/content/drive/MyDrive/0 Revamp/sample_data/original_data'
output_path = '/content/drive/MyDrive/0 Revamp/sample_data/output'

# specify experimental conditions
# these names are also used as the sub-folder names under data_path
conditions = ['3EtOH', '25EtOH', '50EtOH', '75EtOH']
days = ['D1', 'D2', 'D3']

# specify binning parameters
# bin_size and total_time are in the same units as the 'time' column (seconds);
# bins_array holds the total_time/bin_size + 1 bin edges: 0, 60, ..., 900
bin_size, total_time = 60, 900
bins_array = bin_size*np.arange(total_time/bin_size+1)

# specify some aesthetic stuff
# colors[j][i] is the color for conditions[j] on days[i]
colors = [
    ['#4188C5', '#326EA3', '#26547C'],  # for condition 0, 3EtOH, days 1, 2, and 3
    ['#F47C98', '#F16284', '#EF476F'],  # for condition 1, 25EtOH, days 1, 2, and 3
    ['#FFDE92', '#FFD87C', '#FFD166'],  # for condition 2, 50EtOH, days 1, 2, and 3
    ['#35FAC5', '#0CF9BA', '#06D6A0']  # for condition 3, 75EtOH, days 1, 2, and 3
  ]

# in case you have data preprocessed already
# NOTE: the two read_csv calls below run unconditionally and raise
# FileNotFoundError if the CSVs have not been saved to output_path yet;
# comment them out on a first run and build the dataframes in the
# "Preprocess the data" section instead
# tracking_df = pd.read_csv(os.path.join(output_path, 'tracking_df.csv'))
bouts_df = pd.read_csv(os.path.join(output_path, 'bouts_df.csv'))
bins_df = pd.read_csv(os.path.join(output_path, 'bins_df.csv'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Preprocess the data
We read in and format the OpenBARR data to be more understandable and remove the first second of data in case of initial mistracking.

In [61]:
def read_raw(file_path):
  """Load one raw OpenBARR recording into a DataFrame.

  The file is parsed as tab-separated text with no header row. Its six
  columns are labelled, in order, 'time', 'x', 'y', 'in', 'entry' and 'exit'.

  Parameters
  ----------
  file_path : str
    Location of the OpenBARR file.

  Returns
  -------
  pandas.DataFrame
    The contents of the file, one row per line, with the column names above.
  """
  column_names = ['time', 'x', 'y', 'in', 'entry', 'exit']
  raw = pd.read_csv(file_path, sep='\t', header=None, names=column_names)
  return raw

def adjust_y(df, fallback_border=235):
  """Adjust the y-coordinates of an OpenBARR dataframe in place.

  Re-expresses the 'y' column relative to the border between ROSA and RONSA as
  ``border - y``. After the adjustment y >= 0 indicates how far the animal is
  in ROSA while y < 0 indicates how far the animal is in RONSA. This makes the
  data more intuitive and easier to analyze.

  The border is taken to be the largest raw y-coordinate among the rows where
  'entry' == 1 (the rows in which the fly enters ROSA). If there are no such
  rows, `fallback_border` is used instead.

  Parameters
  ----------
  df : pandas.DataFrame
    A DataFrame containing the data from the OpenBARR file. It must have 'y'
    and 'entry' columns. The 'y' column is overwritten.
  fallback_border : int or float, default = 235
    The raw y-coordinate to use as the border when the fly never entered ROSA.

  Returns
  -------
  None
    The DataFrame is modified in place.

  Notes
  -----
  This works under the assumption, in short, that the OpenBARR is set up in the
  exact same way. Common problems could be flipped ROSA/RONSA causing RONSA to
  be >= 0 and ROSA < 0, and camera height differences could change the
  arbitrary coordinates, in which case `fallback_border` should be changed to
  match the setup.
  """
  # raw y-coords of the rows where the fly enters ROSA
  entry_y = df.loc[df['entry'] == 1, 'y']

  # if the fly was ever in ROSA use the observed border, otherwise the
  # upper-bound estimate - see notes
  border = entry_y.max() if len(entry_y) > 0 else fallback_border

  df.loc[:, 'y'] = border - df['y']  # the adjustment

def preprocess(file_path):
  """Load an OpenBARR file and prepare it for analysis.

  The file is read with `read_raw`, every row whose 'time' is below 1 is
  discarded (the first second of data), and the y-coordinates are re-expressed
  relative to the ROSA/RONSA border with `adjust_y`.

  Parameters
  ----------
  file_path : str
    The path to the OpenBARR file.

  Returns
  -------
  pandas.DataFrame
    The preprocessed data, with a fresh 0..n-1 index.
  """
  frame = read_raw(file_path)

  # keep everything from the one-second mark onwards
  after_first_second = frame['time'] >= 1
  frame = frame[after_first_second]

  # adjust_y rewrites the 'y' column of `frame` in place
  adjust_y(frame)

  frame = frame.reset_index(drop=True)
  return frame

def get_bouts(df, bins=None):
  """Retrieve bout data for an OpenBARR dataframe.

  Extracts bout data to make some analyses easier. A bout is defined as a
  continuous portion of time in ROSA or RONSA. The dataframe is cut into
  segments at every row where 'entry' == 1 or 'exit' == 1 and, when `bins` is
  given, additionally at the first row of every bin. One output row is
  produced per segment; segments that belong to the same bout (same region,
  only separated by a bin edge) share the same 'bout' number.

  Parameters
  ----------
  df : pandas.DataFrame
    A DataFrame containing the preprocessed data from the OpenBARR file. Row
    positions are used as index labels, so the index must be 0..n-1 (as
    returned by `preprocess`).
  bins : np.array, optional
    The start time of each bin used to further split the data. Every value
    must be <= the last 'time' in `df`, otherwise looking up the first row of
    that bin raises an IndexError.

  Returns
  -------
  pandas.DataFrame
    A DataFrame containing bout data with the columns 'bout' (bout number),
    'in' (value of the 'in' column at the start of the segment), 'start',
    'end', 'duration', 'max depth', 'distance', 'speed', 'entry' and 'sample'.

  Notes
  -----
  The segment boundary row is treated differently by different measurements:
  'duration' and 'distance' stop at the row before the next boundary, while
  'max depth' uses a label slice that includes the boundary row itself.
  The last row of `df` only acts as the final boundary.
  """
  # normalizer to convert OpenBARR's arbitrary coordinates to cm
  normalizer = 50

  # determine where bouts start/end and sort for easy iteration
  # (row positions; np.unique removes duplicates and already returns them sorted)
  indices = None
  if bins is None:
    # boundaries: first row, every entry, every exit, last row
    indices = np.sort(np.unique(np.concatenate([0, np.where(df['entry'] == 1)[0], np.where(df['exit'] == 1)[0], df.shape[0]-1], axis=None)))
  else:
    # boundaries: every entry, every exit, the first row with time >= each bin start, last row
    # NOTE(review): the comprehension variable `bin` shadows the builtin of the same name
    indices = np.sort(np.unique(np.concatenate([np.where(df['entry'] == 1)[0],
                                                np.where(df['exit'] == 1)[0],
                                                [np.where((df['time'].reset_index(drop=True) - bin) >= 0)[0][0] for bin in bins],
                                                df.shape[0]-1], axis=None)))

  # region of the first segment, and the running state carried between
  # consecutive segments of the same bout
  bout_indicator, bout_num = df.loc[indices[0], 'in'], 0
  running_max_depth, running_sample_timer = 0, 0
  bouts = pd.DataFrame(columns=['bout', 'in', 'start', 'end', 'duration', 'max depth', 'distance', 'speed', 'entry', 'sample'])
  for i in range(indices.size-1):
    # where this bout starts and ends
    start, end = indices[i], indices[i+1]

    # where next bout starts and ends (left as None for the final segment)
    next_start, next_end, future_bout_indicator = None, None, None
    if i+1 < indices.size-1:
      next_start, next_end = indices[i+1], indices[i+2]
      future_bout_indicator = df.loc[next_start, 'in']

    # get data on whether this bout was in ROSA or RONSA
    in_rosa = df.loc[start, 'in']

    # reset data on past bouts if this is a new bout in a new region
    if bout_indicator != in_rosa:
      bout_num += 1  # track bouts by change in region, not change in bins
      bout_indicator = in_rosa  # update prev. bout indicator for next bout
      running_max_depth, running_sample_timer = 0, 0  # reset previous depth and sample data

    # compute bout duration, measured up to the last row before the next boundary
    start_time, end_time = df.loc[start, 'time'], df.loc[end-1, 'time']
    duration = end_time - start_time

    # get max depth for bout
    # NOTE(review): df.loc[start:end] is a label slice and includes row `end`
    # (the first row of the next segment), unlike the iloc slices used for
    # distance below - confirm this is intended
    depth = np.max([np.max(np.abs(df.loc[start:end, 'y']))/normalizer,  # max depth for this bout
                    running_max_depth,  # max depth for previous bout, non-zero when previous bout was in same region
                    np.max(np.abs(df.loc[next_start:next_end, 'y']))/normalizer if future_bout_indicator == in_rosa else 0])  # max depth for future bout, non-zero if future bout is in same region
    running_max_depth = depth  # update the prev. depth for next bout

    # compute total distance traveled in a bout: sum of the straight-line
    # steps between consecutive rows start..end-1
    dist = np.sum(np.linalg.norm(np.subtract(df.iloc[start+1:end][['x', 'y']].reset_index(drop=True),
                                              df.iloc[start:end-1][['x', 'y']].reset_index(drop=True)),
                                  axis=1))/normalizer

    # compute speed for a bout (0 when the segment has no duration)
    speed = dist/duration if duration > 0 else 0

    # determine if this bout was an entry: any ROSA segment except the very first segment
    # NOTE(review): when `bins` is given, a ROSA bout that continues across a
    # bin edge is flagged as an entry again in the next segment - confirm this is intended
    entry = 1 if in_rosa == 1 and i != 0 else 0

    # determine if this bout was a sample
    future_bout_time = df.loc[next_end-1, 'time'] - df.loc[next_start, 'time'] if future_bout_indicator == in_rosa else 0  # compute time for next bout if in same region
    sample = 1 if duration + running_sample_timer + future_bout_time <= 2 and in_rosa == 1 else 0  # sample if the duration of this bout, summed with the duration of the next/previous segment when those are in the same region, is at most 2 seconds
    running_sample_timer = duration  # update the prev. sample timer for next bout (holds only the immediately preceding segment)

    # create row for this bout
    bouts.loc[bouts.shape[0]] = [bout_num, in_rosa, start_time, end_time, duration, depth, dist, speed, entry, sample]
  return bouts

In [None]:
# read in all the data and put them into dataframes for analysis
# each file's pieces are collected in lists and concatenated once at the end,
# which avoids re-copying the growing dataframes on every file
tracking_parts, bouts_parts, bins_parts = [], [], []
for condition in conditions:
  for day in days:
    temp_path = os.path.join(data_path, condition, day)
    print('Processing files in', temp_path)

    # list the folder once; sorted so the row order is reproducible across runs
    file_names = sorted(os.listdir(temp_path))
    for a_file in file_names:
      print(a_file)
      temp_df = preprocess(os.path.join(temp_path, a_file))

      # take preprocessed file, label, and add to the big preprocessed df
      temp_df.loc[:, ['condition', 'day', 'id']] = condition, day, a_file
      tracking_parts.append(temp_df)

      # take preprocessed file, get bouts, label, and add to bouts df
      temp_bouts = get_bouts(temp_df)
      temp_bouts.loc[:, ['condition', 'day', 'id']] = condition, day, a_file
      bouts_parts.append(temp_bouts)

      # take preprocessed file, get bouts split by bin, label, and add to bins df
      # (the last bin edge is dropped because it only closes the final bin)
      temp_bins = get_bouts(temp_df, bins=bins_array[:-1])
      temp_bins.loc[:, ['condition', 'day', 'id']] = condition, day, a_file
      bins_parts.append(temp_bins)

    print(len(file_names), 'files processed\n')

# build the final dataframes; they stay None if no file was processed
tracking_df = pd.concat(tracking_parts, ignore_index=True) if tracking_parts else None
bouts_df = pd.concat(bouts_parts, ignore_index=True) if bouts_parts else None
bins_df = pd.concat(bins_parts, ignore_index=True) if bins_parts else None

# uncomment below if you want to save newly built dfs for reference
# tracking_df.to_csv(os.path.join(output_path, 'tracking_df.csv'), index=False)
# bouts_df.to_csv(os.path.join(output_path, 'bouts_df.csv'), index=False)
# bins_df.to_csv(os.path.join(output_path, 'bins_df.csv'), index=False)

# Graph the data

## Scatter Bar Plots



In [177]:
def scatter_bar_plot(bouts_data, conditions, days, rosa, target_col, agg_func, colors, ylim_top=None, ylab='default ylab', plot_title='default title'):
  """Create a scatter bar plot.

  Creates a scatter bar plot using bout data: one panel per day, one jittered
  column of points per condition, and one point per fly ('id'). The data
  plotted is specified by commonly analyzed variables such as when the fly is
  in ROSA, bout duration, distance traveled, etc. Aggregate functions can also
  be specified and are used to summarize your variable of interest per fly.
  The data used to produce the graph is saved as a csv and the graph is saved
  as a pdf, both named after `plot_title`, in the notebook-level `output_path`.

  Parameters
  ----------
  bouts_data : pandas.DataFrame
    A DataFrame containing bout data created in the "Preprocess the data" step
  conditions : list
    A list of condition names specified initially
  days : list
    A list of day names specified initially; a single day is supported
  rosa : int
    1 (meaning in ROSA) or 0 (meaning in RONSA)
  target_col : str
    The name of the column of interest from the bouts data
  agg_func : str
    The name of a pandas aggregate function like "sum" and "mean"
  colors : list
    2d list of colors, indexed as colors[condition index][day index]
  ylim_top : int, default = None
    The max value of the y-axis for the plot
  ylab : str, default = 'default ylab'
    The y-axis label
  plot_title : str, default = 'default title'
    The title of the plot; also used as the file name of the saved csv and pdf

  Raises
  ------
  KeyError
    If the aggregated data has no rows for a requested combination of
    condition, day and `rosa`.
  """
  # variables for plot transparency, width of columns, and angle of x labels
  t, w, ang = 0.8, 0.25, 20

  # create the grid for the multiple plots
  # squeeze=False always returns a 2d array of axes, so indexing by day also
  # works when there is only one day
  fig, ax = plt.subplots(1, len(days), sharey=True, squeeze=False, figsize=(2*len(days), 6))
  ax = ax[0]
  x = np.arange(len(conditions))*w

  # aggregate data for graphing: one value per condition, day, region, and fly
  agg_bouts_df = bouts_data.groupby(['condition', 'day', 'in', 'id']).agg({target_col:agg_func})

  for i, day in enumerate(days):
    for j, condition in enumerate(conditions):
      # select condition, day, and rosa from the aggregated data
      temp_data = agg_bouts_df.loc[(condition, day, rosa)][target_col]

      # use the selected data to scatter plot a single "bar"; the random
      # horizontal jitter keeps overlapping points visible
      jitter = np.random.uniform(-0.05, 0.05, len(temp_data))
      ax[i].scatter(j*w+jitter, temp_data, color=colors[j][i], alpha=t)

    # labeling and aesthetic for a day's particular graph
    ax[i].set_xticks(x, labels=conditions, rotation=ang)
    ax[i].set_title(day)
    ax[i].spines[['right', 'top']].set_visible(False)

  # more labeling and standardizing of yaxis (the panels share their y-axis)
  ax[0].set_ylim(0, ylim_top)
  ax[0].set_ylabel(ylab)
  fig.suptitle(plot_title)

  # save the data used for graphing
  save = agg_bouts_df.reset_index()
  save.to_csv(os.path.join(output_path, f'{plot_title}.csv'), index=False)

  # save figure
  fig.savefig(os.path.join(output_path, f'{plot_title}.pdf'), transparent=True)

### Plot the Scatter Bar Plots

In [None]:
# every scatter bar plot to draw, one row per figure, drawn in the order listed
# columns: rosa, target_col, agg_func, ylim_top, ylab, plot_title
# ylim_top is None unless you need to change the max value of the y-axis
scatter_plot_specs = [
  # time: total time spent in rosa/ronsa
  (1, 'duration', 'sum', None, 's', 'Total Time Spent in ROSA'),
  (0, 'duration', 'sum', None, 's', 'Total Time Spent in RONSA'),

  # time: average time spent in rosa/ronsa, ylim_top is specified in the rosa case
  (1, 'duration', 'mean', 50, 's', 'Average Time Spent in ROSA'),
  (0, 'duration', 'mean', None, 's', 'Average Time Spent in RONSA'),

  # distance: total distance traveled in rosa/ronsa
  (1, 'distance', 'sum', None, 'cm', 'Total Distance Traveled in ROSA'),
  (0, 'distance', 'sum', None, 'cm', 'Total Distance Traveled in RONSA'),

  # distance: average distance traveled in rosa/ronsa
  (1, 'distance', 'mean', None, 'cm', 'Average Distance Traveled in ROSA'),
  (0, 'distance', 'mean', None, 'cm', 'Average Distance Traveled in RONSA'),

  # average speed across bouts
  (1, 'speed', 'mean', None, 'cm/s', 'Average Speed in ROSA'),
  (0, 'speed', 'mean', None, 'cm/s', 'Average Speed in RONSA'),

  # total entries to rosa
  (1, 'entry', 'sum', None, 'Entries', 'Total Entries to ROSA'),

  # average max depth in rosa/ronsa, ylim_top=4 for alignment with openbarr
  (1, 'max depth', 'mean', 4, 'cm', 'Average Depth in ROSA'),
  (0, 'max depth', 'mean', 4, 'cm', 'Average Depth in RONSA'),

  # samples
  (1, 'sample', 'sum', None, 'Samples', 'Total Samples'),
]

for spec_rosa, spec_col, spec_agg, spec_top, spec_ylab, spec_title in scatter_plot_specs:
  scatter_bar_plot(bouts_df, conditions, days, rosa=spec_rosa, target_col=spec_col, agg_func=spec_agg,
                   ylim_top=spec_top, colors=colors, ylab=spec_ylab, plot_title=spec_title)

## Behavior Across Time Plots

In [191]:
# add bins to the bins df beforehand: label every row with the number (1, 2, ...)
# of the bin its 'start' time falls in
# right=False makes the bins left-closed, [edge, next edge), which matches
# get_bouts: it starts a bin's segment at the first row whose time is >= the
# bin edge, so a segment starting exactly on an edge belongs to the new bin
# rows whose 'start' lies outside [0, total_time) get NaN
bins_df['bin'] = pd.cut(bins_df['start'], bins_array, right=False, labels=np.arange(total_time/bin_size)+1)

def beh_time_plot(bins_data, bins, conditions, days, rosa, target_col, agg_func, colors, ylims=(None, None), ylab='default ylab', plot_title='default title'):
  """Create a behavior across time plot.

  Creates a behavior across time plot using binned data: one panel per day and
  one line per condition. The variable of interest is first aggregated per fly
  ('id') and bin with `agg_func`; the line shows the mean of these per-fly
  values in each bin and the shaded band shows the mean +/- one standard
  deviation. Bins without any data for a condition appear as gaps. The per-fly
  data is saved as a csv and the graph is saved as a pdf, both named after
  `plot_title`, in the notebook-level `output_path`.

  Parameters
  ----------
  bins_data : pandas.DataFrame
    A DataFrame containing binned data created in the "Preprocess the data"
    step with a 'bin' column added beforehand. The bin labels must be the
    numbers 1, 2, ..., len(bins) - 1.
  bins : np.array
    An array of bin edges; it determines how many bins are on the x-axis
  conditions : list
    A list of condition names specified initially
  days : list
    A list of day names specified initially; a single day is supported
  rosa : int
    1 (meaning in ROSA) or 0 (meaning in RONSA)
  target_col : str
    The name of the column of interest from the bins data
  agg_func : str
    The name of a pandas aggregate function
  colors : list
    2d list of colors, indexed as colors[condition index][day index]
  ylims : tuple, default = (None, None)
    The min and max values of the y-axis
  ylab : str, default = 'default ylab'
    The y-axis label
  plot_title : str, default = 'default title'
    The title of the plot; also used as the file name of the saved csv and pdf

  Raises
  ------
  KeyError
    If the aggregated data has no rows for a requested combination of
    condition, day and `rosa`.
  """
  # create the grid for the multiple plots
  # squeeze=False always returns a 2d array of axes, so indexing by day also
  # works when there is only one day
  fig, ax = plt.subplots(1, len(days), sharey=True, squeeze=False, figsize=(8*len(days), 4.5))
  ax = ax[0]

  # bin numbers 1..n, one per interval between consecutive bin edges
  x = np.arange(1, len(bins), dtype=float)

  # aggregate data for graphing: one value per condition, day, region, bin, and fly
  agg_bins_df = bins_data.groupby(['condition', 'day', 'in', 'bin', 'id'], observed=True).agg({target_col:agg_func})

  for i, day in enumerate(days):
    for j, condition in enumerate(conditions):
      # select condition, day, and rosa from the aggregated data, then
      # summarize across flies within each bin
      per_fly = agg_bins_df.loc[(condition, day, rosa)].reset_index()
      summary = per_fly.groupby('bin', observed=True)[target_col].agg(['mean', 'std'])

      # line the summary up with every bin on the x-axis; bins without data
      # become NaN so x and y always have the same length
      summary.index = summary.index.astype(float)
      summary = summary.reindex(x)

      # use the selected data to plot a single line for a condition and day
      ax[i].plot(x, summary['mean'], label=condition, color=colors[j][i])

      # plot the error band (mean +/- standard deviation)
      error_lb = summary['mean'] - summary['std']
      error_ub = summary['mean'] + summary['std']
      ax[i].fill_between(x, error_lb, error_ub, color=colors[j][i], alpha=0.2)

    # labeling and aesthetic for a day's particular graph
    ax[i].set_xticks(x)
    ax[i].set_xlim(x[0], x[-1])
    ax[i].set_title(day)
    ax[i].spines[['right', 'top']].set_visible(False)

  # more labeling and standardizing of yaxis (the panels share their y-axis)
  ax[0].set_ylim(ylims)
  ax[0].set_ylabel(ylab)
  ax[0].legend()
  fig.suptitle(plot_title)

  # save the data used for graphing
  save = agg_bins_df.reset_index()
  save.to_csv(os.path.join(output_path, f'{plot_title}.csv'), index=False)

  # save figure
  fig.savefig(os.path.join(output_path, f'{plot_title}.pdf'), transparent=True)

In [None]:
# total time spent in rosa per bin, averaged across flies
beh_time_plot(
  bins_df,
  bins_array,
  conditions,
  days,
  rosa=1,
  target_col='duration',
  agg_func='sum',
  colors=colors,
  ylab='seconds',
  plot_title='Time Spent in ROSA',
)