# Jupyter Notebook UI to analyze baseline data from tap-habituation experiments!

Version 1.3 - Joseph Liang, Rankin Lab
Updated:
1. Upgraded folder path selection application
2. Upgraded dataset management (less moving parts for end-user)
3. output changed from tif -> png

## Known bug: Step 2 an empty windows displays in Mac. May also apply to linux/windows.

## Beginner Essentials:
1. Shift-Enter to run each cell. After you run, you should see an output "done step #". If not, an error has occured
2. When inputting your own code/revising the code, make sure you close all your quotation marks '' and brackets (), [], {}.
3. Don't leave any commas (,) hanging! (make sure an object always follows a comma. If there is nothing after a comma, remove the comma!
4. Learning to code? Each line of code is annotated to help you understand how this code works!

## 3. Run all cells/steps sequentially, even the ones that do not require input

## Steps that require input: #3, #6.1, #7

# 1. Importing Packages Required (No input required, just run)

In [1]:
import pandas as pd #<- package used to import and organize data
import numpy as np #<- package used to import and organize data
import seaborn as sns #<- package used to plot graphs
from matplotlib import pyplot as plt #<- package used to plot graphs
import os #<- package used to work with system filepaths
from ipywidgets import widgets #<- widget tool to generate button
from IPython.display import display #<- displays button
from ipyfilechooser import FileChooser
# from tkinter import Tk, filedialog #<- Tkinter is a GUI package
from tqdm.notebook import tqdm
# import dask.dataframe as dd
import pingouin as pg
pd.set_option('display.max_columns', 50)
print("done step 1")

done step 1


# 2. Pick filepath (just run and click button)

Run the following cell and clicke the button 'Select Folder' to pick a filepath.

## Important: Later on, this script uses the total file path for each file to import and group data. That means if your folder has whatever your strain is named, the script will not work.

(ex. if your folder has "N2" in it this script sees all files inside this folder as having the "N2" search key)

## An easy fix is to just rename your folder to something else (make your strains lower-case, or just have the date)

In [2]:
starting_directory = '/Users/Joseph'
chooser = FileChooser(starting_directory)
display(chooser)

FileChooser(path='/Users/Joseph', filename='', title='', show_hidden=False, select_desc='Select', change_desc=…

In [3]:
print(chooser.selected_path)
folder_path=chooser.selected_path

/Volumes/JosephSSD2T/PD_Screen_Pt2


In [None]:
# Obsolete
# ### Select Folder App - After you run, you can select your folder for filepath
# button = widgets.Button(description = 'Select Folder') #<- creates a button variable
# display(button) #<- displays the button in output

# def select_folder(b): #<- defines an action. This action requires a variable, so an arbitrary one 'b' is there
#     global folder_path #<- sets a variable as a global variable, not just within this action
#     #Tk().withdraw() #<- Tkinter likes to create annoying empty windows. This removes them
#     folder_path = filedialog.askdirectory() #<- Opens up a file explorer window, and determines folder path
#     #Tk().update() #<- below
#     #Tk().destroy() #<- this and the line above it removes the file explorer window after a selection is made
#     print(folder_path) #<- this helps confirm that this action was performed
#     print('done step 2')
# button.on_click(select_folder)

# 3. User Defined Variables (Add input here)

Here, we add some constants to help you blaze through this code.

## 3.1: Setting time bins
pretty self explanatory



## 3.2: Setting view range for your graph
Top, bottom = y axis view range
left, right = x axis view range



In [4]:
# Setting 1s Bins: for 0.5s Bins: 0,360,721
# bins = np.linspace(0,360,721) #<- np.linspace(start, end, steps in between)
# bins = np.linspace(0,420,841) #<- np.linspace(start, end, steps in between)
# bins = np.linspace(0,420,421) # 1 second bins (smoother lines on graph)
bins = np.linspace(0,1200,1201) # 1 second bins (smoother lines on graph)
print(bins)


print("done step 3")

[0.000e+00 1.000e+00 2.000e+00 ... 1.198e+03 1.199e+03 1.200e+03]
done step 3


# 4. Construct filelist from folder path (No input required, just run)

In [5]:
#folder_path = '/Users/Joseph/Desktop/lrk1_ATR_NoATR_08_15_2019'
print(folder_path)
os.chdir(folder_path) #<- setting your working directory so that your images will be saved here

filelist = list() #<- empty list
for root, dirs, files in os.walk(folder_path): #<- this for loop goes through your folder 
    for name in files:
        if name.endswith('.dat'): #<- and takes out all files with a .dat (file that contains your data)
            if "_" in name.split(".")[-2]:
                filepath = os.path.join(root, name) #<- Notes down the file path of each data file
                filelist.append(filepath) #<- saves it into the list

    
print(filelist)
print('done step 4')

/Volumes/JosephSSD2T/PD_Screen_Pt2
['/Volumes/JosephSSD2T/PD_Screen_Pt2/gly-5_gk3119/20231116_144810/VC3230_10x2_f96h20C_600s31x10s10s_A1112cc.dat', '/Volumes/JosephSSD2T/PD_Screen_Pt2/gly-5_gk3119/20231116_153132/VC3230_10x2_f96H20C_600s31x10s10s_B1112ce.dat', '/Volumes/JosephSSD2T/PD_Screen_Pt2/gly-5_gk3119/20231116_135938/VC3230_10x2_f96H20C_600s31x10s10s_B1112cb.dat', '/Volumes/JosephSSD2T/PD_Screen_Pt2/gly-5_gk3119/20231116_132201/VC3230_9x2_f96h20C_600s31x10s10s_C1112ca.dat', '/Volumes/JosephSSD2T/PD_Screen_Pt2/gly-5_gk3119/20231116_145544/VC3230_10x2_f96h20C_600s31x10s10s_C1112cd.dat', '/Volumes/JosephSSD2T/PD_Screen_Pt2/klu-1_ok1306/20231215_191355/RB1241_10x2_f96h20C_600s31x10s10s_B1211dc.dat', '/Volumes/JosephSSD2T/PD_Screen_Pt2/klu-1_ok1306/20231215_171723/RB1241_9X2_f96h20C_600s31x10s10s_C1211db.dat', '/Volumes/JosephSSD2T/PD_Screen_Pt2/klu-1_ok1306/20231215_204042/RB1241_10X2_f96h20C_600s31x10s10s_C1211de.dat', '/Volumes/JosephSSD2T/PD_Screen_Pt2/klu-1_ok1306/20231215_2040

In [9]:
print(filelist[3])
test=filelist[3]
test1=filelist[0]
print(test.split(".")[-2])
print(test1.split(".")[-2])

/Volumes/JosephSSD2T/ASD_Screen/vab-10_gk45/20160406_155046/VC117_5x4_f94h20C_600s30x10s10s_A0406bc.00004.dat
00004
/Volumes/JosephSSD2T/ASD_Screen/vab-10_gk45/20160406_155046/VC117_5x4_f94h20C_600s30x10s10s_A0406bc


# 5. Process Data Function (No input required, just run)

In [1]:
# string='/Volumes/JOSEPH/PD_Screen/cat-2_e1112/20220510_190557/CB1112_10x2_f72h20C_600s31x10s10s_B0506cb.dat'
# print(string.split('/')[-3])
# print(string.split('/')[-2].split('_')[0])
# print(string.split('/')[-1].split('_')[-1].split('.')[0])
# print(string.split('/')[-4])


# genotype=[]
# for f in filelist:
#     genotype.append(f.split('/')[-3])
# # print(np.unique(genotype))
# genotypes=np.unique(genotype)
# print(genotypes)
# print(len(np.unique(genotypes)))
# strainNs = enumerate(genotypes,1)
# strain_enum=list(strainNs)
# print(strain_enum)
# StrainNames=dict(strain_enum)
# print(StrainNames)

In [7]:
def ProcessData(strain): #<- an example of a user-defined function
    strain_filelist = [x for x in filelist if strain in x] #<- goes through the list and filters for keyword
    Strain_N = len(strain_filelist) #<- N per strain, or number of plates
    Plate_N = 1
    print(f'this strain/treatment has {Strain_N} plates') #<- will output as the first number
    if Strain_N == 0:
        raise AssertionError ('{} is not a good identifier'.format(strain))
    else:
        pass
        print(f'now working on strain {strain}')
        strain_filelist = [file for file in filelist if strain in file]
        df_list=[]
        for file in strain_filelist:
            if file.split('/')[-1].startswith('._'):
                pass
            else:
                # print(f"now processing file {file}")
                try:
                    df= pd.read_csv(file, sep=' ', header = None, encoding_errors='ignore')
                    df['Plate_id'] = file.split('/')[-1].split('_')[-1].split('.')[0]
                    df['Date'] = file.split('/')[-2].split('_')[0]
                    df['Screen'] = file.split('/')[-4]
                    df_list.append(df)
                except:
                    print(f"error in file {file}")
                    pass
        DF_Total = pd.concat(df_list, ignore_index = True)
#     for f in strain_filelist:
#         DF_Total = pd.concat(pd.read_csv(f, sep=' ', skiprows = 4, header = None))
        # DF_Total = pd.concat([pd.read_csv(f, sep=' ', header = None) for f in strain_filelist],
        #               ignore_index=True) #<- imports your data files
#         DF_Total = DF_Total.dropna(axis = 1) #<- cleans your data
        DF_Total = DF_Total.rename( #<- more cleaning
                    {0:'Time',
                    1:'n',
                    2:'Number',
                    3:'Instantaneous Speed',
                    4:'Interval Speed',
                    5:'Bias',
                    6:'Tap',
                    7:'Puff',
                    8:'x',
                    9:'y',
                    10:'Width',
                    11:'Length',
                    12:'Area',
                    13:'Angular Speed',
                    14:'Aspect Ratio',
                    15:'Kink',
                    16:'Curve',
                    17:'Crab'}, axis=1)
        #check function here for NaN Columns
        DF_Total['plate'] = 0
        # DF_Total['prob'] = DF_Total['stim_rev']/ (DF_Total['no_rev'] + DF_Total['stim_rev']) #<- calculate prob
        # DF_Total['speed'] = DF_Total['dist']/DF_Total['dura'] #<- calculate speed
        # DF_Total_rows = int(DF_Total.shape[0])
        # print(f'this strain/treatment has {DF_Total_rows} total taps') #<- Outputs as the second number. Check if you are missing taps!
        # DF_Final = DF_Total[["time", "dura", "dist", "prob", "speed", "plate"]].copy()

    return{
            'N': Strain_N,
            'Confirm':DF_Total
            # 'Final': DF_Final
    }
    print('done!')


print('done step 5')

done step 5


# 6.1 Processing Data (Please Read, Input Here)

Here is the hardest part - From your naming convention, you need to pick a unique identifier for each group.

This means that all of names of your files for that strain should have that in common but is not commone with across all other files! If you did a good job naming your files and following a good naming convention, this should be easy.

## Be careful and really look hard in your naming structure. Note you want an unique identifier in the entire file path for the same group of files. An easy mistake is to have the strain name in the folder name, in this case if you use your strain name as a keyword it would include all files in that folder!

For example, if all your N2 files have a certain pattern like "N2_5x4" in this following example:
'/Users/Joseph/Desktop/AVR14_10sISI_TapHab_0710_2019/N2/20190710_141740/N2_5x4_f94h20c_100s30x10s10s_C0710ab.trv'

Then you need to set that identifier for the strain keyword:
'Strain_1' = 'N2_5x4'

In the same example, if the identifier for your second strain is 'AVR14', then the N2 files will also be included, as this identifier can also be found in this file path.

## Depending on how many strains you are running for comparison, you may need to add/delete some lines.

You are not naming your data groups here, we have a step for that later.
## Here, you want to note down ALL the strains you have in the folder

If you have just 2 strains, add hashtags (#) in front of the lines you do not need.
If you need more strains, just add more lines, following the same format!

In [8]:
# TotalConcatenated[['Genotype', 'Treatment']] = TotalConcatenated['Strain'].str.split(' - ', 1, expand=True)
# print(list(set(filelist)))
# nfiles = list(range(1, len(filelist.unique())+1))
# print(nfiles)
# strainnames = []
strainnames=[]
for f in filelist:
    strainnames.append(f.split('/')[-3])
ustrainnames=list(set(strainnames))
print(ustrainnames)
# ustrainnames.sort()
# # print(ustrainnames)
ustrainnames.insert(0, ustrainnames.pop(ustrainnames.index("N2")))
# ustrainnames.insert(0, ustrainnames.pop(ustrainnames.index("N2_XJ1")))
# ustrainnames.insert(0, ustrainnames.pop(ustrainnames.index("N2_N2")))
# print(ustrainnames)
# ustrainnames.insert(0, ustrainnames.pop(ustrainnames.index("N2_OffFood")))
# # print(ustrainnames)
nstrains=list(range(1,len(ustrainnames)+1))
print(nstrains)

StrainNames = {}
StrainNames = {nstrains[i]: ustrainnames[i] for i in range(len(nstrains))}
print(StrainNames)
print("done step 6.1")

# <---------------- Test element to use for dictionary buidling -------------------
# s = '/Users/Joseph/Desktop/OnFoodOffFoodTest/N2_OnFood/20220401_163048/N2_10x1_n96h20C_360sA0401_ka.00065.dat'
# slist=s.split('/')[5]
# print(slist)
# print(list(range(1,5+1)))

['smrc-1_tm13785', 'unc-70_e524', 'klp-12_tm5176', 'immt-1_tm1730', 'ubh-3_tm2550', 'sek-4_gk3642', 'ubh-2_tm2267', 'N2', 'klp-12_tm15618', 'klu-1_ok1306', 'T01G9_tm5159', 'immt-1_tm11505', 'scav-5_ok1606', 'ipla-3_tm1584', 'imp-1_ok2362', 'mig-15_tm13123', 'har-1_gk3124', 'ubh-1_tm526', 'gly-5_gk3119', 'miro-1_tm1966', 'catp-7_tm4438', 'gly-7_gk374', 'rme-8_b1023', 'immt-1_tm2366', 'mbk-1_ok402', 'F35C8_tm4028']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
{1: 'N2', 2: 'smrc-1_tm13785', 3: 'unc-70_e524', 4: 'klp-12_tm5176', 5: 'immt-1_tm1730', 6: 'ubh-3_tm2550', 7: 'sek-4_gk3642', 8: 'ubh-2_tm2267', 9: 'klp-12_tm15618', 10: 'klu-1_ok1306', 11: 'T01G9_tm5159', 12: 'immt-1_tm11505', 13: 'scav-5_ok1606', 14: 'ipla-3_tm1584', 15: 'imp-1_ok2362', 16: 'mig-15_tm13123', 17: 'har-1_gk3124', 18: 'ubh-1_tm526', 19: 'gly-5_gk3119', 20: 'miro-1_tm1966', 21: 'catp-7_tm4438', 22: 'gly-7_gk374', 23: 'rme-8_b1023', 24: 'immt-1_tm2366', 25: 'mbk-1_ok4

In [None]:
# # ------------This Cell Is Now Defunct ----------------


# #Format: Strain_# = 'unique_identifier'

# ### Make into dictionary
# StrainNames = {
#     'Strain_1' : 'No_Parafilm',   #<- each strain will be designated to a unique identifier here
#     'Strain_2' : 'Yes_Parafilm',
# #     'Strain_3' : 'e1112_OffFood',
# #     'Strain_4' : 'e1112_OnFood',
# #     'Strain_5' : 'Test_OffFood',
# #     'Strain_6' : 'Test_OnFood',
# #     'Strain_5' : 'N2_NoFood',
# #     'Strain_6' : 'N2_Food',
# #     'Strain_9' : 'LX636_NoFood',  #<- empty entries are for those hardcore trackers that tracking this many strains
# #     'Strain_10' : 'LX636_Food',
# #     'Strain_11' : '',
# #     'Strain_12' : '',
# #     'Strain_13' : '',
# #     'Strain_14' : '',
# #     'Strain_15' : '',
# }
# #...etc, etc

# print('done step 6.1')
# print(StrainNames)

# Cell below is to run testing/debugging. Do not need to run (commented out)

In [None]:


# DF_Read = pd.read_csv('/Users/Joseph/Desktop/OnFoodOffFoodTest/N2_OnFood/20220401_163048/N2_10x1_n96h20C_360sA0401_ka.00065.dat'
#                       , sep=' ', header = None, index_col=False) #<- imports and cleans data
# DF_Read["worm"]=1
# # print(DF_Read)
# DF_Total = DF_Read #<- more data cleaning
# DF_Total = DF_Total.rename( #<- more data cleaning
#             {0:'time',
#             1:'speed',
#             2: "x",
#             3: "y",
#             4: "angularspeed",
#             5: "pathlength",
#             6: "kink",
#             7: "bias",
#             8: "curve",
#             9: "dir",
#             10: "crab",
#             11: "length",
#             12: "midline",
#             13: "width",
#             14: "morphwidth",
#             15: "area"
#             }, axis=1)
# DF_Total["x_0"] = DF_Total.x - DF_Total.x.iloc[0]
# DF_Total["y_0"] = DF_Total.y - DF_Total.y.iloc[0]

# DF_Total["x_test"] = DF_Total.iloc[:,2] - DF_Total.iloc[0,2]
# DF_Total["y_test"] = DF_Total.iloc[:,3] - DF_Total.iloc[0,3]
# DF_Total["distance"]= 0
# print(DF_Total)

# A_i = np.array(DF_Total['pathlength'][1:])
# A_i_1 = np.array(DF_Total['pathlength'][0:-1])
# result = np.abs(A_i - A_i_1).tolist()
# result.insert(0,0)
# curr_sum = 0
# new_list = []
# for i in range(len(result)):
#     curr_sum += result[i]
#     new_list.append(curr_sum)
# DF_Total["distance"]=new_list
# print(DF_Total)



        


# A_i = np.array(DF_Total.iloc[1:,5])
# A_i_1 = np.array(DF_Total.iloc[0:-1,5])
# result = np.abs(A_i - A_i_1).tolist()
# result.insert(0,0)
# curr_sum = 0
# new_list = []
# for i in range(len(result)):
#     curr_sum += result[i]
#     new_list.append(curr_sum)
# print(A_i)
# print(A_i_1)
# print(len(A_i))
# print(len(A_i_1))
# # resultS=pd.Series(result)
# # print(resultS.cumsum())
# # print(new_list)

# curr_sum = 0
# new_list = []
# for i in range(len(result)):
#     curr_sum += result[i]
#     new_list.append(curr_sum)
    
    
# import matplotlib.pyplot as plt
# plt.plot(new_list)

# 6.2 Process Data (just run this cell)

In [9]:
DataLists = [0] #<- generates empty list. 0 is there to account for python's index starting at 0. 
# we want indexing to start at 1 (when I say #1 I want the first point, not the second point)

for s in tqdm(StrainNames.values()): #<- goes through the dictionary in step 6.1 and processes data
    if not s == '':
        DataLists.append(ProcessData(s)['Confirm']) #<- appends all data into a list of dataframes

# print(DataLists[2])
print('done step 6.2')

  0%|          | 0/26 [00:00<?, ?it/s]

this strain/treatment has 49 plates
now working on strain N2
this strain/treatment has 5 plates
now working on strain smrc-1_tm13785
this strain/treatment has 5 plates
now working on strain unc-70_e524
this strain/treatment has 4 plates
now working on strain klp-12_tm5176
this strain/treatment has 5 plates
now working on strain immt-1_tm1730
this strain/treatment has 5 plates
now working on strain ubh-3_tm2550
this strain/treatment has 5 plates
now working on strain sek-4_gk3642
this strain/treatment has 5 plates
now working on strain ubh-2_tm2267
this strain/treatment has 8 plates
now working on strain klp-12_tm15618
this strain/treatment has 5 plates
now working on strain klu-1_ok1306
this strain/treatment has 5 plates
now working on strain T01G9_tm5159
this strain/treatment has 4 plates
now working on strain immt-1_tm11505
this strain/treatment has 5 plates
now working on strain scav-5_ok1606
this strain/treatment has 5 plates
now working on strain ipla-3_tm1584
this strain/treatmen

# Convert float64 data to float32 to reduce memory load (can also convert to 16 if needed)

For plain english:

float16 = 4 decimal points

float32 = 8 decimal points

float64 = 16 decimal points

more decimal points = more data/memory that computer has to keep track of

In [None]:
print(DataLists[1])

In [None]:
#No need to run here
for n in tqdm(DataLists[1:]):
#     print(n)
    TestData=n
    TestData[TestData.select_dtypes(np.float64).columns] = TestData.select_dtypes(np.float64).astype(np.float16)
    print("done this strain")

In [None]:
#No need to run here

# print(TotalConcatenated.dtypes)
TotalConcatenated['time_bin'] = TotalConcatenated['time_bin'].astype(np.float16)
# print(TotalConcatenated.dtypes)
# TotalConcatenated.dtypes
# Test Cell
# DataLists[1].to_csv("test.csv")
Test = TotalConcatenated.reset_index(drop=True)
print(Test)

# 7. Grouping Data and Naming (Optional: Add input here)

Here, you get to name your data groups/strain! Name your groups however you like under between the quotation marks for each strain.

For example: If your Strain1 is N2 and you wish for the group to be called N2,
your line should look like:

DataLists[x].assign(dataset = 'N2')

## Go back to step 6.1 to check which strain is which item on the DataLists.
In this example, the first item on DataLists is AQ2028_b.


## Remember: Put your name in quotes. (ex: 'N2' and not N2)

As default, the names are set to the unique identifier labels.

## Depending on the number of strains you are running the comparison, you may have to delete/add lines of code (following the same format). 
## Remember to add/delete commas too.

# If you want to change your groups, you do that here. 
For example, if you have 5 strains in your folder but only want to compare between 2 or 3 strains, designate that here and follow through with steps 6 and 7. Once you are done, come back to step 6 and change your groups again (You are going to have to change your graph titles for the second run-through though)!

In [10]:
TotalConcatenated=pd.concat(df.assign(dataset=StrainNames.get(i+1)) for i,df in enumerate(DataLists[1:]))
TotalConcatenated[['Gene', 'Allele']] = TotalConcatenated['dataset'].str.split('_', n=1, expand=True)
TotalConcatenated['Allele']=TotalConcatenated['Allele'].fillna('N2')
Baseline_data=TotalConcatenated[((TotalConcatenated.Time<=590)&(TotalConcatenated.Time >=490))]
Baseline_data=Baseline_data.drop(columns=["plate", "Tap", "Puff", "x","y"]).reset_index()
# TotalConcatenated=TotalConcatenated.dropna()
# TotalConcatenated = TotalConcatenated.reset_index(drop=True)
print(Baseline_data)
# TotalConcatenated.to_csv("tap_baseline_output.csv")
# print("done output")
print('done step 7')

        index     Time   n  Number  Instantaneous Speed  Interval Speed  \
0        8197  490.047  89      59               0.0346          0.0380   
1        8198  490.110  89      59               0.0355          0.0391   
2        8199  490.173  89      59               0.0427          0.0419   
3        8200  490.233  89      59               0.0416          0.0441   
4        8201  490.298  89      59               0.0441          0.0432   
...       ...      ...  ..     ...                  ...             ...   
296691  66847  589.763  92      64               0.0388          0.0552   
296692  66848  589.827  92      64               0.0375          0.0490   
296693  66849  589.887  92      64               0.0377          0.0496   
296694  66850  589.941  92      64               0.0473          0.0579   
296695  66851  589.992  92      64               0.0397          0.0571   

         Bias   Width  Length      Area  Angular Speed  Aspect Ratio  Kink  \
0       0.034  0.0917

In [11]:
Baseline_data['Screen']='PD_Screen'
print(Baseline_data)

        index     Time   n  Number  Instantaneous Speed  Interval Speed  \
0        8197  490.047  89      59               0.0346          0.0380   
1        8198  490.110  89      59               0.0355          0.0391   
2        8199  490.173  89      59               0.0427          0.0419   
3        8200  490.233  89      59               0.0416          0.0441   
4        8201  490.298  89      59               0.0441          0.0432   
...       ...      ...  ..     ...                  ...             ...   
296691  66847  589.763  92      64               0.0388          0.0552   
296692  66848  589.827  92      64               0.0375          0.0490   
296693  66849  589.887  92      64               0.0377          0.0496   
296694  66850  589.941  92      64               0.0473          0.0579   
296695  66851  589.992  92      64               0.0397          0.0571   

         Bias   Width  Length      Area  Angular Speed  Aspect Ratio  Kink  \
0       0.034  0.0917

In [None]:
Baseline_data.to_csv("PD2_new_tap_baseline_output.csv")
print('done')

# END

# Done!