# Summary analysis for ongoing selection


- For this to work, adapt the folder paths in the first code cell to where your raw data and metadata live.
- You also need to clone the jlsocialbehavior repository and create the `jlsocial` environment from its yml file.

In [None]:
import os

# --- folder configuration: adapt these paths to your machine ---

# Folder holding the metadata workbook.
# (The original base folder was 'Y:\\Johannes\\b\\2019\\'.)
metaFolder = 'Y:\\Carlos\\shoaling_assay_data\\'
metaFile = 'MetaData_CR.xlsx'

# Local clone of the jlsocialbehavior code.
codeDir = 'D:\\Documents\\jlsocialbehavior'

# Both folders are stored in the processing settings table further down;
# the per-experiment summary csv files are later read from ProcessingDir.
ProcessingDir = 'Y:\\03 Common Use\\temp\\temp_processing\\'
outputDir = 'Y:\\03 Common Use\\temp\\temp_output\\'

print(f'searching for meta info here: {metaFolder}')

# Switch to the code folder (presumably so that the project packages
# `functions` and `models` can be imported in the next cell).
os.chdir(codeDir)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.stats.api as sms

%config InteractiveShellApp.pylab_import_all = False
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import sys
import fnmatch

import math
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
import glob
#import h5py
from datetime import datetime
import PythonMeta as PMA
from matplotlib.ticker import AutoLocator

import functions.matrixUtilities_joh as mu
import functions.notebookHelper as nh
import functions.metaTree as mt

import models.experiment as xp
import models.experiment_set as es
import functions.paperFigureProps as pfp
pfp.paper()

In [None]:
# Load the experiment table from sheet 'AllExp' of the metadata workbook and display it.
info=pd.read_excel(metaFolder+metaFile, sheet_name='AllExp')
info

In [None]:
# Load the animal table from sheet 'AllAn'; birth date ('bd') and experiment
# date ('expDate') are converted to datetimes (format YYYYMMDD).
dateCols=['bd','expDate']
infoAn=pd.read_excel(metaFolder+metaFile, sheet_name='AllAn',parse_dates=dateCols)
for col in dateCols:
    infoAn[col] = pd.to_datetime(infoAn[col], format='%Y%m%d')
infoAn.tail()

In [None]:
# Show the distinct genotype labels present in the animal table.
infoAn.genotype.unique()

In [None]:
# collect meta information and save to new csv file for batch processing
#
# For every experiment row in `info` this locates the trajectory and pair-list
# files, expands the animal numbers and looks up the animals' birth dates.
# Experiments whose folder is missing are skipped with a warning AND dropped
# from `info`, so the per-experiment lists always match `info` row for row.

aviPath=[]
posPath=[]
PLPath=[]
expTime = []
birthDayAll=[]
anIDsAll=[]
camHeightAll=[]
keptRows=[]  # index labels of the experiments that were found on disk

camHeight=[105,180] # these are outdated values, but we keep them for compatibility with old data


def _firstMatch(pattern):
    """Return the first path matching glob *pattern*; raise FileNotFoundError if there is none."""
    hits=glob.glob(pattern)
    if not hits:
        raise FileNotFoundError('no file found matching: ' + pattern)
    return hits[0]


for index,row in info.iterrows():

    startDir=row.path+'\\'+row.folder+'\\'
    print('processing: ' + startDir)
    if not os.path.exists(startDir):
        print('WARNING: path does not exist: ' + startDir)
        continue

    # resolve both files first, so a missing file fails before any list is extended
    currPos=_firstMatch(startDir+'PositionTxt*') #this is the trajectory file
    currPL=_firstMatch(startDir+'PL*') # this is the pair list file
    keptRows.append(index)
    posPath.append(currPos)
    PLPath.append(currPL)

    head, tail = os.path.split(currPos)
    currTime='dummy' #datetime.strptime(tail[-23:-4], '%Y-%m-%dT%H_%M_%S')
    expTime.append(currTime)

    # index 1 (180) if the folder path contains '_dn_', otherwise index 0 (105)
    camHeightAll.append(camHeight[int('_dn_' in head)]) ######### this needs to be adapted for new data, since the camHeight is not stored in the meta data anymore.

    # str() so that a single animal number stored as a numeric cell is accepted too
    anNrs=str(row.anNr) #Note that anNrs are 1 based!
    if ':' in anNrs:
        # 'a:b' denotes the inclusive range a..b
        a,b=anNrs.split(sep=':')
        anNrs=np.arange(int(a),int(b)+1)
    else:
        # otherwise a whitespace-separated list of numbers
        anNrs=np.array(anNrs.split()).astype(int)

    anIDs=anNrs #-1 no more 0-based since using pandas merge to find animal numbers
    anIDsAll.extend(anIDs)

    bd=infoAn[infoAn.anNr.isin(anIDs)].bd.values.astype(str) # get birth dates of animals in this experiment
    #bd=infoAn.bd.values[anIDs-1] #a bit dirty to use anIDs directly here. Should merge
    birthDayAll.append(' '.join(list(bd)))

# Drop the skipped experiments; without this the column assignments below fail
# with a length mismatch as soon as one folder is missing. The index is reset
# because later cells address experiments by position.
info=info.loc[keptRows].reset_index(drop=True)

info['camHeight']=camHeightAll
info['txtPath']=posPath
info['pairList']=PLPath
info['aviPath']='default'
info['birthDayAll']=birthDayAll
info['epiDur'] = 5      # duration of individual episodes (default: 5 minutes)
info['episodes'] = 24   # number of episodes to process: -1 to load all episodes (default: -1)
info['inDish'] = 10#np.arange(len(posPath))*120     # time in dish before experiments started (default: 10)
info['arenaDiameter_mm'] = 70 # arena diameter (default: 100 mm)
info['minShift'] = 60 # minimum number of seconds to shift for control IAD
info['episodePLcode'] = 0 # flag if first two characters of episode name encode animal pair matrix (default: 0)
info['recomputeAnimalSize'] = 0 # flag to compute animals size from avi file (takes time, default: 1)
info['SaveNeighborhoodMaps'] = 0 # flag to save neighborhood maps for subsequent analysis (takes time, default: 1)
info['computeLeadership'] = 0 # flag to compute leadership index (takes time, default: 1)
info['ComputeBouts'] = 1 # flag to compute swim bout frequency (takes time, default: 1)
#info['set'] = np.arange(len(posPath))   # experiment set: can label groups of experiments (default: 0)
info['ProcessingDir']=ProcessingDir
info['outputDir']=outputDir
info['expTime']=expTime
# NOTE(review): presumably episodes * epiDur (min) * 60 s * 30 frames/s, plus 11 frames - confirm
info['readLim'] = 24*5*60*30+11

In [None]:
# Write the assembled settings table to ProcessingDir; csvFile is handed to
# es.experiment_set in the next cell.
csvFile=os.path.join(ProcessingDir,'processingSettings.csv')
info.to_csv(csvFile,encoding='utf-8')
info.tail()

In [None]:
rereadData=1  # set to 0 to skip the processing step below
if rereadData:
    def readExperiment(keepData=True):
        """Build an experiment_set from csvFile (with MissingOnly=True).

        Returns the experiment_set when keepData is true, otherwise the
        placeholder value 1 so the loaded data is not kept referenced.
        """
        loaded=es.experiment_set(csvFile=csvFile,MissingOnly=True)
        return loaded if keepData else 1

    expSet=readExperiment(keepData=False)

In [None]:
# Locate one '*siSummary*.csv' in ProcessingDir per trajectory file listed in info.txtPath.
csvPath = []
for f in [mu.splitall(x)[-1][:-4] for x in info.txtPath]:
    hits=glob.glob(ProcessingDir+f+'*siSummary*.csv')
    if not hits:
        raise FileNotFoundError('no siSummary csv found for: ' + ProcessingDir+f)
    csvPath.append(hits[0])

# Offset between the animal indices of consecutive experiments.
# NOTE(review): assumes every experiment contributes exactly 35 entries to anIDsAll - confirm.
animalsPerSet=35
anLookup=np.array(anIDsAll)  # flat list of animal numbers, built once instead of per file

frames=[]
for i,fn in enumerate(csvPath):
    print(fn)
    tmp=pd.read_csv(fn,index_col=0,sep=',')
    # item assignment: attribute assignment would not create the column if it were missing
    tmp['animalSet']=i
    # translate the per-experiment animal index into the global animal number
    tmp['animalIndex']=anLookup[tmp['animalIndex']+i*animalsPerSet]
    frames.append(tmp)
# concatenate once instead of growing the frame inside the loop
df=pd.concat(frames) if frames else pd.DataFrame()

df['episode']=[x.strip().replace('_','') for x in df['episode']]
# attach line and genotype of every animal, then date and setup of every experiment
df=pd.merge(df,infoAn[['anNr','line','genotype']],left_on='animalIndex',right_on='anNr',how='left')
df=pd.merge(df,info[['date']],left_on='animalSet',right_on=info.index,how='left')
df['setup'] = info['setup'].values[df['animalSet'].values]
print('df shape',df.shape)
# cohort label: line name and experiment date joined by '_'
df['lineSet']=[x+'_'+y for x,y in zip(df.line, df.date)]


df.tail(10)

In [None]:
# Make a three-colour viridis palette the seaborn default; `co` holds the same colours.
sns.set_palette('viridis',3)
co=sns.color_palette("viridis", 3)

# Keep only samples with 80 < inDishTime < 240 (presumably minutes - confirm).
afterStart=df['inDishTime']>80
beforeEnd=df['inDishTime']<240
idx=beforeEnd & afterStart
dfDR=df[idx]

# Average all numeric columns per episode and animal (plus its descriptive labels).
groupKeys=['episode','animalIndex','line','setup','genotype','date','lineSet']
dfEpiAn=dfDR.groupby(groupKeys,sort=True).mean(numeric_only=True).reset_index()

In [None]:
# Preview the per-episode, per-animal averages.
dfEpiAn.head()

In [None]:

def sem(x):
    """Return the standard error of the mean of the finite values in *x*.

    NaN and infinite entries are ignored, matching the filtering done in
    ci95, so the sample size in the denominator is the number of values
    that actually entered the standard deviation.

    x: array-like of numbers (list, ndarray or pandas Series).
    Returns NaN when fewer than two finite values are available.
    """
    vals = np.asarray(x, dtype=float)
    vals = vals[np.isfinite(vals)]
    return np.std(vals, ddof=1) / np.sqrt(len(vals))

# 100 uniform random numbers in [0, 1), used to sanity-check sem and ci95 below
x=np.random.random(100)

def ci95(x):
    """Return the distance from the mean of *x* to the lower bound of its t confidence interval.

    The mean ignores NaN; the interval (statsmodels default level) is computed
    from the finite values only.
    """
    centre = np.nanmean(x)
    finiteVals = x[np.isfinite(x)]
    lowerBound = sms.DescrStatsW(finiteVals).tconfint_mean()[0]
    return centre - lowerBound

# Sanity check on the random sample x: compare the printed values with the
# theoretical standard deviation quoted in the message.
print('std of uniform = 0.2886751345948129. STDdata:',np.std(x), 'semData:',sem(x),'samples:',x.shape) 
print('ci95:',ci95(x))

In [None]:
# Summary statistics of `si` (mean, std, sem, ci95) for every combination of
# inDishTime, episode, genotype and lineSet, returned as a flat table.
# NOTE(review): the unstack()/stack(dropna=True) round trip presumably removes
# combinations without data - confirm before changing it.
dfPlot=(df.groupby(['inDishTime','episode','genotype','lineSet']).si.agg(['mean','std',sem,ci95])
    .unstack()
    .stack(dropna=True)
    .reset_index())

dfPlot.head()

In [None]:
# Leave the 'esc' genotype out of this overview.
notEsc = dfPlot['genotype'] != 'esc'
dfPlot_filtered = dfPlot[notEsc]

markers = {'hi': 'o', 'lo': 's'}  # adjust to your genotypes

# One small panel per lineSet, three panels per row, axes shared between panels.
g = sns.FacetGrid(
    dfPlot_filtered,
    col='lineSet',
    col_wrap=3,
    sharex=True,
    sharey=True,
    height=2,
    aspect=1,
)

# Mean si over time; colour encodes the episode, marker shape the genotype.
# NOTE(review): `size` is seaborn's size-grouping parameter; a fixed marker
# size is normally passed as `s` - confirm which one was intended.
scatterOpts = dict(
    x='inDishTime',
    y='mean',
    hue='episode',
    style='genotype',
    size=10,
    markers=markers,
    legend='full',
)
g.map_dataframe(sns.scatterplot, **scatterOpts)

g.set(xlim=(0, 2.5*60), ylim=(0, .5))
g.set_axis_labels('Time (Minutes)', 'Attraction')
g.set_titles('{col_name}')
g.figure.subplots_adjust(top=0.85)
g.figure.suptitle('Mean attraction, all animals', fontsize=14)

# Remove the per-panel legends ...
for ax in g.axes.flatten():
    panelLegend = getattr(ax, "legend_", None)
    if panelLegend:
        panelLegend.remove()

# ... and add a single legend to the figure, built from the first panel's entries.
handles, labels = g.axes[0].get_legend_handles_labels()
g.figure.legend(handles, labels, ncol=1, handletextpad=0, bbox_to_anchor=(1, 1), loc='upper left')







In [None]:
# Leave out the 'esc' genotype and the '01k01f' episode.
keepGenotype = dfPlot['genotype'] != 'esc'
keepEpisode = dfPlot['episode'] != '01k01f'
dfPlot_filtered2 = dfPlot[keepGenotype & keepEpisode]

# One small panel per lineSet, three panels per row, axes shared between panels.
g = sns.FacetGrid(
    dfPlot_filtered2,
    col='lineSet',
    col_wrap=3,
    sharex=True,
    sharey=True,
    height=2,
    aspect=1,
)

# Mean si over time, coloured by genotype.
g.map_dataframe(sns.scatterplot, x='inDishTime', y='mean', hue='genotype')

g.set(xlim=(0, 2.5*60), ylim=(0, .5))
g.set_axis_labels('Time (Minutes)', 'Attraction')
g.set_titles('{col_name}')
g.figure.subplots_adjust(top=0.85)
g.figure.suptitle('Mean attraction, all animals', fontsize=14)

# Remove the per-panel legends ...
for ax in g.axes.flatten():
    panelLegend = getattr(ax, "legend_", None)
    if panelLegend:
        panelLegend.remove()

# ... and add a single legend to the figure, built from the first panel's entries.
handles, labels = g.axes[0].get_legend_handles_labels()
g.figure.legend(handles, labels, ncol=1, handletextpad=0, bbox_to_anchor=(1, 1), loc='upper left')


In [None]:
# si per lineSet for episode '02k20f', one unconnected point series per setup.
epiSel=dfEpiAn[dfEpiAn.episode=='02k20f']
sns.pointplot(data=epiSel,x='lineSet',y='si',hue='setup',linestyle='none')
plt.ylim([0,.4])
plt.ylabel('Shoaling Index');
plt.xticks(rotation=90);


In [None]:
fig, axes = plt.subplots(figsize=(8,3))

# si per lineSet for episode '02k20f'; points are split by genotype, bars show the standard deviation.
epiSel=dfEpiAn[dfEpiAn.episode=='02k20f']
pointOpts=dict(x='lineSet', y='si', hue='genotype', linestyle='none', errorbar='sd', dodge=.5)
sns.pointplot(data=epiSel, **pointOpts)
sns.despine()

axes.set_ylabel('Shoaling Index')
axes.axhline(0,ls=':',color='k')  # dotted zero line
# NOTE(review): the title and the legend title mention frequency tuning / age,
# but the plot shows si per lineSet coloured by genotype - confirm the labels.
axes.set_title('Frequency tuning per age group');
plt.legend(title='age',ncol=2,handletextpad=0,bbox_to_anchor=(1, 1.05))
plt.xticks(rotation=90);

In [None]:
fig, axes = plt.subplots(figsize=(8, 3))

# Per-animal means of episode '02k20f'.
ix=(dfEpiAn.episode=='02k20f')#&(~dfEpiAn.lineSet.str.contains('_2_'))
selDat=dfEpiAn[ix]

# lineSet labels in descending order (only used if `order=allCat` is passed to the plots).
allCat=np.sort(selDat.lineSet.unique())[::-1]

# Settings shared by the individual-animal layer and the summary layer.
hueLevels=["hi", "lo",'wt','esc','mid']
sharedOpts=dict(data=selDat, x='lineSet', y='si', hue='genotype', dodge=.5, hue_order=hueLevels)

# Individual animals, drawn behind the summary points.
sns.swarmplot(zorder=-1, size=5, alpha=.5, **sharedOpts)  #order=allCat

# Group summary: unconnected points with standard-deviation bars.
sns.pointplot(linestyle='none', errorbar='sd', **sharedOpts)


sns.despine()

axes.set_ylabel('Attraction')
axes.set_xlabel('Cohort')

axes.axhline(0,ls=':',color='k')  # dotted zero line
#axes.set_title('Selection F1');

plt.xticks(rotation=90);

# Only the first five legend entries are shown, so each hue level is listed once.
handles, labels = axes.get_legend_handles_labels()
legendOpts=dict(title='Parents', ncol=1, handletextpad=0, bbox_to_anchor=(1, 1.05), frameon=False)
l = plt.legend(handles[0:5], labels[0:5], **legendOpts)

#figPath=base+'SelectionAllToF1.png'
#plt.savefig(figPath,bbox_inches='tight')

plt.title('Selection per experiment');


# Generate Grid overview which animals to select

Select the experimental condition to analyze and plot by setting `ShowGroup` below to one of the `lineSet` values.
A condition will typically contain data from two setups.

In [None]:
# List the available lineSet labels; pick one of them as ShowGroup in the next cell.
dfEpiAn.lineSet.unique()

In [None]:
# lineSet label of the cohort to inspect (one of the values listed above)
ShowGroup='Sel1_10_18-06-2025'
nSel=10 #how many fish to select, typically 10

In [None]:
# Per-animal means of episode '02k20f' for the chosen cohort.
ix=(dfEpiAn.episode=='02k20f')&(dfEpiAn.lineSet==ShowGroup)

selDat=dfEpiAn[ix].copy()
selDat['plotGroup']=0

# Rank si within each lineSet/genotype group, once ascending and once descending.
# NOTE(review): rank() gives tied values their average rank by default, so a tie
# at the cut-off can yield more or fewer than nSel picks - confirm this is acceptable.
siByGroup=selDat.groupby(['lineSet','genotype'])['si']
rankAsc=siByGroup.rank()
rankDesc=siByGroup.rank(ascending=False)
selDat['rank']=rankAsc
selDat['rankInverse']=rankDesc

# Pick the nSel lowest 'lo' animals and the nSel highest 'hi' animals.
pickLo=(selDat['genotype']=='lo')&(selDat['rank']<=nSel)
pickHi=(selDat['genotype']=='hi')&(selDat['rankInverse']<=nSel)
selDat['pick']=pickLo | pickHi


# All animals in gray, the picked ones drawn on top in colour.
sns.swarmplot(selDat,x='genotype',y='si',color='gray');
sns.swarmplot(selDat[selDat['pick']],x='genotype',y='si',hue='genotype');
plt.title(ShowGroup + ' selection of ' + str(nSel) + ' fish per group');
plt.xlabel('Group');
plt.ylabel('Attraction');



In [None]:
# si against anSize for the chosen cohort: one panel per genotype, one colour per setup.
ix=(dfEpiAn.episode=='02k20f')&(dfEpiAn.lineSet==ShowGroup)
perAnimal=dfEpiAn[ix].groupby(['setup','animalID','genotype']).mean(numeric_only=True).reset_index()
g=sns.FacetGrid(perAnimal,col='genotype',hue='setup')
#g.set(xlim=(0,.5))

g=g.map(plt.scatter,'si','anSize',s=50,alpha=0.7)
plt.legend()

In [None]:
# 'lo' animals of the chosen cohort, sorted from lowest to highest mean si.
ixlo=(dfEpiAn.episode=='02k20f')&(dfEpiAn.genotype=='lo')&(dfEpiAn.lineSet==ShowGroup)
loMeans=dfEpiAn[ixlo].groupby(['setup','animalID','genotype']).si.mean().reset_index()
loMeans['rank']=loMeans.si.rank()
# after reset_index the row label is the position in the ranking (0 = lowest si)
dataLo=loMeans.sort_values(by='rank').reset_index()
dataLo

In [None]:

# 'hi' animals of the chosen cohort, sorted from highest to lowest mean si.
ixhi=(dfEpiAn.episode=='02k20f')&(dfEpiAn.genotype=='hi')&(dfEpiAn.lineSet==ShowGroup)
hiMeans=dfEpiAn[ixhi].groupby(['setup','animalID','genotype']).si.mean(numeric_only=True).reset_index()
hiMeans['rank']=hiMeans.si.rank()
# after reset_index the row label is the position in the ranking (0 = highest si)
dataHi=hiMeans.sort_values(by='rank',ascending =False).reset_index()
dataHi

In [None]:
# Two 7 x 5 grids (upper panel: setup 1, lower panel: setup 2) with the 35
# positions numbered row by row from the top left; the picked animals are marked.
fig, axes = plt.subplots(2,1,figsize=(3.5, 5))


major_ticks = np.arange(0, 7, 7)
minor_ticks = np.arange(0, 7, 1)


def _cell(pos):
    """Return (column, row counted from the bottom) of grid position *pos*."""
    return pos%7, 4-np.floor_divide(pos,7)


for ax in axes:
    # number every grid cell
    for pos in range(35):
        col,rowFromBottom=_cell(pos)
        ax.text(col+.5,rowFromBottom+.5,str(pos))

    ax.set_xticks(major_ticks)
    ax.set_xticks(minor_ticks, minor=True)
    ax.set_yticks(major_ticks)
    ax.set_yticks(minor_ticks, minor=True)

    ax.set_xlim([0,7])
    ax.set_ylim([0,5])
    ax.grid(which='both')
    ax.tick_params(labelbottom=False,labelleft=False)


def _markPicks(ranked,tag,colour):
    """Mark the first nSel rows of *ranked* in the panel of their setup, labelled '<tag>:<row label>'."""
    for place,row in ranked[:nSel].iterrows():
        panel=axes[int(row.setup==2)]  # setup 2 goes to the lower panel, anything else to the upper one
        col,rowFromBottom=_cell(row.animalID)
        panel.plot(col+.5,rowFromBottom+.5,colour+'.')
        panel.text(col+.2,rowFromBottom+.1,tag+':'+str(place),color=colour,fontsize=8)


_markPicks(dataHi,'hi','r')
_markPicks(dataLo,'lo','b')


# both titles carry the first date found among the cohort's 'hi' animals
expDate=dfEpiAn[ixhi].date.unique()[0]
axes[0].set_title('setup1 '+ShowGroup + ' ' +expDate);
axes[1].set_title('setup2 '+ShowGroup + ' ' +expDate);

In [None]:
# si against boutDur for episode '02k20f', one panel per genotype.
ix=(dfEpiAn.episode=='02k20f')
perAnimal=dfEpiAn[ix].groupby(['setup','animalID','genotype','lineSet']).mean(numeric_only=True).reset_index()
g=sns.FacetGrid(perAnimal,col='genotype')
g=g.map(plt.scatter,'si','boutDur',s=10,alpha=0.5)
for panel in g.axes.flat:
    panel.set_ylim([0.2,1.5])
    panel.axhline(20/30)  # reference line at 20/30 (presumably 20 frames at 30 frames/s - confirm)
#figPath=base+'SelectionSizeVsShoal.png'
#plt.savefig(figPath,bbox_inches='tight')


In [None]:
# si against boutDur for episode '01k01f', one panel per genotype.
ix=(dfEpiAn.episode=='01k01f')
perAnimal=dfEpiAn[ix].groupby(['setup','animalID','genotype','lineSet']).mean(numeric_only=True).reset_index()
g=sns.FacetGrid(perAnimal,col='genotype')
g=g.map(plt.scatter,'si','boutDur',s=10,alpha=0.5)
for panel in g.axes.flat:
    panel.set_ylim([0.2,1.5])
    panel.axhline(20/30)  # reference line at 20/30 (presumably 20 frames at 30 frames/s - confirm)