# Summary analysis for selection history


- For this to work, you need to adapt the folder paths below so that they point to where the raw data and the metadata live.
- You also need to clone the jlsocialbehavior repository and create the `jlsocial` environment from its yml file.

In [None]:
import os

# Define folders (Windows-style paths; adapt them to the local machine).

# base = 'Y:\\Johannes\\b\\2019\\' This is the original base folder where the meta data was stored
metaFolder = 'Y:\\Johannes\\b\\'  # Updated base folder. Metadata should live here.
codeDir = 'D:\\Documents\\jlsocialbehavior' #adapt this to your code folder
metaFile='MetaData_JL_2019_consolidated.xlsx'  # Excel workbook; the sheets 'AllExp' and 'AllAn' are read from it below
ProcessingDir = 'Y:\\Johannes\\analysis_output\\b\\selection\\'  # processingSettings.csv is written here and the siSummary csv files are searched here
outputDir = ProcessingDir # 'Y:\\03 Common Use\\temp\\temp_output\\'

print('searching for meta info here: ' + metaFolder)

# Make the code folder the working directory.
# NOTE(review): presumably required so that the `functions.*` / `models.*` imports resolve - confirm.
os.chdir(codeDir)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.stats.api as sms

%config InteractiveShellApp.pylab_import_all = False
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import sys
import fnmatch

import math
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
import glob
#import h5py
from datetime import datetime
import PythonMeta as PMA
from matplotlib.ticker import AutoLocator

import functions.matrixUtilities_joh as mu
import functions.notebookHelper as nh
import functions.metaTree as mt

import models.experiment as xp
import models.experiment_set as es
import functions.paperFigureProps as pfp
pfp.paper()

In [None]:
# Load the experiment table (sheet 'AllExp') and keep only rows whose stimulusProtocol is 'selection'.
info=pd.read_excel(metaFolder+metaFile, sheet_name='AllExp')
ix=(info.stimulusProtocol=='selection')#&(info.date.isin(['2019-08-05','2019-08-06','2019-09-25','2019-10-04']))
# Renumber the rows 0..n-1: a later cell matches the positional experiment number (animalSet) against info.index.
info=info[ix].reset_index(drop=True)
info

In [None]:
# Load the per-animal table (sheet 'AllAn'); 'bd' holds the birth dates used further below.
infoAn=pd.read_excel(metaFolder+metaFile, sheet_name='AllAn',parse_dates=['bd'])
# NOTE(review): 'bd' is already requested via parse_dates above; this second conversion with an
# explicit format presumably handles values stored as YYYYMMDD - confirm it is still needed.
infoAn['bd'] = pd.to_datetime(infoAn['bd'], format='%Y%m%d')
#infoAn['expDate'] = pd.to_datetime(infoAn['expDate'], format='%Y%m%d')
infoAn.tail()

In [None]:
# List the distinct genotype labels present in the animal table.
infoAn.genotype.unique()

In [None]:
# collect meta information and save to new csv file for batch processing
#
# For every experiment (row of `info`) this looks up the trajectory file and the pair list
# file inside the experiment folder and gathers the animal numbers and their birth dates.
# Experiments whose folder does not exist are skipped with a warning AND removed from
# `info`, so that the per-experiment lists below stay aligned with the rows of `info`
# (previously a skipped folder made the column assignments fail with a length mismatch).

aviPath=[]
posPath=[]
PLPath=[]
expTime = []
birthDayAll=[]
anIDsAll=[]
camHeightAll=[]
keptRows=[]  # index labels of the experiments whose folder was found

camHeight=[105,180] # these are outdated values, but we keep them for compatibility with old data

for index,row in info.iterrows():

    startDir=row.path+row.folder+'\\'
    print('processing: ' + startDir)
    if not os.path.exists(startDir):
        print('WARNING: path does not exist: ' + startDir)
        continue

    keptRows.append(index)

    posPath.append(glob.glob(startDir+'PositionTxt*')[0]) #this is the trajectory file
    PLPath.append(glob.glob(startDir+'PL*')[0]) # this is the pair list file

    head, tail = os.path.split(posPath[-1])
    currTime='dummy' #datetime.strptime(tail[-23:-4], '%Y-%m-%dT%H_%M_%S')
    expTime.append(currTime)

    # index 1 (180) when the folder path contains '_dn_', index 0 (105) otherwise
    camHeightAll.append(camHeight[('_dn_' in head)*1]) ######### this needs to be adapted for new data, since the camHeight is not stored in the meta data anymore.

    # anNr is either a range 'a:b' (inclusive) or a whitespace separated list of numbers.
    # str() guards against Excel delivering a single number as an int instead of text.
    anNrs=str(row.anNr) #Note that anNrs are 1 based!
    if ':' in anNrs:
        a,b=anNrs.split(sep=':')
        anNrs=np.arange(int(a),int(b)+1)
    else:
        anNrs=np.array(anNrs.split()).astype(int)

    anIDs=anNrs #-1 no more 0-based since using pandas merge to find animal numbers
    anIDsAll.extend(anIDs)

    bd=infoAn[infoAn.anNr.isin(anIDs)].bd.values.astype(str) # get birth dates of animals in this experiment
    #bd=infoAn.bd.values[anIDs-1] #a bit dirty to use anIDs directly here. Should merge
    birthDayAll.append(' '.join(list(bd)))

# Keep only the experiments that were actually processed; renumber so that the row number
# keeps matching the position in posPath / PLPath / camHeightAll / birthDayAll.
info=info.loc[keptRows].reset_index(drop=True)

info['camHeight']=camHeightAll
info['txtPath']=posPath
info['pairList']=PLPath
info['aviPath']='default'
info['birthDayAll']=birthDayAll
info['epiDur'] = 5      # duration of individual episodes (default: 5 minutes)
info['episodes'] = 24   # number of episodes to process: -1 to load all episodes (default: -1)
info['inDish'] = 10#np.arange(len(posPath))*120     # time in dish before experiments started (default: 10)
info['arenaDiameter_mm'] = 70 # arena diameter (default: 100 mm)
info['minShift'] = 60 # minimum number of seconds to shift for control IAD
info['episodePLcode'] = 0 # flag if first two characters of episode name encode animal pair matrix (default: 0)
info['recomputeAnimalSize'] = 0 # flag to compute animals size from avi file (takes time, default: 1)
info['SaveNeighborhoodMaps'] = 0 # flag to save neighborhood maps for subsequent analysis (takes time, default: 1)
info['computeLeadership'] = 0 # flag to compute leadership index (takes time, default: 1)
info['ComputeBouts'] = 1 # flag to compute swim bout frequency (takes time, default: 1)
#info['set'] = np.arange(len(posPath))   # experiment set: can label groups of experiments (default: 0)
info['ProcessingDir']=ProcessingDir
info['outputDir']=outputDir
info['expTime']=expTime
# NOTE(review): presumably episodes * epiDur[min] * 60 s * 30 frames/s, plus 11 - confirm the frame rate and the offset.
info['readLim'] = 24*5*60*30+11

In [None]:
# Write the per-experiment settings table to ProcessingDir; the next cell hands this csv to es.experiment_set.
csvFile=os.path.join(ProcessingDir,'processingSettings.csv')
info.to_csv(csvFile,encoding='utf-8')
info.tail()

In [None]:
rereadData=1  # set to 0 to skip (re)processing of the experiments
if rereadData:
    def readExperiment(keepData=True):
        """Build the experiment set from `csvFile` (called with MissingOnly=True).

        Returns the experiment set when keepData is true; otherwise the set
        is discarded and the placeholder value 1 is returned.
        """
        loaded=es.experiment_set(csvFile=csvFile,MissingOnly=True)
        return loaded if keepData else 1

    # keepData=False: only the processing itself is wanted here, not the loaded object
    expSet=readExperiment(keepData=False)

In [None]:
# Locate the siSummary csv of every experiment: the trajectory file name without its
# 4-character extension is used as prefix inside ProcessingDir.
csvPath = []
for f in [mu.splitall(x)[-1][:-4] for x in info.txtPath]:
    csvPath.append(glob.glob(ProcessingDir+f+'*siSummary*.csv')[0])

# Read all summaries, tag each with its experiment number (animalSet) and translate the
# per-experiment animal index into the animal number collected in anIDsAll.
allAnimalIDs=np.array(anIDsAll)  # converted once instead of once per file
frames=[]
for i,fn in enumerate(csvPath):
    print(fn)
    tmp=pd.read_csv(fn,index_col=0,sep=',')
    # Item assignment: attribute assignment (tmp.animalSet=i) would NOT create the column
    # if the csv happens to lack it, and the later merge on 'animalSet' would then fail.
    tmp['animalSet']=i
    # NOTE(review): the offset assumes exactly 35 animals in every experiment; anIDsAll is a
    # flat list, so experiments with a different animal count would shift all later lookups.
    tmp['animalIndex']=allAnimalIDs[(tmp['animalIndex']+i*35).values]
    frames.append(tmp)

# Concatenate once at the end rather than growing the frame inside the loop.
df=pd.concat(frames) if frames else pd.DataFrame()

df.tail(10)



In [None]:

# Normalise the episode names: strip surrounding whitespace and remove underscores.
df['episode']=[x.strip().replace('_','') for x in df['episode']]
# Attach per-animal metadata, matched on the animal number.
df=pd.merge(df,infoAn[['anNr','line','genotype','bg','cohort','generation','repeat']],left_on='animalIndex',right_on='anNr',how='left')
# Attach per-experiment metadata, matched on the row number of `info` (animalSet).
df=pd.merge(df,info[['date','selected','set','setup','rnaSet']],left_on='animalSet',right_on=info.index,how='left')
# NOTE(review): 'setup' was already part of the merge above; this overwrites it by positional lookup into info.
df['setup'] = info['setup'].values[df['animalSet'].values]
print('df shape',df.shape)
# lineSet is '<line>_<date>'; the concatenation requires both columns to hold strings.
df['lineSet']=[x+'_'+y for x,y in zip(df.line, df.date)]


df.tail(10)

In [None]:
# Three-colour viridis palette, both as a list and as the seaborn default.
co=sns.color_palette("viridis", 3)
sns.set_palette('viridis',3)

# Keep only rows with 80 < inDishTime < 240 (both bounds exclusive).
inDishTime=df['inDishTime']
idx=(inDishTime<240) & (inDishTime>80)
dfDR=df[idx]

# Average all numeric columns per episode and animal; the remaining keys are carried along
# as grouping columns so that they survive the aggregation.
groupKeys=[
    'episode','animalIndex','line','setup','genotype','date','lineSet',
    'bg','cohort','generation','repeat','set','rnaSet',
]
dfEpiAn=dfDR.groupby(groupKeys,sort=True).mean(numeric_only=True).reset_index()

In [None]:
# Preview the per-episode, per-animal averages.
dfEpiAn.head()

In [None]:

def sem(x):
    """Return the standard error of the mean of x, ignoring NaN entries.

    x may be any 1-D numeric sequence (list, ndarray, pandas Series).
    NaN values are dropped before both the standard deviation (ddof=1) and
    the sample count are computed, so the two always refer to the same
    samples. Previously a Series with NaN used a NaN-skipping standard
    deviation but divided by the full length, and an ndarray with NaN
    produced NaN.
    """
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    return np.std(values, ddof=1) / np.sqrt(len(values))

# Random uniform sample used for the sanity-check printout below.
x=np.random.random(100)

def ci95(x):
    """Return the distance from the mean of x to the lower confidence bound.

    The bound comes from statsmodels' DescrStatsW.tconfint_mean() with its
    default settings, evaluated on the finite entries of x only; the mean
    is the NaN-ignoring mean of x.
    """
    finite_values = x[np.isfinite(x)]
    lower_bound = sms.DescrStatsW(finite_values).tconfint_mean()[0]
    return np.nanmean(x) - lower_bound

# Sanity check of the helpers on the uniform sample.
print('std of uniform = 0.2886751345948129. STDdata:',np.std(x), 'semData:',sem(x),'samples:',x.shape)
print('ci95:',ci95(x))

In [None]:
# Summary statistics of si per (inDishTime, episode, genotype, lineSet) group: mean, std,
# and the notebook's own sem / ci95 helpers.
dfPlot=(df.groupby(['inDishTime','episode','genotype','lineSet']).si.agg(['mean','std',sem,ci95])
    # unstack() moves the last index level (lineSet) into the columns and stack() moves it back;
    # NOTE(review): presumably done so that dropna=True removes the all-NaN rows - confirm.
    .unstack()
    .stack(dropna=True)
    .reset_index())

dfPlot.head()

In [None]:
# Mean and SD of the shoaling index per lineSet for episode '02k20f', one marker per genotype.
episodeData = dfEpiAn[dfEpiAn.episode=='02k20f']

# x-axis order: lineSets sorted by the first date value found in each of them
firstDate = episodeData.groupby('lineSet')['date'].first()
order = firstDate.sort_values().index.tolist()

fig, axes = plt.subplots(figsize=(15,3))

pointArgs = dict(
    x='lineSet',
    y='si',
    hue='genotype',
    order=order,
    dodge=.5,
    linestyle='none',
    errorbar='sd',
)
sns.pointplot(data=episodeData, **pointArgs)
sns.despine()

axes.axhline(0,ls=':',color='k')
axes.set_ylabel('Shoaling Index')
axes.set_title('Shoaling selection history');
plt.legend(title='group',ncol=2,handletextpad=0,bbox_to_anchor=(1, 1.05))
plt.xticks(rotation=90);

In [None]:
# Individual animals (swarm) plus mean and SD (points) of si per lineSet for episode '02k20f',
# with the lineSets ordered by the first date value found in each of them.
fig, axes = plt.subplots(figsize=(25, 3))
ix=(dfEpiAn.episode=='02k20f')#&(~dfEpiAn.lineSet.str.contains('_2_'))

selDat=dfEpiAn[ix]

order = selDat.groupby('lineSet')['date'].first().sort_values().index.tolist()

# reverse-alphabetical list of the lineSets (only referenced by the commented-out order below)
allCat=selDat.lineSet.unique()
allCat.sort()
allCat=allCat[::-1]

# arguments shared by both layers
hueOrder=["hi", "lo",'wt','esc','mid','cross']
sharedArgs=dict(
    data=selDat,
    x='lineSet',
    y='si',
    hue='genotype',
    order=order,
    #order=allCat,
    dodge=.5,
    hue_order=hueOrder,
)

sns.swarmplot(zorder=-1, size=3, alpha=.5, **sharedArgs)
sns.pointplot(linestyle='none', errorbar='sd', **sharedArgs)

sns.despine()

axes.set_xlabel('Cohort')
axes.set_ylabel('Attraction')

axes.axhline(0,ls=':',color='k')
#axes.set_title('Selection F1');

plt.xticks(rotation=90);

# only the first five legend entries are shown
handles, labels = axes.get_legend_handles_labels()

l = plt.legend(
    handles[0:5], labels[0:5],
    title='Parents', ncol=1, handletextpad=0,
    bbox_to_anchor=(1, 1.05),
    frameon=False,
)

#figPath=base+'SelectionAllToF1.png'
#plt.savefig(figPath,bbox_inches='tight')

plt.title('Shoaling selection history');


In [None]:

# Same figure as the date-ordered one, but with the lineSets ordered by
# background, cohort, generation and repeat.
fig, axes = plt.subplots(figsize=(25, 3))
ix=(dfEpiAn.episode=='02k20f')
selDat=dfEpiAn[ix]

sortedDat = selDat.sort_values(['bg', 'cohort', 'generation', 'repeat'])
order = sortedDat['lineSet'].unique().tolist()

# reverse-alphabetical list of the lineSets (only referenced by the commented-out order below)
allCat=selDat.lineSet.unique()
allCat.sort()
allCat=allCat[::-1]

genotypeOrder=["hi", "lo",'wt','esc','mid','cross']

# layer 1: one dot per animal, drawn behind the summary markers
sns.swarmplot(
    data=selDat, x='lineSet', y='si',
    hue='genotype', hue_order=genotypeOrder,
    order=order,
    #order=allCat,
    zorder=-1, dodge=.5, size=3, alpha=.5,
)

# layer 2: mean with SD error bars, no connecting line
sns.pointplot(
    data=selDat, x='lineSet', y='si',
    hue='genotype', hue_order=genotypeOrder,
    order=order,
    dodge=.5, linestyle='none', errorbar='sd',
)

sns.despine()

axes.set_ylabel('Attraction')
axes.set_xlabel('Cohort')
axes.axhline(0,ls=':',color='k')
#axes.set_title('Selection F1');

plt.xticks(rotation=90);

# only the first five legend entries are shown
handles, labels = axes.get_legend_handles_labels()
legendArgs = dict(title='Parents', ncol=1, handletextpad=0, bbox_to_anchor=(1, 1.05), frameon=False)
l = plt.legend(handles[0:5], labels[0:5], **legendArgs)

#figPath=base+'SelectionAllToF1.png'
#plt.savefig(figPath,bbox_inches='tight')

plt.title('Shoaling selection history');


In [None]:
def plot_selection_history(dfEpiAn, episode=None, bg=None, cohort=None, selected=None, splitSetup=False, figsize=(5, 3), forceLabel=None):
    """Plot si per lineSet as a swarm of animals overlaid with mean and SD markers.

    Parameters
    ----------
    dfEpiAn : DataFrame
        Table with at least the columns lineSet, si, genotype, bg, cohort,
        generation and repeat, plus every column that is filtered on.
    episode, bg, cohort, selected : optional
        When not None, only rows whose column of the same name equals the
        given value are plotted.
    splitSetup : bool
        If true, '_<setup>' is appended to lineSet so that every setup gets
        its own x position.
    figsize : tuple
        Figure size handed to plt.subplots.
    forceLabel : str, optional
        Text appended to the title. Without it the title suffix is
        '<bg><cohort>' when both are given and empty otherwise.

    Returns
    -------
    (fig, axes) of the new figure.
    """
    fig, axes = plt.subplots(figsize=figsize)

    # start with all rows and narrow down by every filter that was supplied
    ix = pd.Series([True] * len(dfEpiAn), index=dfEpiAn.index)
    requested = (('episode', episode), ('bg', bg), ('cohort', cohort), ('selected', selected))
    for column, wanted in requested:
        if wanted is not None:
            ix &= (getattr(dfEpiAn, column) == wanted)

    if forceLabel is not None:
        lineTxt = forceLabel
    elif bg is not None and cohort is not None:
        lineTxt = f"{bg}{cohort}"
    else:
        lineTxt = ""

    selDat = dfEpiAn[ix]

    if splitSetup:
        # work on a copy so the caller's frame keeps its original lineSet
        selDat = selDat.copy()
        selDat['lineSet'] = selDat['lineSet'].astype(str) + '_' + selDat['setup'].astype(str)

    ordered = selDat.sort_values(['bg', 'cohort', 'generation', 'repeat'])
    order = ordered['lineSet'].unique().tolist()

    # arguments common to both layers
    common = dict(
        data=selDat,
        x='lineSet',
        y='si',
        hue='genotype',
        order=order,
        dodge=.5,
        hue_order=["hi", "lo", 'wt', 'esc', 'mid', 'cross'],
    )
    sns.swarmplot(zorder=-1, size=3, alpha=.5, **common)
    sns.pointplot(linestyle='none', errorbar='sd', **common)

    sns.despine()
    axes.set_xlabel('Cohort')
    axes.set_ylabel('Attraction')
    axes.axhline(0, ls=':', color='k')
    plt.xticks(rotation=90)

    # NOTE(review): only the first five legend entries are kept although hue_order lists six
    # genotypes - confirm that leaving out the last one is intended.
    handles, labels = axes.get_legend_handles_labels()
    legendStyle = dict(title='Parents', ncol=1, handletextpad=0, bbox_to_anchor=(1, 1.05), frameon=False)
    plt.legend(handles[:5], labels[:5], **legendStyle)
    plt.title('Shoaling selection history ' + lineTxt)
    return fig, axes


In [None]:
# Selection history for background 'TL', cohort '1', rows with selected == 1.
# NOTE(review): cohort is passed as a string; the filter matches nothing if the column holds numbers - confirm.
plot_selection_history(dfEpiAn, episode='02k20f', bg='TL', cohort='1', selected=1);

In [None]:
# Selection history for background 'TL', cohort '3', rows with selected == 1.
# NOTE(review): cohort is passed as a string; the filter matches nothing if the column holds numbers - confirm.
plot_selection_history(dfEpiAn, episode='02k20f', bg='TL', cohort='3', selected=1);

In [None]:
# Restrict to the animals whose rnaSet is 'a' or 'b' and plot them per lineSet and setup
# (splitSetup=True), using rows with selected == 0 and the title suffix 'RNAseq'.
subset=dfEpiAn[dfEpiAn.rnaSet.isin(['a','b'])]
plot_selection_history(subset, 
                       episode='02k20f', 
                       bg='TL', 
                       selected=0, 
                       splitSetup=True,
                       forceLabel='RNAseq');