In [None]:
# Library imports
import os, pandas as pd, numpy as np, matplotlib.pyplot as plt, datetime, \
    regex as re, joblib as jl, statsmodels.stats.multitest
from tqdm.notebook import tqdm

In [None]:
# Work from the data directory. The guard keeps this cell safe to re-run:
# a second bare chdir('data/') would look for data/data/ and fail.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('data/')
os.getcwd()

## Load data

In [None]:
def getData(fname='tracking-predictions-aggregated-startendonly-filtered-50_10_20_0_20.csv'):
    """Load the 2022 tracking rows used by the landing analyses.

    Parameters
    ----------
    fname : str, optional
        CSV file to read (relative to the current working directory).
        Defaults to the 'startendonly' aggregated tracking file.

    Returns
    -------
    pandas.DataFrame
        Rows with ``experiment_year == 2022``, excluding the dates
        '04-07-2022' and '04-14-2022', restricted to the columns needed
        downstream, plus an integer ``position_id`` column parsed from the
        first run of digits in ``participant_position``.
    """
    keepColumns = ['experiment_year', 'experiment_date', 'participant_position',
        'landing_id', 'frame_rel', 'frame', 'x', 'y']
    excludedDates = ['04-07-2022', '04-14-2022']

    raw = pd.read_csv(fname)
    keepRows = (raw.experiment_year == 2022) & ~raw.experiment_date.isin(excludedDates)

    # Explicit copy: the result is an independent frame, so adding a column
    # below cannot trigger SettingWithCopyWarning or touch `raw`.
    data = raw.loc[keepRows, keepColumns].copy()

    # Parse position index integer from participant_position (vectorised)
    data['position_id'] = data.participant_position.str.extract(
        r'([0-9]+)', expand=False).astype(int)

    return data

# Notebook-level frame of filtered 2022 rows, as returned by getData()
data = getData()

In [None]:
def _readPersonAssignments(path='human_positions.xlsx'):
    """Parse the position/person sheet into ``{date_str: {position: person}}``.

    The sheet is read without a header row. A row whose first cell is a
    datetime starts a new date section; every following row whose first cell
    is numeric is taken as a ``(position, person)`` pair for that date. All
    other rows are ignored. Dates are keyed as ``'%m-%d-%Y'`` strings.
    """
    personAssignments = {}
    currentDate = None
    for row in pd.read_excel(path, header=None).values:
        if isinstance(row[0], datetime.datetime):
            currentDate = row[0]
        elif str(row[0]).isnumeric():
            if currentDate is None:
                raise ValueError(
                    'Position row found before any date row in {}'.format(path))
            position, human = row
            personAssignments.setdefault(
                currentDate.strftime('%m-%d-%Y'), {})[position] = human
    return personAssignments


def computeLandingStatistics(data, shufflePositions=False, locationWeights=None, mode=''):
    """Aggregate tracking rows into per-landing and per-position statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Tracking rows with at least the columns ``experiment_year``,
        ``experiment_date``, ``participant_position``, ``landing_id``,
        ``frame`` and ``frame_rel``. Every other column is aggregated to its
        per-landing min/max as well.
    shufflePositions : bool, optional
        If True (and ``locationWeights`` is given), the ``position_id`` of
        every landing is redrawn at random, separately for each date.
    locationWeights : pandas.Series, optional
        Index = position ids to draw from, values = their probabilities
        (passed as ``p`` to ``np.random.choice``).
    mode : str
        'LANDINGS' weights every landing as 1; 'OCCUPANCY' weights every
        landing by its duration (``frame_max - frame_min``).

    Returns
    -------
    tuple
        ``(dataAgg, dataLandings, dataLandingsSortedParticipants,
        dataLandingsSortedPositions, participantRanking, positionRanking)``.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'OCCUPANCY' nor 'LANDINGS'.
    """
    # Fail fast, before any aggregation work is done
    if mode not in ('OCCUPANCY', 'LANDINGS'):
        raise ValueError('Invalid Analysis Mode')

    dayKeys = ['experiment_year', 'experiment_date']

    # Compute landing durations (one row per landing, min/max of each column)
    dataAgg = data.groupby(dayKeys + ['participant_position', 'landing_id']).agg(['min', 'max'])
    dataAgg.columns = ['_'.join(col).strip() for col in dataAgg.columns.values]
    dataAgg['duration'] = dataAgg.frame_max - dataAgg.frame_min
    dataAgg['duration_rel'] = dataAgg.frame_rel_max - dataAgg.frame_rel_min
    dataAgg = dataAgg.reset_index()

    # Score column: landing duration (OCCUPANCY) or a fixed 1 per landing (LANDINGS)
    dataAgg['weight'] = dataAgg.duration if mode == 'OCCUPANCY' else 1.0

    # Compute total weight by day. The column is selected *before* reducing so
    # that string columns never reach the reduction.
    dataAgg['total_landings_by_day'] = dataAgg.groupby(
        dayKeys)['weight'].transform('sum').astype(float)

    # Parse position index integer from participant_position
    dataAgg['position_id'] = dataAgg.participant_position.str.extract(
        r'([0-9]+)', expand=False).astype(int)

    # Shuffle?
    if shufflePositions and locationWeights is not None:
        for day in dataAgg.experiment_date.unique():
            # Randomly divide landings from this day according to overall location biases
            onDay = dataAgg.experiment_date == day
            dataAgg.loc[onDay, 'position_id'] = np.random.choice(
                locationWeights.index.values, p=locationWeights.values,
                replace=True, size=onDay.sum())

    # Compute total weight by position for each day
    dataAgg['position_landings_by_day'] = dataAgg.groupby(
        dayKeys + ['position_id'])['weight'].transform('sum').astype(float)

    # Compute percent landings by position for each day
    dataAgg['percent_landings'] = \
        100 * dataAgg.position_landings_by_day / dataAgg.total_landings_by_day

    # Convert position index and day to person index
    personAssignments = _readPersonAssignments('human_positions.xlsx')
    dataAgg['participant_id'] = [personAssignments[_d][_p] for _p, _d in zip(
        dataAgg.position_id, dataAgg.experiment_date)]

    # Get landing statistics only (no trajectory data)
    # NOTE(review): this keeps the rows whose landing_id is 0 - presumably one
    # per position and day. After shuffling position_id those rows can repeat
    # or omit positions; confirm this is intended.
    dataLandings = dataAgg.loc[dataAgg.landing_id == 0, [
        'experiment_date', 'percent_landings', 'participant_id',
        'position_id', 'total_landings_by_day']].reset_index(drop=True)

    # Mean percent landings by person and by position. Reducing only the
    # numeric column avoids the TypeError pandas >= 2.0 raises when a
    # whole-frame mean meets the string column experiment_date.
    dataLandings['percent_landings_mean_participant'] = dataLandings.groupby(
        'participant_id')['percent_landings'].transform('mean')
    dataLandings['percent_landings_mean_position'] = dataLandings.groupby(
        'position_id')['percent_landings'].transform('mean')

    # Sort by participant / by position (highest mean first)
    dataLandingsSortedParticipants = dataLandings.sort_values(
        'percent_landings_mean_participant', ascending=False)
    dataLandingsSortedPositions = dataLandings.sort_values(
        'percent_landings_mean_position', ascending=False)

    # Rankings (1 = highest mean percent landings)
    participantRanking = dataLandings.groupby(
        'participant_id')['percent_landings_mean_participant'].first().rank(ascending=False)
    positionRanking = dataLandings.groupby(
        'position_id')['percent_landings_mean_position'].first().rank(ascending=False)

    # Return results
    return dataAgg, dataLandings, dataLandingsSortedParticipants, \
        dataLandingsSortedPositions, participantRanking, positionRanking

## Plot landings

In [None]:
# Specify source data file (not the 'startendonly' file that getData() reads;
# presumably this variant holds the full trajectories - TODO confirm)
fname = 'tracking-predictions-aggregated-filtered-50_10_20_0_20.csv'

# Read 2022 data: keep only the columns used below and drop the two excluded
# dates. The explicit copy makes `data` an independent frame, so adding a
# column afterwards cannot trigger SettingWithCopyWarning.
data = pd.read_csv(fname)
data = data.loc[
    (data.experiment_year == 2022)
    & ~data.experiment_date.isin(['04-07-2022', '04-14-2022']),
    ['experiment_year', 'experiment_date', 'participant_position',
     'landing_id', 'frame_rel', 'frame', 'x', 'y']].copy()
print(data.shape)

# Parse position index integer from participant_position (vectorised)
data['position_id'] = data.participant_position.str.extract(
    r'([0-9]+)', expand=False).astype(int)

In [None]:
# Grid of landing trajectories: one row per position, one column per experiment date
dates = data.experiment_date.unique()
numPos, numDates = 6, dates.size

# squeeze=False keeps `ax` two-dimensional even when only one date is present
fig, ax = plt.subplots(numPos, numDates, figsize=(numDates * 3, numPos * 3),
                       facecolor='white', squeeze=False)

for ipos, pos in enumerate(range(1, numPos + 1)):
    for idate, date in enumerate(dates):
        panel = ax[ipos, idate]
        tracks = data.loc[
            (data.experiment_date == date) & (data.position_id == pos),
            ['landing_id', 'x', 'y']]

        # One line per landing, drawn from every 4th sample
        for _, track in tracks.groupby('landing_id'):
            panel.plot(track.x.values[::4], track.y.values[::4], c='black', linewidth=0.5)

        panel.set_xlim(0, 1080)
        panel.set_ylim(0, 1080)
        panel.set_axis_off()
        panel.set_title('{} - Pos {}'.format(date, pos))

fig.savefig('LandingOverview.pdf')

## Plot preferences by participant and position

In [None]:
# Observed (unshuffled) landing statistics in LANDINGS mode; participants are
# then listed in order of decreasing mean percent landings
data = getData()
(dataAgg,
 dataLandings,
 dataLandingsSortedParticipants,
 dataLandingsSortedPositions,
 participantRanking,
 positionRanking) = computeLandingStatistics(data, mode='LANDINGS', shufflePositions=False)
participantsSorted = dataLandingsSortedParticipants.participant_id.unique()

In [None]:
# Randomization test for 5000 (location-weighted) random trajectories
def getRankings(mode, seed=None, numShuffles=100):
    """Run one batch of location-weighted position shuffles.

    Parameters
    ----------
    mode : str
        Analysis mode forwarded to ``computeLandingStatistics``
        ('LANDINGS' or 'OCCUPANCY').
    seed : int, optional
        If given, seeds NumPy's global RNG before shuffling so the batch is
        reproducible (``computeLandingStatistics`` draws from that RNG via
        ``np.random.choice``). ``None`` keeps the previous unseeded behaviour.
    numShuffles : int, optional
        Number of shuffled data sets to evaluate in this batch.

    Returns
    -------
    list of pandas.Series
        One Series per shuffle: mean ``percent_landings`` per ``participant_id``.
    """
    if seed is not None:
        np.random.seed(seed)

    data = getData()

    # Compute location weights from the unshuffled data, normalised to sum to 1
    observed = computeLandingStatistics(data, shufflePositions=False, mode=mode)[1]
    weightsPositions = observed.groupby('position_id')['percent_landings_mean_position'].sum()
    weightsPositions = weightsPositions / weightsPositions.sum()

    # Compute shuffled rankings. Only the numeric column is averaged: a
    # whole-frame .mean() raises TypeError on the string column
    # experiment_date under pandas >= 2.0.
    participantRankings = []
    for _ in range(numShuffles):
        shuffled = computeLandingStatistics(
            data, shufflePositions=True, locationWeights=weightsPositions, mode=mode)[1]
        participantRankings.append(
            shuffled.groupby('participant_id')['percent_landings'].mean())
    return participantRankings

# Run in parallel: 50 batches x 100 shuffles per measure. Every batch gets its
# own fixed seed, so results are reproducible and batches do not repeat draws.
numBatches = 50
participantRankingsL = jl.Parallel(n_jobs=10)(
    jl.delayed(getRankings)('LANDINGS', seed=i) for i in tqdm(range(numBatches)))
participantRankingsO = jl.Parallel(n_jobs=10)(
    jl.delayed(getRankings)('OCCUPANCY', seed=numBatches + i) for i in tqdm(range(numBatches)))

In [None]:
# Non-shuffled (observed) mean percent landings per participant, one Series per
# measure. Only the numeric column is averaged: a whole-frame .mean() raises
# TypeError on the string column experiment_date under pandas >= 2.0.
participantRankingsTrueL = computeLandingStatistics(
    data, shufflePositions=False, mode='LANDINGS')[1].groupby(
        'participant_id')['percent_landings'].mean()

participantRankingsTrueO = computeLandingStatistics(
    data, shufflePositions=False, mode='OCCUPANCY')[1].groupby(
        'participant_id')['percent_landings'].mean()

In [None]:
# Helper function for getting landing % difference
def getLandingDiff(x, pA, pB):
    """Return ``x.loc[pA] - x.loc[pB]``; a label absent from ``x.index`` counts as 0."""
    valueA = x.loc[pA] if pA in x.index else 0
    valueB = x.loc[pB] if pB in x.index else 0
    return valueA - valueB

In [None]:
def randomizationTest(participantsSorted, participantRankings, participantRankingsTrue, numDts=5):
    """One-sided randomization test on pairwise participant differences.

    For every pair ``(pA, pB)`` with ``pB`` between 1 and ``numDts`` places
    after ``pA`` in ``participantsSorted``, the observed difference
    (``pA`` minus ``pB``) is compared with the same difference in every
    shuffled ranking.

    Parameters
    ----------
    participantsSorted : sequence
        Participant ids in ranking order.
    participantRankings : iterable of iterables of pandas.Series
        Shuffled per-participant values, grouped in batches (flattened here).
    participantRankingsTrue : pandas.Series
        Observed per-participant values.
    numDts : int, optional
        Largest rank distance between the two participants of a pair.

    Returns
    -------
    dict
        ``{(pA, pB): (pval, randvals, trueval, pvalAdjusted)}`` where ``pval``
        is the fraction of shuffled differences strictly greater than the
        observed one and ``pvalAdjusted`` is its Benjamini-Hochberg
        ('fdr_bh') adjusted value across all pairs.
    """
    # Flatten the batches once instead of once per pair
    shuffledRankings = [x for y in participantRankings for x in y]

    pvals = {}
    for dt in range(1, 1 + numDts):
        for c in range(len(participantsSorted) - dt):
            pA = participantsSorted[c]
            pB = participantsSorted[c + dt]

            randvals = [getLandingDiff(x, pA, pB) for x in shuffledRankings]
            trueval = getLandingDiff(participantRankingsTrue, pA, pB)

            # Compare as an array: `list > scalar` only works when the scalar
            # is a numpy type and raises TypeError when trueval is the plain
            # int 0 (both participants missing from the observed ranking).
            pval = np.mean(np.asarray(randvals) > trueval)
            pvals[(pA, pB)] = (pval, randvals, trueval)

    pairs = list(pvals)
    if not pairs:
        # Nothing to test (fewer than two participants)
        return {}

    adj = statsmodels.stats.multitest.multipletests(
        [pvals[pair][0] for pair in pairs], alpha=0.05, method='fdr_bh')[1]

    return {pair: pvals[pair] + (adj[i],) for i, pair in enumerate(pairs)}

# Pairwise randomization tests for the landing-count and the occupancy measure
testsL = randomizationTest(
    participantsSorted=participantsSorted,
    participantRankings=participantRankingsL,
    participantRankingsTrue=participantRankingsTrueL)
testsO = randomizationTest(
    participantsSorted=participantsSorted,
    participantRankings=participantRankingsO,
    participantRankingsTrue=participantRankingsTrueO)

In [None]:
def plotRankingTests(tests, MODE):
    """Plot the randomization-test histograms for all participant pairs.

    Row ``dt - 1`` of the figure holds the pairs that are ``dt`` places apart
    in the notebook-level ``participantsSorted`` ordering. Each panel shows
    the histogram of shuffled differences, a red line at the observed
    difference, and the raw and adjusted p-values. The figure is saved as
    'LandingPrefs_Ranking_Randomization_Tests_<MODE>.pdf'.

    Parameters
    ----------
    tests : dict
        ``{(pA, pB): (pval, randvals, trueval, pvaladj)}`` as returned by
        ``randomizationTest``.
    MODE : str
        'LANDINGS' or anything else (labelled as occupancy); also used in the
        output file name.

    Returns
    -------
    dict
        ``{(pA, pB): (trueval, histResult)}`` where ``histResult`` is the
        return value of ``Axes.hist`` for that panel.
    """
    numDts = 5
    numCols = len(participantsSorted) - 1
    measureLabel = 'Landing' if MODE == 'LANDINGS' else 'Occupancy'

    # squeeze=False keeps `ax` two-dimensional even with a single column
    # (two participants), where plt.subplots would otherwise return a 1-D array
    fig, ax = plt.subplots(numDts, numCols, figsize=(14, 1.75 * numDts),
                           facecolor='white', squeeze=False)

    plotted = {}

    for dti, dt in enumerate(range(1, 1 + numDts)):
        numPairs = len(participantsSorted) - dt
        for c in range(numPairs):
            pA = participantsSorted[c]
            pB = participantsSorted[c + dt]
            panel = ax[dti, c]

            pval, randvals, trueval, pvaladj = tests[(pA, pB)]

            plotted[(pA, pB)] = (trueval, panel.hist(
                randvals, facecolor='#666666', bins=100,
                fill=True, histtype='step', edgecolor='#666666'))
            panel.axvline(trueval, c='red')
            panel.set_title('Human #{} vs. #{}'.format(pA, pB))

            # Raw p-value at the top of the panel, adjusted p-value mid-height
            for yPos, label in ((0.95, 'p=\n{:.3f}'.format(pval)),
                                (0.5, 'adj. p=\n{:.3f}'.format(pvaladj))):
                panel.text(0.05, yPos, label,
                           horizontalalignment='left',
                           verticalalignment='top',
                           transform=panel.transAxes,
                           fontsize=12)

            # Only the first panel of a row keeps its y labels,
            # only the last one its x labels
            if c == 0:
                panel.set_ylabel('Occurrences')
            else:
                panel.yaxis.set_ticklabels([])
            if c == numPairs - 1:
                panel.set_xlabel('{} Percentage Difference\n(First - Second)'.format(measureLabel))
            else:
                panel.xaxis.set_ticklabels([])

            panel.set_xlim(-25, 25)
            panel.set_ylim(0, 250)

        # Hide the unused panels at the end of the row
        for c in range(numPairs, numCols):
            ax[dti, c].set_axis_off()

    fig.tight_layout()
    fig.subplots_adjust(left=0.06, bottom=0.1, right=0.94, top=0.94, wspace=0.1, hspace=0.25)
    fig.savefig('LandingPrefs_Ranking_Randomization_Tests_{}.pdf'.format(MODE))

    return plotted

In [None]:
# Draw and save the randomization-test figure for each measure, keeping the
# returned histogram data for export
plottedL = plotRankingTests(tests=testsL, MODE='LANDINGS')
plottedO = plotRankingTests(tests=testsO, MODE='OCCUPANCY')

### Store histogram data

In [None]:
# Flatten the plotted histograms (bin centres and counts) and the observed
# differences into one long table
plottedData = []
for measure, pl in (('landings', plottedL), ('occupancy', plottedO)):
    for (pA, pB), (trueval, histResult) in pl.items():
        counts, edges = histResult[0], histResult[1]

        # Bin centres from consecutive edges; [:-1] instead of a hard-coded
        # 100 keeps this correct if the number of histogram bins changes
        centers = 0.5 * edges[1:] + 0.5 * edges[:-1]

        for binn, count in zip(centers, counts):
            plottedData.append((measure, 'randomized', pA, pB, binn, int(count)))
        plottedData.append((measure, 'observed', pA, pB, trueval, 1))

plottedData = pd.DataFrame(plottedData, columns=['measure', 'randomized_or_observed', 'humanA',
    'humanB', 'percent_difference_histogram_bin', 'count'])

plottedData.to_csv('LandingPrefs_Randomization_Tests_Histogram_Data.csv')