# Overlapping analysis

In [1]:
import os
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
import pandas as pd

from snl_stats_extraction_data import *
from snl_stats_visualization_database import *
DIR, databases_pair_paths, databases_paths, tier_lists, databases, databases_pairs, tiers = get_parameters()

### Parameters

In [2]:
databases_name = [key.replace('_paths','').upper() for key in databases.keys()]
databases_pairs = [key for key in databases_pairs.keys()]
expressions = ["Smiles_0", "Laughs_0"]
# entities = {expression : tier_lists[expression] for expression in expressions}
laughs_intensities = tier_lists['Laughs_0']
smiles_intensities = tier_lists['Smiles_0']

## Pourcentage of overlap

For each pair of file, we compute the percentage of overlap for S&L between the two person in the interaction regardless the entity of the tier studied. 

Here we watch the pourcentage of overlap for each pair of file for list A to list B. (The total duration come from the files of list A)

In [3]:
lstA = {}
lstB = {}
overlapping_segments_dict = {}
for i, database in enumerate(databases_name):
    if database == databases_pairs[i].replace('_pairs', '').upper():
        databases_list = databases_pair_paths[databases_pairs[i]]
        dataset_dict = {}
        for i in range(0, len(databases_list), 2):
            filepath_A = databases_list[i]
            filepath_B = databases_list[i+1]
            pair_file_A = os.path.basename(filepath_A)
            pair_file_B = os.path.basename(filepath_B)

            if pair_file_A and pair_file_B:
                pair_name = f"{pair_file_A}_&_{pair_file_B}"

            pair_dict = {}
            for tier in expressions:
                lstA_tier = get_tier_from_file(filepath_A, tier)
                lstB_tier = get_tier_from_file(filepath_B, tier)

                if tier in lstA:
                    lstA[tier].extend(lstA_tier[tier])
                else:
                    lstA[tier] = lstA_tier[tier]

                if tier in lstB:
                    lstB[tier].extend(lstB_tier[tier])
                else:
                    lstB[tier] = lstB_tier[tier]

                overlapping_segments = get_overlapping_segments(lstA_tier[tier], lstB_tier[tier])
                pair_dict[tier] = {'Segments': overlapping_segments}

            dataset_dict[pair_name] = pair_dict

        overlapping_segments_dict[database] = dataset_dict

dataframes = {}  
for database, dataset_dict in overlapping_segments_dict.items():
    data = [] 
    for pair_name, pair_dict in dataset_dict.items():
        percentages = {}
        for tier, tier_dict in pair_dict.items():
            segments = tier_dict['Segments']
            if not segments:
                percentage = 0 
            else:
                total_duration = 0
                overlap_duration = 0
                for segmentA, segmentB in segments.items():
                    total_duration += segmentA[1] - segmentA[0]
                    for seg in segmentB:
                        if seg[0] > segmentA[0] and seg[1] < segmentA[1]:
                            overlap_duration += seg[1] - seg[0]
                        elif seg[0] < segmentA[0] and seg[1] > segmentA[1]:
                            overlap_duration += segmentA[1] - segmentA[0]
                        elif seg[0] < segmentA[0] and seg[1] < segmentA[1]:
                            overlap_duration += seg[1] - segmentA[0]
                        elif seg[0] > segmentA[0] and seg[1] > segmentA[1]:
                            overlap_duration += segmentA[1] - seg[0]
                    percentage = overlap_duration / total_duration * 100
            percentages[tier] = percentage
        
        data.append({'Pairs filenames': pair_name, **percentages})

    df = pd.DataFrame(data)
    df = df.applymap(lambda x: f"{x:.5f}%" if isinstance(x, float) else x)
    dataframes[database] = df

for database, df in dataframes.items():
    display(Markdown(f"**Dataset: {database}**"))
    display(df)

**Dataset: CCDB**

Unnamed: 0,Pairs filenames,Smiles_0,Laughs_0
0,P12_P2_1402_dizzy.eaf_&_P12_P2_1402_monk.eaf,95.88244%,40.68441%
1,P12_P3_2202_dizzy.eaf_&_P12_P3_2202_monk.eaf,85.34570%,0.00000%
2,P14_P7_2502_dizzy.eaf_&_P14_P7_2502_monk.eaf,97.94341%,0.00000%
3,P15_P13_2402_dizzy.eaf_&_P15_P13_2402_monk.eaf,95.60331%,16.41791%
4,P16_P6_0903_dizzy.eaf_&_P16_P6_0903_monk.eaf,97.94403%,0.00000%
5,P18_P10_2102_dizzy.eaf_&_P18_P10_2102_monk.eaf,74.07790%,0.00000%
6,P18_P1_2102_dizzy.eaf_&_P18_P1_2102_monk.eaf,85.56386%,0.00000%


**Dataset: IFADV**

Unnamed: 0,Pairs filenames,Smiles_0,Laughs_0
0,DVA1A.eaf_&_DVB1B.eaf,93.28553%,0.00000%
1,DVA2C.eaf_&_DVB2D.eaf,80.58425%,0.00000%
2,DVA3E.eaf_&_DVB3F.eaf,94.11178%,67.18750%
3,DVA4C.eaf_&_DVB4G.eaf,70.10042%,0.00000%
4,DVA5G.eaf_&_DVB5H.eaf,89.24509%,0.00000%
5,DVA6H.eaf_&_DVB6I.eaf,75.31746%,0.00000%
6,DVA7B.eaf_&_DVB7J.eaf,99.99099%,0.00000%
7,DVA8K.eaf_&_DVB8L.eaf,82.83465%,30.96801%


**Dataset: NDC**

Unnamed: 0,Pairs filenames,Smiles_0,Laughs_0
0,13_1_A_M.eaf_&_13_1_B_F.eaf,83.71214%,74.40945%
1,13_2_A_M.eaf_&_13_2_B_F.eaf,95.16508%,0.00000%
2,13_4_A_M.eaf_&_13_4_B_F.eaf,90.73593%,66.43300%
3,14_1_A_M.eaf_&_14_1_B_F.eaf,73.79814%,0.00000%
4,14_2_A_M.eaf_&_14_2_B_F.eaf,91.68541%,96.42857%
5,17_1_A_F.eaf_&_17_1_B_F.eaf,47.71844%,53.30189%
6,17_2_A_F.eaf_&_17_2_B_F.eaf,85.63169%,100.00000%
7,17_3_A_F.eaf_&_17_3_B_F.eaf,72.95328%,47.28305%
8,17_4_A_F.eaf_&_17_4_B_F.eaf,94.63274%,87.27273%
9,18_1_A_M.eaf_&_18_1_B_M.eaf,75.82890%,71.25000%


Now let's look for an overlap between the same intensity of smiles or laughs. For a more datailed analysis, we show here the duration of overlap for each pair of file for list A to list B, the total duration of the tier of list A and the pourcentage of overlap between A and B.

In [4]:
lstA = {}
lstB = {}
overlapping_segments_dict = {}
for i, database in enumerate(databases_name):
    if database == databases_pairs[i].replace('_pairs', '').upper():
        databases_list = databases_pair_paths[databases_pairs[i]]
        dataset_dict = {}
        for i in range(0, len(databases_list), 2):
            filepath_A = databases_list[i]
            filepath_B = databases_list[i+1] 
            pair_file_A = os.path.basename(filepath_A)
            pair_file_B = os.path.basename(filepath_B)

            if pair_file_A and pair_file_B:
                pair_name = f"{pair_file_A}_&_{pair_file_B}"

            pair_dict = {}
            for tier in expressions:
                lstA_tier = get_tier_from_file(filepath_A, tier)
                lstB_tier = get_tier_from_file(filepath_B, tier)

                if tier in lstA:
                    lstA[tier].extend(lstA_tier[tier])
                else:
                    lstA[tier] = lstA_tier[tier]

                if tier in lstB:
                    lstB[tier].extend(lstB_tier[tier])
                else:
                    lstB[tier] = lstB_tier[tier]

                overlapping_segments = get_overlapping_segments(lstA_tier[tier], lstB_tier[tier])
                pair_dict[tier] = {'Segments': overlapping_segments}

            dataset_dict[pair_name] = pair_dict

        overlapping_segments_dict[database] = dataset_dict

dataframes = {}
for database, dataset_dict in overlapping_segments_dict.items():
    tiers = expressions
    data = {tier: [] for tier in tiers}
    for pair_name, pair_dict in dataset_dict.items():
        for tier in tiers:
            segments = pair_dict[tier]['Segments']
            if not segments:
                overlap_duration = 0
                total_duration = 0
                percentage = 0
            else:
                total_duration = 0
                overlap_duration = 0
                for segmentA, segmentB in segments.items():
                    for seg in segmentB:
                        if seg[2].replace(" ", "") == segmentA[2].replace(" ", ""):
                            if seg[0] > segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - seg[0]
                            elif seg[0] < segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - segmentA[0]
                            elif seg[0] < segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - segmentA[0]
                            elif seg[0] > segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - seg[0]
                            total_duration += segmentA[1] - segmentA[0]
                if total_duration == 0:
                    percentage = 0
                else:
                    percentage = overlap_duration / total_duration * 100
            data[tier].append({
                'Pairs filenames': pair_name,
                f'Overlap Percentage for {tier} (%)': percentage,
                f'Total Tier Duration for {tier} (ms)': total_duration,
                f'Overlap Duration for {tier} (ms)': overlap_duration
            })

    dfs = []
    for tier in tiers:
        df = pd.DataFrame(data[tier])
        dfs.append(df)

    df_merged = pd.concat(dfs, axis=1)
    df_merged = df_merged.loc[:, ~df_merged.columns.duplicated()]
    df_filter = df_merged.filter(like='Overlap Percentage for')
    df_merged = df_merged.drop(df_filter.columns, axis=1)
    df_total = df_merged.sum(numeric_only=True)
    df_total['Pairs filenames'] = 'Total'

    for tier in expressions:
        overlap_duration_col = f'Overlap Duration for {tier} (ms)'
        total_duration_col = f'Total Tier Duration for {tier} (ms)'
        overlap_percentage_col = f'Overlap Percentage for {tier} (%)'
        df_total[overlap_percentage_col] = (df_total[overlap_duration_col] / df_total[total_duration_col]) * 100
    df_merged = pd.concat([df_merged, df_filter], axis=1)
    df_merged = pd.concat([df_merged, pd.DataFrame(df_total).T], ignore_index=True)
    dataframes[database] = df_merged

for database, df in dataframes.items():
    display(Markdown(f"**Database: {database}**"))
    display(df)

**Database: CCDB**

Unnamed: 0,Pairs filenames,Total Tier Duration for Smiles_0 (ms),Overlap Duration for Smiles_0 (ms),Total Tier Duration for Laughs_0 (ms),Overlap Duration for Laughs_0 (ms),Overlap Percentage for Smiles_0 (%),Overlap Percentage for Laughs_0 (%)
0,P12_P2_1402_dizzy.eaf_&_P12_P2_1402_monk.eaf,94540,38680,0,0,40.913899,0.0
1,P12_P3_2202_dizzy.eaf_&_P12_P3_2202_monk.eaf,60135,22040,0,0,36.650869,0.0
2,P14_P7_2502_dizzy.eaf_&_P14_P7_2502_monk.eaf,10645,4505,0,0,42.320338,0.0
3,P15_P13_2402_dizzy.eaf_&_P15_P13_2402_monk.eaf,11685,6828,670,110,58.43389,16.41791
4,P16_P6_0903_dizzy.eaf_&_P16_P6_0903_monk.eaf,21620,9290,0,0,42.969473,0.0
5,P18_P10_2102_dizzy.eaf_&_P18_P10_2102_monk.eaf,19195,9525,0,0,49.622297,0.0
6,P18_P1_2102_dizzy.eaf_&_P18_P1_2102_monk.eaf,51220,16360,0,0,31.940648,0.0
7,Total,269040,107228,670,110,39.855784,16.41791


**Database: IFADV**

Unnamed: 0,Pairs filenames,Total Tier Duration for Smiles_0 (ms),Overlap Duration for Smiles_0 (ms),Total Tier Duration for Laughs_0 (ms),Overlap Duration for Laughs_0 (ms),Overlap Percentage for Smiles_0 (%),Overlap Percentage for Laughs_0 (%)
0,DVA1A.eaf_&_DVB1B.eaf,31755,12290,0,0,38.702567,0.0
1,DVA2C.eaf_&_DVB2D.eaf,107005,30428,0,0,28.436054,0.0
2,DVA3E.eaf_&_DVB3F.eaf,1110,1110,2220,1440,100.0,64.864865
3,DVA4C.eaf_&_DVB4G.eaf,57585,19720,0,0,34.245029,0.0
4,DVA5G.eaf_&_DVB5H.eaf,22470,10180,0,0,45.304851,0.0
5,DVA6H.eaf_&_DVB6I.eaf,11840,5890,0,0,49.746622,0.0
6,DVA7B.eaf_&_DVB7J.eaf,41320,18345,0,0,44.397386,0.0
7,DVA8K.eaf_&_DVB8L.eaf,330,330,12190,3775,100.0,30.968007
8,Total,273415,98293,14410,5215,35.950112,36.190146


**Database: NDC**

Unnamed: 0,Pairs filenames,Total Tier Duration for Smiles_0 (ms),Overlap Duration for Smiles_0 (ms),Total Tier Duration for Laughs_0 (ms),Overlap Duration for Laughs_0 (ms),Overlap Percentage for Smiles_0 (%),Overlap Percentage for Laughs_0 (%)
0,13_1_A_M.eaf_&_13_1_B_F.eaf,2670,2240,0,0,83.895131,0.0
1,13_2_A_M.eaf_&_13_2_B_F.eaf,24110,9477,0,0,39.307341,0.0
2,13_4_A_M.eaf_&_13_4_B_F.eaf,46850,22937,2294,1390,48.958378,60.592851
3,14_1_A_M.eaf_&_14_1_B_F.eaf,9255,5490,0,0,59.319287,0.0
4,14_2_A_M.eaf_&_14_2_B_F.eaf,13055,9750,0,0,74.684029,0.0
5,17_1_A_F.eaf_&_17_1_B_F.eaf,7937,1452,0,0,18.294066,0.0
6,17_2_A_F.eaf_&_17_2_B_F.eaf,56370,25220,0,0,44.74011,0.0
7,17_3_A_F.eaf_&_17_3_B_F.eaf,9500,3910,1490,1420,41.157895,95.302013
8,17_4_A_F.eaf_&_17_4_B_F.eaf,168410,33253,1440,1090,19.745265,75.694444
9,18_1_A_M.eaf_&_18_1_B_M.eaf,50235,14924,1320,1000,29.708371,75.757576


Now, we watch the pourcentage of overlap for each pair of file for list B to list A. (The total duration come from the files of list B)

In [5]:
lstA = {}
lstB = {}
overlapping_segments_dict = {}
for i, database in enumerate(databases_name):
    if database == databases_pairs[i].replace('_pairs', '').upper():
        databases_list = databases_pair_paths[databases_pairs[i]]
        dataset_dict = {}
        for i in range(0, len(databases_list), 2):
            filepath_A = databases_list[i+1]
            filepath_B = databases_list[i] 
            pair_file_A = os.path.basename(filepath_A)
            pair_file_B = os.path.basename(filepath_B)

            if pair_file_A and pair_file_B:
                pair_name = f"{pair_file_A}_&_{pair_file_B}"

            pair_dict = {}
            for tier in expressions:
                lstA_tier = get_tier_from_file(filepath_A, tier)
                lstB_tier = get_tier_from_file(filepath_B, tier)

                if tier in lstA:
                    lstA[tier].extend(lstA_tier[tier])
                else:
                    lstA[tier] = lstA_tier[tier]

                if tier in lstB:
                    lstB[tier].extend(lstB_tier[tier])
                else:
                    lstB[tier] = lstB_tier[tier]

                overlapping_segments = get_overlapping_segments(lstA_tier[tier], lstB_tier[tier])
                pair_dict[tier] = {'Segments': overlapping_segments}

            dataset_dict[pair_name] = pair_dict

        overlapping_segments_dict[database] = dataset_dict

dataframes = {}  
for database, dataset_dict in overlapping_segments_dict.items():
    data = [] 
    for pair_name, pair_dict in dataset_dict.items():
        percentages = {}
        for tier, tier_dict in pair_dict.items():
            segments = tier_dict['Segments']
            if not segments:
                percentage = 0 
            else:
                total_duration = 0
                overlap_duration = 0
                for segmentA, segmentB in segments.items():
                    total_duration += segmentA[1] - segmentA[0]
                    for seg in segmentB:
                        if seg[0] > segmentA[0] and seg[1] < segmentA[1]:
                            overlap_duration += seg[1] - seg[0]
                        elif seg[0] < segmentA[0] and seg[1] > segmentA[1]:
                            overlap_duration += segmentA[1] - segmentA[0]
                        elif seg[0] < segmentA[0] and seg[1] < segmentA[1]:
                            overlap_duration += seg[1] - segmentA[0]
                        elif seg[0] > segmentA[0] and seg[1] > segmentA[1]:
                            overlap_duration += segmentA[1] - seg[0]
                    percentage = overlap_duration / total_duration * 100
            percentages[tier] = percentage
        
        data.append({'Pairs filenames': pair_name, **percentages})

    df = pd.DataFrame(data)
    df = df.applymap(lambda x: f"{x:.5f}%" if isinstance(x, float) else x)
    dataframes[database] = df

for database, df in dataframes.items():
    display(Markdown(f"**Dataset: {database}**"))
    display(df)

**Dataset: CCDB**

Unnamed: 0,Pairs filenames,Smiles_0,Laughs_0
0,P12_P2_1402_monk.eaf_&_P12_P2_1402_dizzy.eaf,88.84152%,100.00000%
1,P12_P3_2202_monk.eaf_&_P12_P3_2202_dizzy.eaf,91.59406%,0.00000%
2,P14_P7_2502_monk.eaf_&_P14_P7_2502_dizzy.eaf,70.10125%,0.00000%
3,P15_P13_2402_monk.eaf_&_P15_P13_2402_dizzy.eaf,45.53073%,9.16667%
4,P16_P6_0903_monk.eaf_&_P16_P6_0903_dizzy.eaf,88.33376%,0.00000%
5,P18_P10_2102_monk.eaf_&_P18_P10_2102_dizzy.eaf,99.96899%,0.00000%
6,P18_P1_2102_monk.eaf_&_P18_P1_2102_dizzy.eaf,99.97024%,0.00000%


**Dataset: IFADV**

Unnamed: 0,Pairs filenames,Smiles_0,Laughs_0
0,DVB1B.eaf_&_DVA1A.eaf,71.47342%,0.00000%
1,DVB2D.eaf_&_DVA2C.eaf,84.23850%,0.00000%
2,DVB3F.eaf_&_DVA3E.eaf,88.13084%,42.90466%
3,DVB4G.eaf_&_DVA4C.eaf,88.95166%,0.00000%
4,DVB5H.eaf_&_DVA5G.eaf,73.10462%,0.00000%
5,DVB6I.eaf_&_DVA6H.eaf,86.03808%,0.00000%
6,DVB7J.eaf_&_DVA7B.eaf,83.88024%,0.00000%
7,DVB8L.eaf_&_DVA8K.eaf,29.30362%,93.32509%


**Dataset: NDC**

Unnamed: 0,Pairs filenames,Smiles_0,Laughs_0
0,13_1_B_F.eaf_&_13_1_A_M.eaf,47.12083%,44.83986%
1,13_2_B_F.eaf_&_13_2_A_M.eaf,63.25786%,0.00000%
2,13_4_B_F.eaf_&_13_4_A_M.eaf,61.05466%,38.35019%
3,14_1_B_F.eaf_&_14_1_A_M.eaf,70.07648%,0.00000%
4,14_2_B_F.eaf_&_14_2_A_M.eaf,67.07475%,52.94118%
5,17_1_B_F.eaf_&_17_1_A_F.eaf,54.12557%,67.93587%
6,17_2_B_F.eaf_&_17_2_A_F.eaf,68.36400%,68.32117%
7,17_3_B_F.eaf_&_17_3_A_F.eaf,32.90797%,100.00000%
8,17_4_B_F.eaf_&_17_4_A_F.eaf,75.65514%,71.00592%
9,18_1_B_M.eaf_&_18_1_A_M.eaf,66.22210%,100.00000%


Now let's look for an overlap between the same intensity of smiles or laughs. For a more datailed analysis, we show here the duration of overlap for each pair of file for list B to list A, the total duration of the tier of list B and the pourcentage of overlap between B and A.

In [6]:
lstA = {}
lstB = {}
overlapping_segments_dict = {}
for i, database in enumerate(databases_name):
    if database == databases_pairs[i].replace('_pairs', '').upper():
        databases_list = databases_pair_paths[databases_pairs[i]]
        dataset_dict = {}
        for i in range(0, len(databases_list), 2):
            filepath_A = databases_list[i+1]
            filepath_B = databases_list[i]
            pair_file_A = os.path.basename(filepath_A)
            pair_file_B = os.path.basename(filepath_B)

            if pair_file_A and pair_file_B:
                pair_name = f"{pair_file_A}_&_{pair_file_B}"

            pair_dict = {}
            for tier in expressions:
                lstA_tier = get_tier_from_file(filepath_A, tier)
                lstB_tier = get_tier_from_file(filepath_B, tier)

                if tier in lstA:
                    lstA[tier].extend(lstA_tier[tier])
                else:
                    lstA[tier] = lstA_tier[tier]

                if tier in lstB:
                    lstB[tier].extend(lstB_tier[tier])
                else:
                    lstB[tier] = lstB_tier[tier]

                overlapping_segments = get_overlapping_segments(lstA_tier[tier], lstB_tier[tier])
                pair_dict[tier] = {'Segments': overlapping_segments}

            dataset_dict[pair_name] = pair_dict

        overlapping_segments_dict[database] = dataset_dict

dataframes = {}
for database, dataset_dict in overlapping_segments_dict.items():
    tiers = expressions
    data = {tier: [] for tier in tiers}
    for pair_name, pair_dict in dataset_dict.items():
        for tier in tiers:
            segments = pair_dict[tier]['Segments']
            if not segments:
                overlap_duration = 0
                total_duration = 0
                percentage = 0
            else:
                total_duration = 0
                overlap_duration = 0
                for segmentA, segmentB in segments.items():
                    for seg in segmentB:
                        if seg[2].replace(" ", "") == segmentA[2].replace(" ", ""):
                            if seg[0] > segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - seg[0]
                            elif seg[0] < segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - segmentA[0]
                            elif seg[0] < segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - segmentA[0]
                            elif seg[0] > segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - seg[0]
                            total_duration += segmentA[1] - segmentA[0]
                if total_duration == 0:
                    percentage = 0
                else:
                    percentage = overlap_duration / total_duration * 100
            data[tier].append({
                'Pairs filenames': pair_name,
                f'Overlap Percentage for {tier} (%)': percentage,
                f'Total Tier Duration for {tier} (ms)': total_duration,
                f'Overlap Duration for {tier} (ms)': overlap_duration
            })

    dfs = []
    for tier in tiers:
        df = pd.DataFrame(data[tier])
        dfs.append(df)

    df_merged = pd.concat(dfs, axis=1)
    df_merged = df_merged.loc[:, ~df_merged.columns.duplicated()]
    df_filter = df_merged.filter(like='Overlap Percentage for')
    df_merged = df_merged.drop(df_filter.columns, axis=1)
    df_total = df_merged.sum(numeric_only=True)
    df_total['Pairs filenames'] = 'Total'

    for tier in expressions:
        overlap_duration_col = f'Overlap Duration for {tier} (ms)'
        total_duration_col = f'Total Tier Duration for {tier} (ms)'
        overlap_percentage_col = f'Overlap Percentage for {tier} (%)'
        df_total[overlap_percentage_col] = (df_total[overlap_duration_col] / df_total[total_duration_col]) * 100
    df_merged = pd.concat([df_merged, df_filter], axis=1)
    df_merged = pd.concat([df_merged, pd.DataFrame(df_total).T], ignore_index=True)
    dataframes[database] = df_merged

for database, df in dataframes.items():
    display(Markdown(f"**Database: {database}**"))
    display(df)

**Database: CCDB**

Unnamed: 0,Pairs filenames,Total Tier Duration for Smiles_0 (ms),Overlap Duration for Smiles_0 (ms),Total Tier Duration for Laughs_0 (ms),Overlap Duration for Laughs_0 (ms),Overlap Percentage for Smiles_0 (%),Overlap Percentage for Laughs_0 (%)
0,P12_P2_1402_monk.eaf_&_P12_P2_1402_dizzy.eaf,98415,38680,0,0,39.302952,0.0
1,P12_P3_2202_monk.eaf_&_P12_P3_2202_dizzy.eaf,56605,22040,0,0,38.93649,0.0
2,P14_P7_2502_monk.eaf_&_P14_P7_2502_dizzy.eaf,16290,4505,0,0,27.655003,0.0
3,P15_P13_2402_monk.eaf_&_P15_P13_2402_dizzy.eaf,49191,6828,1200,110,13.880588,9.166667
4,P16_P6_0903_monk.eaf_&_P16_P6_0903_dizzy.eaf,24005,9290,0,0,38.700271,0.0
5,P18_P10_2102_monk.eaf_&_P18_P10_2102_dizzy.eaf,19980,9525,0,0,47.672673,0.0
6,P18_P1_2102_monk.eaf_&_P18_P1_2102_dizzy.eaf,54270,16360,0,0,30.145568,0.0
7,Total,318756,107228,1200,110,33.639524,9.166667


**Database: IFADV**

Unnamed: 0,Pairs filenames,Total Tier Duration for Smiles_0 (ms),Overlap Duration for Smiles_0 (ms),Total Tier Duration for Laughs_0 (ms),Overlap Duration for Laughs_0 (ms),Overlap Percentage for Smiles_0 (%),Overlap Percentage for Laughs_0 (%)
0,DVB1B.eaf_&_DVA1A.eaf,44740,12290,0,0,27.469826,0.0
1,DVB2D.eaf_&_DVA2C.eaf,60476,30428,0,0,50.314174,0.0
2,DVB3F.eaf_&_DVA3E.eaf,1460,1110,3905,1440,76.027397,36.8758
3,DVB4G.eaf_&_DVA4C.eaf,35640,19720,0,0,55.331089,0.0
4,DVB5H.eaf_&_DVA5G.eaf,28840,10180,0,0,35.298197,0.0
5,DVB6I.eaf_&_DVA6H.eaf,35260,5890,0,0,16.704481,0.0
6,DVB7J.eaf_&_DVA7B.eaf,67440,18345,0,0,27.201957,0.0
7,DVB8L.eaf_&_DVA8K.eaf,4030,330,4045,3775,8.188586,93.325093
8,Total,277886,98293,7950,5215,35.371699,65.597484


**Database: NDC**

Unnamed: 0,Pairs filenames,Total Tier Duration for Smiles_0 (ms),Overlap Duration for Smiles_0 (ms),Total Tier Duration for Laughs_0 (ms),Overlap Duration for Laughs_0 (ms),Overlap Percentage for Smiles_0 (%),Overlap Percentage for Laughs_0 (%)
0,13_1_B_F.eaf_&_13_1_A_M.eaf,3740,2240,0,0,59.893048,0.0
1,13_2_B_F.eaf_&_13_2_A_M.eaf,27285,9477,0,0,34.73337,0.0
2,13_4_B_F.eaf_&_13_4_A_M.eaf,68022,22937,1675,1390,33.719973,82.985075
3,14_1_B_F.eaf_&_14_1_A_M.eaf,17930,5490,0,0,30.619074,0.0
4,14_2_B_F.eaf_&_14_2_A_M.eaf,28600,9750,0,0,34.090909,0.0
5,17_1_B_F.eaf_&_17_1_A_F.eaf,6689,1452,0,0,21.707281,0.0
6,17_2_B_F.eaf_&_17_2_A_F.eaf,84570,25220,0,0,29.82145,0.0
7,17_3_B_F.eaf_&_17_3_A_F.eaf,17565,3910,1420,1420,22.260176,100.0
8,17_4_B_F.eaf_&_17_4_A_F.eaf,129920,33253,1480,1090,25.594982,73.648649
9,18_1_B_M.eaf_&_18_1_A_M.eaf,44470,14924,1140,1000,33.559703,87.719298


### Focus on speaker/listener overlap :

We compute the pourcentage of overlap between the speaker and the listener for each pair of file for list A to list B.The total duration is calculate here if both person are speaking or listening at the same time.

In [7]:
lstA = {}
lstB = {}
overlapping_segments_dict = {}
for i, database in enumerate(databases_name):
    if database == databases_pairs[i].replace('_pairs', '').upper():
        databases_list = databases_pair_paths[databases_pairs[i]]
        dataset_dict = {}
        for i in range(0, len(databases_list), 2):
            filepath_A = databases_list[i]
            filepath_B = databases_list[i+1] 
            pair_file_A = os.path.basename(filepath_A)
            pair_file_B = os.path.basename(filepath_B)

            if pair_file_A and pair_file_B:
                pair_name = f"{pair_file_A}_&_{pair_file_B}"

            pair_dict = {}
            lstA_tier = get_tier_from_file(filepath_A, "Role")
            lstB_tier = get_tier_from_file(filepath_B, "Role")

            if "Role" in lstA:
                lstA["Role"].extend(lstA_tier["Role"])
            else:
                lstA["Role"] = lstA_tier["Role"]

            if "Role" in lstB:
                lstB["Role"].extend(lstB_tier["Role"])
            else:
                lstB["Role"] = lstB_tier["Role"]

            overlapping_segments = get_overlapping_segments(lstA_tier["Role"], lstB_tier["Role"])
            pair_dict["Role"] = {'Segments': overlapping_segments}

            dataset_dict[pair_name] = pair_dict

        overlapping_segments_dict[database] = dataset_dict

dataframes = {}
for database, dataset_dict in overlapping_segments_dict.items():
    tiers = ["Role"]
    data = {tier: [] for tier in tiers}
    for pair_name, pair_dict in dataset_dict.items():
        for tier in tiers:
            segments = pair_dict[tier]['Segments']
            if not segments:
                overlap_duration = 0
                total_duration = 0
                percentage = 0
            else:
                total_duration = 0
                overlap_duration = 0
                for segmentA, segmentB in segments.items():
                    for seg in segmentB:
                        if seg[2].replace(" ", "") == segmentA[2].replace(" ", ""):
                            if seg[0] > segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - seg[0]
                            elif seg[0] < segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - segmentA[0]
                            elif seg[0] < segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - segmentA[0]
                            elif seg[0] > segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - seg[0]
                            total_duration += segmentA[1] - segmentA[0]
                percentage = overlap_duration / total_duration * 100
            data[tier].append({
                'Pairs filenames': pair_name,
                f'Overlap Percentage for {tier} (%)': percentage,
                f'Total Tier Duration for {tier} (ms)': total_duration,
                f'Overlap Duration for {tier} (ms)': overlap_duration
            })

    dfs = []
    for tier in tiers:
        df = pd.DataFrame(data[tier])
        dfs.append(df)

    df_merged = pd.concat(dfs, axis=1)
    df_merged = df_merged.loc[:, ~df_merged.columns.duplicated()]
    df_filter = df_merged.filter(like='Overlap Percentage for')
    df_merged = df_merged.drop(df_filter.columns, axis=1)
    df_total = df_merged.sum(numeric_only=True)
    df_total['Pairs filenames'] = 'Total'

    for tier in ["Role"]:
        overlap_duration_col = f'Overlap Duration for {tier} (ms)'
        total_duration_col = f'Total Tier Duration for {tier} (ms)'
        overlap_percentage_col = f'Overlap Percentage for {tier} (%)'
        df_total[overlap_percentage_col] = (df_total[overlap_duration_col] / df_total[total_duration_col]) * 100
    df_merged = pd.concat([df_merged, df_filter], axis=1)
    df_merged = pd.concat([df_merged, pd.DataFrame(df_total).T], ignore_index=True)
    dataframes[database] = df_merged

for database, df in dataframes.items():
    display(Markdown(f"**Database: {database}**"))
    display(df)

**Database: CCDB**

Unnamed: 0,Pairs filenames,Total Tier Duration for Role (ms),Overlap Duration for Role (ms),Overlap Percentage for Role (%)
0,P12_P2_1402_dizzy.eaf_&_P12_P2_1402_monk.eaf,143600,21090,14.68663
1,P12_P3_2202_dizzy.eaf_&_P12_P3_2202_monk.eaf,209410,6000,2.865193
2,P14_P7_2502_dizzy.eaf_&_P14_P7_2502_monk.eaf,169540,1500,0.884747
3,P15_P13_2402_dizzy.eaf_&_P15_P13_2402_monk.eaf,124600,9540,7.656501
4,P16_P6_0903_dizzy.eaf_&_P16_P6_0903_monk.eaf,18500,2080,11.243243
5,P18_P10_2102_dizzy.eaf_&_P18_P10_2102_monk.eaf,94750,3570,3.76781
6,P18_P1_2102_dizzy.eaf_&_P18_P1_2102_monk.eaf,36040,2230,6.187569
7,Total,796440,46010,5.776957


**Database: IFADV**

Unnamed: 0,Pairs filenames,Total Tier Duration for Role (ms),Overlap Duration for Role (ms),Overlap Percentage for Role (%)
0,DVA1A.eaf_&_DVB1B.eaf,184145,10530,5.71832
1,DVA2C.eaf_&_DVB2D.eaf,216245,22595,10.448797
2,DVA3E.eaf_&_DVB3F.eaf,53540,5410,10.104595
3,DVA4C.eaf_&_DVB4G.eaf,83085,4990,6.005898
4,DVA5G.eaf_&_DVB5H.eaf,221255,9010,4.072224
5,DVA6H.eaf_&_DVB6I.eaf,141130,7040,4.988309
6,DVA7B.eaf_&_DVB7J.eaf,86290,5543,6.423688
7,DVA8K.eaf_&_DVB8L.eaf,119570,3400,2.843523
8,Total,1105260,68518,6.199265


**Database: NDC**

Unnamed: 0,Pairs filenames,Total Tier Duration for Role (ms),Overlap Duration for Role (ms),Overlap Percentage for Role (%)
0,13_1_A_M.eaf_&_13_1_B_F.eaf,67517,14782,21.893745
1,13_2_A_M.eaf_&_13_2_B_F.eaf,52389,2475,4.724274
2,13_4_A_M.eaf_&_13_4_B_F.eaf,77130,11260,14.598729
3,14_1_A_M.eaf_&_14_1_B_F.eaf,207730,3090,1.487508
4,14_2_A_M.eaf_&_14_2_B_F.eaf,261470,11880,4.543542
5,17_1_A_F.eaf_&_17_1_B_F.eaf,480,480,100.0
6,17_2_A_F.eaf_&_17_2_B_F.eaf,460,460,100.0
7,17_3_A_F.eaf_&_17_3_B_F.eaf,112210,4360,3.885572
8,17_4_A_F.eaf_&_17_4_B_F.eaf,195073,2885,1.478934
9,18_1_A_M.eaf_&_18_1_B_M.eaf,160611,8248,5.135389


Let's focus on the speaker part only to see the overlap between two speakers for each pair of file for list A to list B.

In [8]:
lstA = {}
lstB = {}
overlapping_segments_dict = {}
for i, database in enumerate(databases_name):
    if database == databases_pairs[i].replace('_pairs', '').upper():
        databases_list = databases_pair_paths[databases_pairs[i]]
        dataset_dict = {}
        for i in range(0, len(databases_list), 2):
            filepath_A = databases_list[i]
            filepath_B = databases_list[i+1] 
            pair_file_A = os.path.basename(filepath_A)
            pair_file_B = os.path.basename(filepath_B)

            if pair_file_A and pair_file_B:
                pair_name = f"{pair_file_A}_&_{pair_file_B}"

            pair_dict = {}
            lstA_tier = get_tier_from_file(filepath_A, "Role")
            lstB_tier = get_tier_from_file(filepath_B, "Role")

            if "Role" in lstA:
                lstA["Role"].extend(lstA_tier["Role"])
            else:
                lstA["Role"] = lstA_tier["Role"]

            if "Role" in lstB:
                lstB["Role"].extend(lstB_tier["Role"])
            else:
                lstB["Role"] = lstB_tier["Role"]

            overlapping_segments = get_overlapping_segments(lstA_tier["Role"], lstB_tier["Role"])
            pair_dict["Role"] = {'Segments': overlapping_segments}

            dataset_dict[pair_name] = pair_dict

        overlapping_segments_dict[database] = dataset_dict

dataframes = {}
for database, dataset_dict in overlapping_segments_dict.items():
    tiers = ["Role"]
    data = {tier: [] for tier in tiers}
    for pair_name, pair_dict in dataset_dict.items():
        for tier in tiers:
            segments = pair_dict[tier]['Segments']
            if not segments:
                overlap_duration = 0
                total_duration = 0
                percentage = 0
            else:
                total_duration = 0
                overlap_duration = 0
                for segmentA, segmentB in segments.items():
                    for seg in segmentB:
                        if seg[2].replace(" ", "") == "spk" and segmentA[2].replace(" ", "") == "spk":
                            if seg[0] > segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - seg[0]
                            elif seg[0] < segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - segmentA[0]
                            elif seg[0] < segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - segmentA[0]
                            elif seg[0] > segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - seg[0]
                            total_duration += segmentA[1] - segmentA[0]
                if total_duration == 0:
                    percentage = 0
                else:
                    percentage = overlap_duration / total_duration * 100
            data[tier].append({
                'Pairs filenames': pair_name,
                'Overlap Percentage for speaker (%)': percentage,
                'Total Tier Duration for speaker (ms)': total_duration,
                'Overlap Duration for speaker (ms)': overlap_duration
            })

    dfs = []
    for tier in tiers:
        df = pd.DataFrame(data[tier])
        dfs.append(df)

    df_merged = pd.concat(dfs, axis=1)
    df_merged = df_merged.loc[:, ~df_merged.columns.duplicated()]
    df_filter = df_merged.filter(like='Overlap Percentage for')
    df_merged = df_merged.drop(df_filter.columns, axis=1)
    df_total = df_merged.sum(numeric_only=True)
    df_total['Pairs filenames'] = 'Total'

    for tier in ["Role"]:
        overlap_duration_col = 'Overlap Duration for speaker (ms)'
        total_duration_col = 'Total Tier Duration for speaker (ms)'
        overlap_percentage_col = 'Overlap Percentage for speaker (%)'
        df_total[overlap_percentage_col] = (df_total[overlap_duration_col] / df_total[total_duration_col]) * 100
    df_merged = pd.concat([df_merged, df_filter], axis=1)
    df_merged = pd.concat([df_merged, pd.DataFrame(df_total).T], ignore_index=True)
    dataframes[database] = df_merged

for database, df in dataframes.items():
    display(Markdown(f"**Database: {database}**"))
    display(df)

**Database: CCDB**

Unnamed: 0,Pairs filenames,Total Tier Duration for speaker (ms),Overlap Duration for speaker (ms),Overlap Percentage for speaker (%)
0,P12_P2_1402_dizzy.eaf_&_P12_P2_1402_monk.eaf,140860,20120,14.283686
1,P12_P3_2202_dizzy.eaf_&_P12_P3_2202_monk.eaf,209410,6000,2.865193
2,P14_P7_2502_dizzy.eaf_&_P14_P7_2502_monk.eaf,158130,1270,0.803137
3,P15_P13_2402_dizzy.eaf_&_P15_P13_2402_monk.eaf,72450,5985,8.26087
4,P16_P6_0903_dizzy.eaf_&_P16_P6_0903_monk.eaf,10670,2070,19.400187
5,P18_P10_2102_dizzy.eaf_&_P18_P10_2102_monk.eaf,88850,3535,3.978616
6,P18_P1_2102_dizzy.eaf_&_P18_P1_2102_monk.eaf,7150,1830,25.594406
7,Total,687520,40810,5.935827


**Database: IFADV**

Unnamed: 0,Pairs filenames,Total Tier Duration for speaker (ms),Overlap Duration for speaker (ms),Overlap Percentage for speaker (%)
0,DVA1A.eaf_&_DVB1B.eaf,159545,4445,2.786048
1,DVA2C.eaf_&_DVB2D.eaf,205495,19790,9.630405
2,DVA3E.eaf_&_DVB3F.eaf,23260,3370,14.488392
3,DVA4C.eaf_&_DVB4G.eaf,52305,4760,9.100468
4,DVA5G.eaf_&_DVB5H.eaf,217950,8270,3.794448
5,DVA6H.eaf_&_DVB6I.eaf,127450,6835,5.362887
6,DVA7B.eaf_&_DVB7J.eaf,64680,4418,6.83055
7,DVA8K.eaf_&_DVB8L.eaf,108450,2540,2.342093
8,Total,959135,54428,5.674696


**Database: NDC**

Unnamed: 0,Pairs filenames,Total Tier Duration for speaker (ms),Overlap Duration for speaker (ms),Overlap Percentage for speaker (%)
0,13_1_A_M.eaf_&_13_1_B_F.eaf,48845,9395,19.234313
1,13_2_A_M.eaf_&_13_2_B_F.eaf,20750,2100,10.120482
2,13_4_A_M.eaf_&_13_4_B_F.eaf,48120,10085,20.958022
3,14_1_A_M.eaf_&_14_1_B_F.eaf,179010,2440,1.363052
4,14_2_A_M.eaf_&_14_2_B_F.eaf,41040,6330,15.423977
5,17_1_A_F.eaf_&_17_1_B_F.eaf,480,480,100.0
6,17_2_A_F.eaf_&_17_2_B_F.eaf,460,460,100.0
7,17_3_A_F.eaf_&_17_3_B_F.eaf,94290,840,0.890869
8,17_4_A_F.eaf_&_17_4_B_F.eaf,31878,950,2.980112
9,18_1_A_M.eaf_&_18_1_B_M.eaf,112141,3663,3.266424


Let's focus now on the listener part only to see the overlap between two listeners for each pair of file for list A to list B.

In [9]:
lstA = {}
lstB = {}
overlapping_segments_dict = {}
for i, database in enumerate(databases_name):
    if database == databases_pairs[i].replace('_pairs', '').upper():
        databases_list = databases_pair_paths[databases_pairs[i]]
        dataset_dict = {}
        for i in range(0, len(databases_list), 2):
            filepath_A = databases_list[i]
            filepath_B = databases_list[i+1] 
            pair_file_A = os.path.basename(filepath_A)
            pair_file_B = os.path.basename(filepath_B)

            if pair_file_A and pair_file_B:
                pair_name = f"{pair_file_A}_&_{pair_file_B}"

            pair_dict = {}
            lstA_tier = get_tier_from_file(filepath_A, "Role")
            lstB_tier = get_tier_from_file(filepath_B, "Role")

            if "Role" in lstA:
                lstA["Role"].extend(lstA_tier["Role"])
            else:
                lstA["Role"] = lstA_tier["Role"]

            if "Role" in lstB:
                lstB["Role"].extend(lstB_tier["Role"])
            else:
                lstB["Role"] = lstB_tier["Role"]

            overlapping_segments = get_overlapping_segments(lstA_tier["Role"], lstB_tier["Role"])
            pair_dict["Role"] = {'Segments': overlapping_segments}

            dataset_dict[pair_name] = pair_dict

        overlapping_segments_dict[database] = dataset_dict

dataframes = {}
for database, dataset_dict in overlapping_segments_dict.items():
    tiers = ["Role"]
    data = {tier: [] for tier in tiers}
    for pair_name, pair_dict in dataset_dict.items():
        for tier in tiers:
            segments = pair_dict[tier]['Segments']
            if not segments:
                overlap_duration = 0
                total_duration = 0
                percentage = 0
            else:
                total_duration = 0
                overlap_duration = 0
                for segmentA, segmentB in segments.items():
                    for seg in segmentB:
                        if seg[2].replace(" ", "") == "lsn" and segmentA[2].replace(" ", "") == "lsn":
                            if seg[0] > segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - seg[0]
                            elif seg[0] < segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - segmentA[0]
                            elif seg[0] < segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - segmentA[0]
                            elif seg[0] > segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - seg[0]
                            total_duration += segmentA[1] - segmentA[0]
                if total_duration == 0:
                    percentage = 0
                else:
                    percentage = overlap_duration / total_duration * 100
            data[tier].append({
                'Pairs filenames': pair_name,
                'Overlap Percentage for listener (%)': percentage,
                'Total Tier Duration for listener (ms)': total_duration,
                'Overlap Duration for listener (ms)': overlap_duration
            })

    dfs = []
    for tier in tiers:
        df = pd.DataFrame(data[tier])
        dfs.append(df)

    df_merged = pd.concat(dfs, axis=1)
    df_merged = df_merged.loc[:, ~df_merged.columns.duplicated()]
    df_filter = df_merged.filter(like='Overlap Percentage for')
    df_merged = df_merged.drop(df_filter.columns, axis=1)
    df_total = df_merged.sum(numeric_only=True)
    df_total['Pairs filenames'] = 'Total'

    for tier in ["Role"]:
        overlap_duration_col = 'Overlap Duration for listener (ms)'
        total_duration_col = 'Total Tier Duration for listener (ms)'
        overlap_percentage_col = 'Overlap Percentage for listener (%)'
        df_total[overlap_percentage_col] = (df_total[overlap_duration_col] / df_total[total_duration_col]) * 100
    df_merged = pd.concat([df_merged, df_filter], axis=1)
    df_merged = pd.concat([df_merged, pd.DataFrame(df_total).T], ignore_index=True)
    dataframes[database] = df_merged

for database, df in dataframes.items():
    display(Markdown(f"**Database: {database}**"))
    display(df)

**Database: CCDB**

Unnamed: 0,Pairs filenames,Total Tier Duration for listener (ms),Overlap Duration for listener (ms),Overlap Percentage for listener (%)
0,P12_P2_1402_dizzy.eaf_&_P12_P2_1402_monk.eaf,2740,970,35.40146
1,P12_P3_2202_dizzy.eaf_&_P12_P3_2202_monk.eaf,0,0,0.0
2,P14_P7_2502_dizzy.eaf_&_P14_P7_2502_monk.eaf,11410,230,2.015776
3,P15_P13_2402_dizzy.eaf_&_P15_P13_2402_monk.eaf,52150,3555,6.816874
4,P16_P6_0903_dizzy.eaf_&_P16_P6_0903_monk.eaf,7830,10,0.127714
5,P18_P10_2102_dizzy.eaf_&_P18_P10_2102_monk.eaf,5900,35,0.59322
6,P18_P1_2102_dizzy.eaf_&_P18_P1_2102_monk.eaf,28890,400,1.384562
7,Total,108920,5200,4.774146


**Database: IFADV**

Unnamed: 0,Pairs filenames,Total Tier Duration for listener (ms),Overlap Duration for listener (ms),Overlap Percentage for listener (%)
0,DVA1A.eaf_&_DVB1B.eaf,24600,6085,24.735772
1,DVA2C.eaf_&_DVB2D.eaf,10750,2805,26.093023
2,DVA3E.eaf_&_DVB3F.eaf,30280,2040,6.73712
3,DVA4C.eaf_&_DVB4G.eaf,30780,230,0.747238
4,DVA5G.eaf_&_DVB5H.eaf,3305,740,22.390318
5,DVA6H.eaf_&_DVB6I.eaf,13680,205,1.498538
6,DVA7B.eaf_&_DVB7J.eaf,21610,1125,5.205923
7,DVA8K.eaf_&_DVB8L.eaf,11120,860,7.733813
8,Total,146125,14090,9.642429


**Database: NDC**

Unnamed: 0,Pairs filenames,Total Tier Duration for listener (ms),Overlap Duration for listener (ms),Overlap Percentage for listener (%)
0,13_1_A_M.eaf_&_13_1_B_F.eaf,18672,5387,28.850686
1,13_2_A_M.eaf_&_13_2_B_F.eaf,31639,375,1.185246
2,13_4_A_M.eaf_&_13_4_B_F.eaf,29010,1175,4.050327
3,14_1_A_M.eaf_&_14_1_B_F.eaf,28720,650,2.263231
4,14_2_A_M.eaf_&_14_2_B_F.eaf,220430,5550,2.517806
5,17_1_A_F.eaf_&_17_1_B_F.eaf,0,0,0.0
6,17_2_A_F.eaf_&_17_2_B_F.eaf,0,0,0.0
7,17_3_A_F.eaf_&_17_3_B_F.eaf,17920,3520,19.642857
8,17_4_A_F.eaf_&_17_4_B_F.eaf,163195,1935,1.185698
9,18_1_A_M.eaf_&_18_1_B_M.eaf,48470,4585,9.459459


Let's study the overlap of laughs and smiles between speaker and listener for each pair of file for list A to list B:

In [13]:
lstA = {}
lstB = {}
overlapping_segments_dict = {}
for i, database in enumerate(databases_name):
    if database == databases_pairs[i].replace('_pairs', '').upper():
        databases_list = databases_pair_paths[databases_pairs[i]]
        dataset_dict = {}
        for i in range(0, len(databases_list), 2):
            filepath_A = databases_list[i]
            filepath_B = databases_list[i+1]
            pair_file_A = os.path.basename(filepath_A)
            pair_file_B = os.path.basename(filepath_B)

            if pair_file_A and pair_file_B:
                pair_name = f"{pair_file_A}_&_{pair_file_B}"

            pair_dict = {}
            for tier in expressions + ["Role"]:
                lstA_tier = get_tier_from_file(filepath_A, tier)
                lstB_tier = get_tier_from_file(filepath_B, tier)

                if tier in lstA:
                    lstA[tier].extend(lstA_tier[tier])
                else:
                    lstA[tier] = lstA_tier[tier]

                if tier in lstB:
                    lstB[tier].extend(lstB_tier[tier])
                else:
                    lstB[tier] = lstB_tier[tier]

                overlapping_segments = get_overlapping_segments(lstA_tier[tier], lstB_tier[tier])
                pair_dict[tier] = {'Segments': overlapping_segments}

            dataset_dict[pair_name] = pair_dict

        overlapping_segments_dict[database] = dataset_dict

dataframes = {}
for database, dataset_dict in overlapping_segments_dict.items():
    tiers = expressions
    data = {}
    for pair_name, pair_dict in dataset_dict.items():
        segments = pair_dict["Role"]["Segments"]
        overlap_duration_smiles = 0
        total_duration_smiles = 0
        overlap_duration_laughs = 0
        total_duration_laughs = 0

        for segmentA, segmentB in segments.items():
            for segB in segmentB:
                if (segmentA[2].replace(" ", "") == "spk" and segB[2].replace(" ", "") == "lsn") or (segmentA[2].replace(" ", "") == "lsn" and segB[2].replace(" ", "") == "spk"):
                    segments_A = segmentA
                    segments_B = segB
                    if not segments_A or not segments_B:
                        continue
                    
                    for tier in expressions:
                        segments_tier = pair_dict[tier]["Segments"]
                        for tierA, tierB in segments_tier.items():
                            if tierA[0] < segments_A[1] and tierA[1] > segments_A[0] and tierA[0] < segments_B[1] and tierA[1] > segments_B[0]:
                                if tierA[0] > segments_A[0] and tierA[1] < segments_A[1]:
                                    for B in tierB:
                                        if B[0] < segments_A[1] and B[1] > segments_A[0] and B[0] < segments_B[1] and B[1] > segments_B[0]:
                                            if B[0] > tierA[0] and B[1] < tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += B[1] - B[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += B[1] - B[0]
                                            elif B[0] < tierA[0] and B[1] > tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += tierA[1] - tierA[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += tierA[1] - tierA[0]
                                            elif B[0] < tierA[0] and B[1] < tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += B[1] - tierA[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += B[1] - tierA[0]
                                            elif B[0] > tierA[0] and B[1] > tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += tierA[1] - B[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += tierA[1] - B[0]
                                elif tierA[0] < segments_A[0] and tierA[1] > segments_A[1]:
                                    for B in tierB:
                                        if B[0] < segments_A[1] and B[1] > segments_A[0] and B[0] < segments_B[1] and B[1] > segments_B[0]:
                                            if B[0] > tierA[0] and B[1] < tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += B[1] - B[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += B[1] - B[0]
                                            elif B[0] < tierA[0] and B[1] > tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += tierA[1] - tierA[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += tierA[1] - tierA[0]
                                            elif B[0] < tierA[0] and B[1] < tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += B[1] - tierA[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += B[1] - tierA[0]
                                            elif B[0] > tierA[0] and B[1] > tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += tierA[1] - B[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += tierA[1] - B[0]
                                elif tierA[0] < segments_A[0] and tierA[1] < segments_A[1]:
                                    for B in tierB:
                                        if B[0] < segments_A[1] and B[1] > segments_A[0] and B[0] < segments_B[1] and B[1] > segments_B[0]:
                                            if B[0] > tierA[0] and B[1] < tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += B[1] - B[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += B[1] - B[0]
                                            elif B[0] < tierA[0] and B[1] > tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += tierA[1] - tierA[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += tierA[1] - tierA[0]
                                            elif B[0] < tierA[0] and B[1] < tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += B[1] - tierA[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += B[1] - tierA[0]
                                            elif B[0] > tierA[0] and B[1] > tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += tierA[1] - B[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += tierA[1] - B[0]
                                elif tierA[0] > segments_A[0] and tierA[1] > segments_A[1]:
                                    for B in tierB:
                                        if B[0] < segments_A[1] and B[1] > segments_A[0] and B[0] < segments_B[1] and B[1] > segments_B[0]:
                                            if B[0] > tierA[0] and B[1] < tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += B[1] - B[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += B[1] - B[0]
                                            elif B[0] < tierA[0] and B[1] > tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += tierA[1] - tierA[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += tierA[1] - tierA[0]
                                            elif B[0] < tierA[0] and B[1] < tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += B[1] - tierA[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += B[1] - tierA[0]
                                            elif B[0] > tierA[0] and B[1] > tierA[1]:
                                                if tier == "Smiles_0":
                                                    overlap_duration_smiles += tierA[1] - B[0]
                                                elif tier == "Laughs_0":
                                                    overlap_duration_laughs += tierA[1] - B[0]
                                if tier == "Smiles_0":
                                    total_duration_smiles += tierA[1] - tierA[0]
                                elif tier == "Laughs_0":
                                    total_duration_laughs += tierA[1] - tierA[0]
        percentage_smiles = 0 if total_duration_smiles == 0 else overlap_duration_smiles / total_duration_smiles * 100
        percentage_laughs = 0 if total_duration_laughs == 0 else overlap_duration_laughs / total_duration_laughs * 100
    

        data[pair_name] = {
            'Pairs filenames': pair_name,
            'Overlap Percentage spk/lsn for Smiles_0 (%)': percentage_smiles,
            'Total Tier Duration spk/lsn for Smiles_0 (ms)': total_duration_smiles,
            'Overlap Duration spk/lsn for Smiles_0 (ms)': overlap_duration_smiles,
            'Overlap Percentage spk/lsn for Laughs_0 (%)': percentage_laughs,
            'Total Tier Duration spk/lsn for Laughs_0 (ms)': total_duration_laughs,
            'Overlap Duration spk/lsn for Laughs_0 (ms)': overlap_duration_laughs,
        }
    df = pd.DataFrame(data.values())
    
    df_total = df.sum(numeric_only=True)
    df_total['Pairs filenames'] = 'Total'
    for tier in expressions:
        overlap_duration_col = f'Overlap Duration spk/lsn for {tier} (ms)'
        total_duration_col = f'Total Tier Duration spk/lsn for {tier} (ms)'
        overlap_percentage_col = f'Overlap Percentage spk/lsn for {tier} (%)'
        df_total[overlap_percentage_col] = (df_total[overlap_duration_col] / df_total[total_duration_col]) * 100

    df = pd.concat([df, pd.DataFrame(df_total).T], ignore_index=True)
    dataframes[database] = df

for database, df in dataframes.items():
    display(Markdown(f"**Database: {database}**"))
    display(df)

**Database: CCDB**

Unnamed: 0,Pairs filenames,Overlap Percentage spk/lsn for Smiles_0 (%),Total Tier Duration spk/lsn for Smiles_0 (ms),Overlap Duration spk/lsn for Smiles_0 (ms),Overlap Percentage spk/lsn for Laughs_0 (%),Total Tier Duration spk/lsn for Laughs_0 (ms),Overlap Duration spk/lsn for Laughs_0 (ms)
0,P12_P2_1402_dizzy.eaf_&_P12_P2_1402_monk.eaf,61.511387,214505.0,131945.0,40.684411,1315.0,535.0
1,P12_P3_2202_dizzy.eaf_&_P12_P3_2202_monk.eaf,70.385371,59008.0,41533.0,0.0,0.0,0.0
2,P14_P7_2502_dizzy.eaf_&_P14_P7_2502_monk.eaf,93.91703,44715.0,41995.0,0.0,0.0,0.0
3,P15_P13_2402_dizzy.eaf_&_P15_P13_2402_monk.eaf,78.947765,26515.0,20933.0,0.0,670.0,0.0
4,P16_P6_0903_dizzy.eaf_&_P16_P6_0903_monk.eaf,93.900889,39350.0,36950.0,0.0,0.0,0.0
5,P18_P10_2102_dizzy.eaf_&_P18_P10_2102_monk.eaf,62.382542,80880.0,50455.0,0.0,0.0,0.0
6,P18_P1_2102_dizzy.eaf_&_P18_P1_2102_monk.eaf,61.943087,86975.0,53875.0,0.0,0.0,0.0
7,Total,68.427823,551948.0,377686.0,26.952141,1985.0,535.0


**Database: IFADV**

Unnamed: 0,Pairs filenames,Overlap Percentage spk/lsn for Smiles_0 (%),Total Tier Duration spk/lsn for Smiles_0 (ms),Overlap Duration spk/lsn for Smiles_0 (ms),Overlap Percentage spk/lsn for Laughs_0 (%),Total Tier Duration spk/lsn for Laughs_0 (ms),Overlap Duration spk/lsn for Laughs_0 (ms)
0,DVA1A.eaf_&_DVB1B.eaf,66.160359,80320.0,53140.0,0.0,0.0,0.0
1,DVA2C.eaf_&_DVB2D.eaf,57.707801,168310.0,97128.0,0.0,0.0,0.0
2,DVA3E.eaf_&_DVB3F.eaf,40.433213,19390.0,7840.0,64.864865,2220.0,1440.0
3,DVA4C.eaf_&_DVB4G.eaf,53.639088,88415.0,47425.0,0.0,0.0,0.0
4,DVA5G.eaf_&_DVB5H.eaf,47.040335,57270.0,26940.0,0.0,0.0,0.0
5,DVA6H.eaf_&_DVB6I.eaf,71.779534,46810.0,33600.0,0.0,0.0,0.0
6,DVA7B.eaf_&_DVB7J.eaf,63.992802,144485.0,92460.0,0.0,0.0,0.0
7,DVA8K.eaf_&_DVB8L.eaf,82.834646,6350.0,5260.0,12.03379,31370.0,3775.0
8,Total,59.506502,611350.0,363793.0,15.525454,33590.0,5215.0


**Database: NDC**

Unnamed: 0,Pairs filenames,Overlap Percentage spk/lsn for Smiles_0 (%),Total Tier Duration spk/lsn for Smiles_0 (ms),Overlap Duration spk/lsn for Smiles_0 (ms),Overlap Percentage spk/lsn for Laughs_0 (%),Total Tier Duration spk/lsn for Laughs_0 (ms),Overlap Duration spk/lsn for Laughs_0 (ms)
0,13_1_A_M.eaf_&_13_1_B_F.eaf,77.46845,21157.0,16390.0,64.285714,1820.0,1170.0
1,13_2_A_M.eaf_&_13_2_B_F.eaf,74.781692,77757.0,58148.0,0.0,0.0,0.0
2,13_4_A_M.eaf_&_13_4_B_F.eaf,73.369192,99092.0,72703.0,56.461962,4364.0,2464.0
3,14_1_A_M.eaf_&_14_1_B_F.eaf,73.798137,19865.0,14660.0,0.0,0.0,0.0
4,14_2_A_M.eaf_&_14_2_B_F.eaf,78.934242,60145.0,47475.0,96.428571,560.0,540.0
5,17_1_A_F.eaf_&_17_1_B_F.eaf,26.755809,38515.0,10305.0,53.301887,3180.0,1695.0
6,17_2_A_F.eaf_&_17_2_B_F.eaf,69.842025,96661.0,67510.0,100.0,4680.0,4680.0
7,17_3_A_F.eaf_&_17_3_B_F.eaf,71.809675,25115.0,18035.0,47.283049,6165.0,2915.0
8,17_4_A_F.eaf_&_17_4_B_F.eaf,82.851902,136295.0,112923.0,87.272727,2750.0,2400.0
9,18_1_A_M.eaf_&_18_1_B_M.eaf,52.908305,144053.0,76216.0,71.25,1600.0,1140.0


To calculate the overlapping between list B to list A, we can do the same analysis as before using : 
* filepath_A = databases_list[i+1]
* filepath_B = databases_list[i]

### Change of the total duration

Now the total duration is the total duration of person A speaking regardless of whether the segment overlaps with person B or not.

In [10]:
lstA = {}
lstB = {}
overlapping_segments_dict = {}
total_durations = []
for i, database in enumerate(databases_name):
    if database == databases_pairs[i].replace('_pairs', '').upper():
        databases_list = databases_pair_paths[databases_pairs[i]]
        dataset_dict = {}
        for i in range(0, len(databases_list), 2):
            filepath_A = databases_list[i]
            filepath_B = databases_list[i+1] 
            pair_file_A = os.path.basename(filepath_A)
            pair_file_B = os.path.basename(filepath_B)

            if pair_file_A and pair_file_B:
                pair_name = f"{pair_file_A}_&_{pair_file_B}"

            pair_dict = {}
            lstA_tier = get_tier_from_file(filepath_A, "Role")
            lstB_tier = get_tier_from_file(filepath_B, "Role")

            if "Role" in lstA:
                lstA["Role"].extend(lstA_tier["Role"])
            else:
                lstA["Role"] = lstA_tier["Role"]

            if "Role" in lstB:
                lstB["Role"].extend(lstB_tier["Role"])
            else:
                lstB["Role"] = lstB_tier["Role"]
            total_duration = 0
            for segA in lstA_tier["Role"]:
                if segA[2].replace(" ", "") == "spk":
                    total_duration += segA[1] - segA[0]
            total_durations.append(total_duration)
            overlapping_segments = get_overlapping_segments(lstA_tier["Role"], lstB_tier["Role"])
            pair_dict["Role"] = {'Segments': overlapping_segments}

            dataset_dict[pair_name] = pair_dict

        overlapping_segments_dict[database] = dataset_dict
dataframes = {}
i=0
for database, dataset_dict in overlapping_segments_dict.items():
    tiers = ["Role"]
    data = {tier: [] for tier in tiers}
    for pair_name, pair_dict in dataset_dict.items():
        for tier in tiers:
            segments = pair_dict[tier]['Segments']
            if not segments:
                overlap_duration = 0
                percentage = 0
            else:
                overlap_duration = 0
                for segmentA, segmentB in segments.items():
                    for seg in segmentB:
                        if seg[2].replace(" ", "") == "spk" and segmentA[2].replace(" ", "") == "spk":
                            if seg[0] > segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - seg[0]
                            elif seg[0] < segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - segmentA[0]
                            elif seg[0] < segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - segmentA[0]
                            elif seg[0] > segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - seg[0]
                percentage = overlap_duration / total_durations[i] * 100
            data[tier].append({
                'Pairs filenames': pair_name,
                'Overlap Percentage for speaker (%)': percentage,
                'Total Tier Duration for speaker (ms)': total_durations[i],
                'Overlap Duration for speaker (ms)': overlap_duration
            })
        i+=1
    dfs = []
    for tier in tiers:
        df = pd.DataFrame(data[tier])
        dfs.append(df)

    df_merged = pd.concat(dfs, axis=1)
    df_merged = df_merged.loc[:, ~df_merged.columns.duplicated()]
    df_filter = df_merged.filter(like='Overlap Percentage for')
    df_merged = df_merged.drop(df_filter.columns, axis=1)
    df_total = df_merged.sum(numeric_only=True)
    df_total['Pairs filenames'] = 'Total'

    for tier in ["Role"]:
        overlap_duration_col = 'Overlap Duration for speaker (ms)'
        total_duration_col = 'Total Tier Duration for speaker (ms)'
        overlap_percentage_col = 'Overlap Percentage for speaker (%)'
        df_total[overlap_percentage_col] = (df_total[overlap_duration_col] / df_total[total_duration_col]) * 100
    df_merged = pd.concat([df_merged, df_filter], axis=1)
    df_merged = pd.concat([df_merged, pd.DataFrame(df_total).T], ignore_index=True)
    dataframes[database] = df_merged

for database, df in dataframes.items():
    display(Markdown(f"**Database: {database}**"))
    display(df)

**Database: CCDB**

Unnamed: 0,Pairs filenames,Total Tier Duration for speaker (ms),Overlap Duration for speaker (ms),Overlap Percentage for speaker (%)
0,P12_P2_1402_dizzy.eaf_&_P12_P2_1402_monk.eaf,61940,20120,32.483048
1,P12_P3_2202_dizzy.eaf_&_P12_P3_2202_monk.eaf,55000,6000,10.909091
2,P14_P7_2502_dizzy.eaf_&_P14_P7_2502_monk.eaf,94310,1270,1.346623
3,P15_P13_2402_dizzy.eaf_&_P15_P13_2402_monk.eaf,52040,5985,11.500769
4,P16_P6_0903_dizzy.eaf_&_P16_P6_0903_monk.eaf,10670,2070,19.400187
5,P18_P10_2102_dizzy.eaf_&_P18_P10_2102_monk.eaf,39860,3535,8.86854
6,P18_P1_2102_dizzy.eaf_&_P18_P1_2102_monk.eaf,31950,1830,5.7277
7,Total,345770,40810,11.802643


**Database: IFADV**

Unnamed: 0,Pairs filenames,Total Tier Duration for speaker (ms),Overlap Duration for speaker (ms),Overlap Percentage for speaker (%)
0,DVA1A.eaf_&_DVB1B.eaf,98490,4445,4.513149
1,DVA2C.eaf_&_DVB2D.eaf,85005,19790,23.280983
2,DVA3E.eaf_&_DVB3F.eaf,15600,3370,21.602564
3,DVA4C.eaf_&_DVB4G.eaf,24240,4760,19.636964
4,DVA5G.eaf_&_DVB5H.eaf,53910,8270,15.340382
5,DVA6H.eaf_&_DVB6I.eaf,40340,6835,16.94348
6,DVA7B.eaf_&_DVB7J.eaf,32020,4418,13.797626
7,DVA8K.eaf_&_DVB8L.eaf,46990,2540,5.405405
8,Total,396595,54428,13.723824


**Database: NDC**

Unnamed: 0,Pairs filenames,Total Tier Duration for speaker (ms),Overlap Duration for speaker (ms),Overlap Percentage for speaker (%)
0,13_1_A_M.eaf_&_13_1_B_F.eaf,77980,9395,12.047961
1,13_2_A_M.eaf_&_13_2_B_F.eaf,19940,2100,10.531595
2,13_4_A_M.eaf_&_13_4_B_F.eaf,33785,10085,29.850525
3,14_1_A_M.eaf_&_14_1_B_F.eaf,239430,2440,1.019087
4,14_2_A_M.eaf_&_14_2_B_F.eaf,87140,6330,7.264173
5,17_1_A_F.eaf_&_17_1_B_F.eaf,55757,480,0.860878
6,17_2_A_F.eaf_&_17_2_B_F.eaf,12891,460,3.568381
7,17_3_A_F.eaf_&_17_3_B_F.eaf,95538,840,0.879231
8,17_4_A_F.eaf_&_17_4_B_F.eaf,21719,950,4.37405
9,18_1_A_M.eaf_&_18_1_B_M.eaf,135436,3663,2.704598


Same thing for each pair of file for list B to list A, the total duration of the tier of list B and the pourcentage of overlap between B and A

In [11]:
lstA = {}
lstB = {}
overlapping_segments_dict = {}
total_durations = []
for i, database in enumerate(databases_name):
    if database == databases_pairs[i].replace('_pairs', '').upper():
        databases_list = databases_pair_paths[databases_pairs[i]]
        dataset_dict = {}
        for i in range(0, len(databases_list), 2):
            filepath_A = databases_list[i+1]
            filepath_B = databases_list[i] 
            pair_file_A = os.path.basename(filepath_A)
            pair_file_B = os.path.basename(filepath_B)

            if pair_file_A and pair_file_B:
                pair_name = f"{pair_file_A}_&_{pair_file_B}"

            pair_dict = {}
            lstA_tier = get_tier_from_file(filepath_A, "Role")
            lstB_tier = get_tier_from_file(filepath_B, "Role")

            if "Role" in lstA:
                lstA["Role"].extend(lstA_tier["Role"])
            else:
                lstA["Role"] = lstA_tier["Role"]

            if "Role" in lstB:
                lstB["Role"].extend(lstB_tier["Role"])
            else:
                lstB["Role"] = lstB_tier["Role"]
            total_duration = 0
            for segB in lstB_tier["Role"]:
                if segB[2].replace(" ", "") == "spk":
                    total_duration += segB[1] - segB[0]
            total_durations.append(total_duration)
            overlapping_segments = get_overlapping_segments(lstA_tier["Role"], lstB_tier["Role"])
            pair_dict["Role"] = {'Segments': overlapping_segments}

            dataset_dict[pair_name] = pair_dict

        overlapping_segments_dict[database] = dataset_dict
dataframes = {}
i=0
for database, dataset_dict in overlapping_segments_dict.items():
    tiers = ["Role"]
    data = {tier: [] for tier in tiers}
    for pair_name, pair_dict in dataset_dict.items():
        for tier in tiers:
            segments = pair_dict[tier]['Segments']
            if not segments:
                overlap_duration = 0
                percentage = 0
            else:
                overlap_duration = 0
                for segmentA, segmentB in segments.items():
                    for seg in segmentB:
                        if seg[2].replace(" ", "") == "spk" and segmentA[2].replace(" ", "") == "spk":
                            if seg[0] > segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - seg[0]
                            elif seg[0] < segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - segmentA[0]
                            elif seg[0] < segmentA[0] and seg[1] < segmentA[1]:
                                overlap_duration += seg[1] - segmentA[0]
                            elif seg[0] > segmentA[0] and seg[1] > segmentA[1]:
                                overlap_duration += segmentA[1] - seg[0]
                percentage = overlap_duration / total_durations[i] * 100
            data[tier].append({
                'Pairs filenames': pair_name,
                'Overlap Percentage for speaker (%)': percentage,
                'Total Tier Duration for speaker (ms)': total_durations[i],
                'Overlap Duration for speaker (ms)': overlap_duration
            })
        i+=1
    dfs = []
    for tier in tiers:
        df = pd.DataFrame(data[tier])
        dfs.append(df)

    df_merged = pd.concat(dfs, axis=1)
    df_merged = df_merged.loc[:, ~df_merged.columns.duplicated()]
    df_filter = df_merged.filter(like='Overlap Percentage for')
    df_merged = df_merged.drop(df_filter.columns, axis=1)
    df_total = df_merged.sum(numeric_only=True)
    df_total['Pairs filenames'] = 'Total'

    for tier in ["Role"]:
        overlap_duration_col = 'Overlap Duration for speaker (ms)'
        total_duration_col = 'Total Tier Duration for speaker (ms)'
        overlap_percentage_col = 'Overlap Percentage for speaker (%)'
        df_total[overlap_percentage_col] = (df_total[overlap_duration_col] / df_total[total_duration_col]) * 100
    df_merged = pd.concat([df_merged, df_filter], axis=1)
    df_merged = pd.concat([df_merged, pd.DataFrame(df_total).T], ignore_index=True)
    dataframes[database] = df_merged

for database, df in dataframes.items():
    display(Markdown(f"**Database: {database}**"))
    display(df)

**Database: CCDB**

Unnamed: 0,Pairs filenames,Total Tier Duration for speaker (ms),Overlap Duration for speaker (ms),Overlap Percentage for speaker (%)
0,P12_P2_1402_monk.eaf_&_P12_P2_1402_dizzy.eaf,61940,20120,32.483048
1,P12_P3_2202_monk.eaf_&_P12_P3_2202_dizzy.eaf,55000,6000,10.909091
2,P14_P7_2502_monk.eaf_&_P14_P7_2502_dizzy.eaf,94310,1270,1.346623
3,P15_P13_2402_monk.eaf_&_P15_P13_2402_dizzy.eaf,52040,5985,11.500769
4,P16_P6_0903_monk.eaf_&_P16_P6_0903_dizzy.eaf,10670,2070,19.400187
5,P18_P10_2102_monk.eaf_&_P18_P10_2102_dizzy.eaf,39860,3535,8.86854
6,P18_P1_2102_monk.eaf_&_P18_P1_2102_dizzy.eaf,31950,1830,5.7277
7,Total,345770,40810,11.802643


**Database: IFADV**

Unnamed: 0,Pairs filenames,Total Tier Duration for speaker (ms),Overlap Duration for speaker (ms),Overlap Percentage for speaker (%)
0,DVB1B.eaf_&_DVA1A.eaf,98490,4445,4.513149
1,DVB2D.eaf_&_DVA2C.eaf,85005,19790,23.280983
2,DVB3F.eaf_&_DVA3E.eaf,15600,3370,21.602564
3,DVB4G.eaf_&_DVA4C.eaf,24240,4760,19.636964
4,DVB5H.eaf_&_DVA5G.eaf,53910,8270,15.340382
5,DVB6I.eaf_&_DVA6H.eaf,40340,6835,16.94348
6,DVB7J.eaf_&_DVA7B.eaf,32020,4418,13.797626
7,DVB8L.eaf_&_DVA8K.eaf,46990,2540,5.405405
8,Total,396595,54428,13.723824


**Database: NDC**

Unnamed: 0,Pairs filenames,Total Tier Duration for speaker (ms),Overlap Duration for speaker (ms),Overlap Percentage for speaker (%)
0,13_1_B_F.eaf_&_13_1_A_M.eaf,77980,9395,12.047961
1,13_2_B_F.eaf_&_13_2_A_M.eaf,19940,2100,10.531595
2,13_4_B_F.eaf_&_13_4_A_M.eaf,33785,10085,29.850525
3,14_1_B_F.eaf_&_14_1_A_M.eaf,239430,2440,1.019087
4,14_2_B_F.eaf_&_14_2_A_M.eaf,87140,6330,7.264173
5,17_1_B_F.eaf_&_17_1_A_F.eaf,55757,480,0.860878
6,17_2_B_F.eaf_&_17_2_A_F.eaf,12891,460,3.568381
7,17_3_B_F.eaf_&_17_3_A_F.eaf,95538,840,0.879231
8,17_4_B_F.eaf_&_17_4_A_F.eaf,21719,950,4.37405
9,18_1_B_M.eaf_&_18_1_A_M.eaf,135436,3663,2.704598


Two ways of calculating the total duration have been presented here:
- taking only the segments that have an overlap
- by taking all the segments of the concerned tier


We could also take for a future study, the union of the two segments where the overlapping occurs. I did not consider it necessary to develop it for the moment.

Attention, here I study exclusively the overlapping between role, smiles or laughs but I do not look if there is an overlapping of smiles with laughs.