In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import os

# Root of the project data. Defaults to the original location, but can be
# overridden with the NEUROGEN_DATA_PATH environment variable so the notebook
# can run on another machine without editing this cell.
DATA_PATH = Path(os.environ.get('NEUROGEN_DATA_PATH', '/home/engaclew/neurogen'))
METADATA_PATH = DATA_PATH / 'data/L3_HIPAA_LENA_cleaned/metadata'

# Read measures; missing values are replaced with 0.
human_measures = pd.read_csv(DATA_PATH / 'human_measures_chunks.csv').fillna(0)

# Read metadata
children = pd.read_csv(METADATA_PATH / 'children.csv')
recordings = pd.read_csv(METADATA_PATH / 'recordings.csv')
# Join recordings to children on child_id, keeping only the columns needed downstream.
recordings_data = recordings.merge(children, on='child_id')[['group_id', 'recording_filename', 'child_sex']]
# Left join: every measures row is kept, even if its recording has no metadata match.
human_measures = human_measures.merge(recordings_data, how='left', on='recording_filename')

def compute_CVC(data):
    """Return ``data`` with a ``CVC`` column added.

    ``CVC`` is the row-wise sum of the ``can_voc_CHI`` and ``non_can_voc_CHI``
    columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that may contain ``can_voc_CHI`` and ``non_can_voc_CHI``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``CVC`` column when both source columns are
        present; otherwise ``data`` itself, unchanged. The input frame is
        never modified.
    """
    required = {'can_voc_CHI', 'non_can_voc_CHI'}
    if not required.issubset(data.columns):
        return data
    # assign() builds a new frame, so the caller's frame is left untouched.
    return data.assign(CVC=data['can_voc_CHI'] + data['non_can_voc_CHI'])

# Add the CVC column (can_voc_CHI + non_can_voc_CHI) to the chunk-level measures.
human_measures = compute_CVC(human_measures)

def group_data(data):
    """Aggregate chunk-level measures into one row per recording.

    If both ``can_voc_CHI`` and ``non_can_voc_CHI`` are present, ``CVC`` is
    (re)computed as their row-wise sum before aggregating. Rows are then
    grouped by ``recording_filename``: count and duration columns are summed,
    while ``group_id`` and ``child_id`` take the first value in each group.

    Parameters
    ----------
    data : pandas.DataFrame
        Chunk-level measures. Every aggregated column (including ``CVC``,
        unless it can be derived) must be present.

    Returns
    -------
    pandas.DataFrame
        One row per ``recording_filename`` (as a regular column), followed by
        the aggregated columns. The input frame is never modified.
    """
    if 'can_voc_CHI' in data.columns and 'non_can_voc_CHI' in data.columns:
        # assign() returns a new frame, so the caller's frame is left untouched.
        data = data.assign(CVC=data['can_voc_CHI'] + data['non_can_voc_CHI'])

    summed_cols = [
        '5s_CTC',
        'voc_dur_chi',
        'voc_dur_och',
        'voc_dur_mal',
        'voc_dur_fem',
        'voc_chi',
        'wc_adu',
        'CVC',
    ]
    first_cols = ['group_id', 'child_id']

    # String aggregation names avoid the pandas FutureWarning raised when a
    # numpy callable such as np.sum is passed to agg().
    aggregations = {col: 'sum' for col in summed_cols}
    aggregations.update({col: 'first' for col in first_cols})

    return data.groupby('recording_filename').agg(aggregations).reset_index()

# Collapse the chunk-level measures into one row per recording.
df = group_data(human_measures)

# Durations are in milliseconds after aggregation; express them in minutes.
duration_cols = ['voc_dur_chi', 'voc_dur_och', 'voc_dur_mal', 'voc_dur_fem']
MS_PER_MINUTE = 1000 * 60
df[duration_cols] = df[duration_cols] / MS_PER_MINUTE

  data = data.groupby('recording_filename').agg({
  data = data.groupby('recording_filename').agg({


In [2]:
# Count female children per group, using one row per child.
one_row_per_child = human_measures.drop_duplicates(subset='child_id')
one_row_per_child.groupby('group_id')['child_sex'].apply(lambda sexes: sexes.eq('f').sum())

group_id
angelman_syndrome     6
autism_sibling        7
down_syndrome         4
fragile_x_syndrome    3
low_risk              4
Name: child_sex, dtype: int64

In [5]:
# Display the chunk-level measures table (pandas renders a truncated preview).
human_measures

Unnamed: 0,recording_filename,segment_onset,segment_offset,child_id,duration_eaf/an1,voc_dur_chi,voc_dur_mal,voc_dur_fem,voc_dur_och,voc_chi,...,wc_mal,wc_adu,5s_CTC,non_can_voc_CHI,can_voc_CHI,can_voc_dur_CHI,cp_dur,group_id,child_sex,CVC
0,20190828_145547_024884_2.wav,28611000,28731000,5061,120000,11317.0,0.0,35828.0,0.0,15,...,0,132,29,11,4,4028.0,0.355925,angelman_syndrome,f,15
1,20211022_110857_045737_2.wav,49860000,49980000,6161,120000,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0.0,0.000000,fragile_x_syndrome,f,0
2,20180808_110630_024879.wav,2202000,2322000,3501,120000,7945.0,0.0,9164.0,0.0,10,...,0,34,5,10,0,0.0,0.000000,low_risk,m,10
3,20221129_154915_045733_1.wav,2072000,2192000,6641,120000,5540.0,0.0,39229.0,0.0,13,...,0,128,24,7,6,2128.0,0.384116,fragile_x_syndrome,m,13
4,20230110_120835_024882_2.wav,36581000,36701000,7231,120000,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0.0,0.000000,autism_sibling,f,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,20230208_103250_043299_1.wav,49225000,49345000,7261,120000,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0.0,0.000000,fragile_x_syndrome,m,0
746,20230321_124603_024879_1.wav,36581000,36701000,7291,120000,16699.0,47178.0,8202.0,3882.0,19,...,156,196,23,15,3,2038.0,0.125184,autism_sibling,f,18
747,20210624_093317_024883_2.wav,1639000,1759000,5851,120000,1952.0,0.0,9234.0,0.0,4,...,0,28,2,4,0,0.0,0.000000,fragile_x_syndrome,f,4
748,20220712_113417_045737_3.wav,2072000,2192000,7061,120000,502.0,0.0,870.0,6167.0,1,...,0,12,0,1,0,0.0,0.000000,autism_sibling,f,1


In [2]:
import pandas as pd
import numpy as np

# Total speech/vocalization time per recording: sum of the per-speaker duration columns.
df['total_duration'] = df[duration_cols].sum(axis=1)
total = df['total_duration']

# Metric label -> duration column whose share of the total is reported as a percentage.
share_columns = {
    "Key child's vocalizations (%)": 'voc_dur_chi',
    "Other children's vocalizations (%)": 'voc_dur_och',
    'Adult female speech (%)': 'voc_dur_fem',
    'Adult male speech (%)': 'voc_dur_mal',
}

# Insertion order defines the row order of the summary table.
metrics = {'Cumulated speech/vocalization duration (mn)': total}
for label, column in share_columns.items():
    metrics[label] = (df[column] / total) * 100
metrics['Conversational Turn Count'] = df['5s_CTC']
metrics['Adult Word Count'] = df['wc_adu']
metrics['Child Vocalization Count'] = df['CVC']

# One column per statistic (mean, min, max), one row per metric, rounded to 1 decimal.
summary = pd.DataFrame({
    stat: {label: getattr(values, stat.lower())() for label, values in metrics.items()}
    for stat in ('Mean', 'Min', 'Max')
}).round(1)
print(summary)

# Same numbers without labels: one metric per line, values separated by spaces.
for row_values in summary.values:
    print(' '.join(str(value) for value in row_values))

                                              Mean    Min     Max
Cumulated speech/vocalization duration (mn)    6.6    1.3    13.6
Key child's vocalizations (%)                 33.0    4.0    60.8
Other children's vocalizations (%)            13.5    0.0    39.1
Adult female speech (%)                       40.6   10.9    75.4
Adult male speech (%)                         13.0    0.0    72.4
Conversational Turn Count                     93.2    0.0   274.0
Adult Word Count                             827.2  142.0  1991.0
Child Vocalization Count                     142.5    7.0   312.0
6.6 1.3 13.6
33.0 4.0 60.8
13.5 0.0 39.1
40.6 10.9 75.4
13.0 0.0 72.4
93.2 0.0 274.0
827.2 142.0 1991.0
142.5 7.0 312.0


In [3]:
# Group by group_id and calculate statistics (mean / min / max) for each group.
# groupby(sort=False) yields groups in order of first appearance and skips rows
# whose group_id is missing. The per-group frame is named group_df so that it
# does not overwrite the group_data() function defined earlier in the notebook.
group_stats = {}
for group, group_df in df.groupby('group_id', sort=False):
    group_total = group_df['total_duration']

    group_metrics = {
        'Cumulated speech/vocalization duration (mn)': group_total,
        "Key child's vocalizations (%)": (group_df['voc_dur_chi'] / group_total) * 100,
        "Other children's vocalizations (%)": (group_df['voc_dur_och'] / group_total) * 100,
        'Adult female speech (%)': (group_df['voc_dur_fem'] / group_total) * 100,
        'Adult male speech (%)': (group_df['voc_dur_mal'] / group_total) * 100,
        'Conversational Turn Count': group_df['5s_CTC'],
        'Adult Word Count': group_df['wc_adu'],
        'Child Vocalization Count': group_df['CVC']
    }

    # One row per metric, one column per statistic.
    group_stats[group] = pd.DataFrame({
        'Mean': {k: v.mean() for k, v in group_metrics.items()},
        'Min': {k: v.min() for k, v in group_metrics.items()},
        'Max': {k: v.max() for k, v in group_metrics.items()}
    })

# Print statistics for each group
for group, stats in group_stats.items():
    print(f"\nGroup: {group}")
    print("-" * 50)
    print(stats.round(1))


Group: low_risk
--------------------------------------------------
                                              Mean    Min     Max
Cumulated speech/vocalization duration (mn)    6.6    1.3    12.1
Key child's vocalizations (%)                 40.0    4.0    50.6
Other children's vocalizations (%)            14.8    0.0    39.1
Adult female speech (%)                       37.7   10.9    62.0
Adult male speech (%)                          7.5    0.0    26.3
Conversational Turn Count                     94.0   11.0   201.0
Adult Word Count                             721.8  242.0  1266.0
Child Vocalization Count                     142.9    7.0   264.0

Group: down_syndrome
--------------------------------------------------
                                              Mean    Min     Max
Cumulated speech/vocalization duration (mn)    7.1    4.4     9.6
Key child's vocalizations (%)                 25.5   10.3    42.1
Other children's vocalizations (%)            22.2    0.2    36.6
A

In [21]:
# Define the group order we want
group_order = ['low_risk', 'angelman_syndrome', 'down_syndrome', 'fragile_x_syndrome', 'autism_sibling']

# Build one summary frame per group (rows = metrics, columns = '<group>_Mean',
# '<group>_Min', '<group>_Max'), then join them side by side with a single
# concat instead of growing a DataFrame inside the loop. The per-group frame is
# named group_df so that it does not overwrite the group_data() function
# defined earlier in the notebook.
group_summaries = []
for group in group_order:
    group_df = df[df['group_id'] == group]
    group_total = group_df['total_duration']

    group_metrics = {
        'Cumulated speech/vocalization duration (mn)': group_total,
        "Key child's vocalizations (%)": (group_df['voc_dur_chi'] / group_total) * 100,
        "Other children's vocalizations (%)": (group_df['voc_dur_och'] / group_total) * 100,
        'Adult female speech (%)': (group_df['voc_dur_fem'] / group_total) * 100,
        'Adult male speech (%)': (group_df['voc_dur_mal'] / group_total) * 100,
        'Conversational Turn Count': group_df['5s_CTC'],
        'Adult Word Count': group_df['wc_adu'],
        'Child Vocalization Count': group_df['CVC']
    }

    group_summaries.append(pd.DataFrame({
        f'{group}_Mean': {k: v.mean() for k, v in group_metrics.items()},
        f'{group}_Min': {k: v.min() for k, v in group_metrics.items()},
        f'{group}_Max': {k: v.max() for k, v in group_metrics.items()}
    }))

# Columns follow group_order; round all values to 1 decimal place.
all_stats = pd.concat(group_summaries, axis=1).round(1)

# Print space-separated values, one metric per line
for _, row in all_stats.iterrows():
    print(' '.join(map(str, row.values)))

6.0 3.5 10.2 5.6 2.0 8.2 7.3 6.6 8.7 5.5 3.7 7.8 6.8 4.7 8.7
43.2 31.4 50.6 23.7 5.8 41.6 24.7 10.3 42.1 33.7 18.5 60.8 40.7 36.6 46.2
16.4 0.0 39.1 17.0 0.0 36.4 26.7 18.4 36.6 0.9 0.0 3.4 8.9 4.0 11.8
37.0 10.9 61.6 46.9 20.5 70.2 32.1 14.0 53.4 52.4 26.3 75.4 31.3 23.1 42.0
3.4 0.0 9.0 12.4 0.9 24.6 16.5 2.0 36.7 13.0 0.0 39.5 19.0 13.9 25.7
91.4 13.0 201.0 71.4 10.0 201.0 102.2 65.0 190.0 90.2 45.0 169.0 118.0 60.0 187.0
674.2 309.0 1139.0 784.4 142.0 1412.0 998.4 659.0 1252.0 711.2 332.0 1284.0 728.0 519.0 852.0
140.2 46.0 264.0 91.1 31.0 157.0 159.6 59.0 248.0 120.0 39.0 234.0 180.7 103.0 261.0


In [5]:
# List the column names available in the chunk-level measures table.
human_measures.columns

Index(['recording_filename', 'segment_onset', 'segment_offset', 'child_id',
       'duration_eaf/an1', 'voc_dur_chi', 'voc_dur_mal', 'voc_dur_fem',
       'voc_dur_och', 'voc_chi', 'avg_voc_dur_chi', 'wc_fem', 'wc_mal',
       'wc_adu', '5s_CTC', 'non_can_voc_CHI', 'can_voc_CHI',
       'speechlike_pitch', 'nonspeechlike_pitch', 'group_id', 'CVC'],
      dtype='object')