# Bachelors

## Read in the features


In [58]:
# install packages 
import pandas as pd
import numpy as np
import os
import csv

In [59]:
def load_data(filename, index_col=0):
    '''
    This function loads the data 
    '''
    filepath = os.path.join("..",
                            "in",
                            filename)


    try:
        data = pd.read_csv(filepath, sep=",",
                           skipinitialspace=True, 
                           engine='python', 
                           index_col=index_col, 
                           encoding='utf-8', 
                           on_bad_lines='warn' 
                           )   
    
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None
    except pd.errors.ParserError as e:
        print(f"Error parsing the file: {e}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    return data

In [60]:
features = load_data("TurnTakingData_opensmile.csv", index_col=False)

In [61]:
features.sample(frac=1, random_state=42)

Unnamed: 0,Participant,Visit,Interlocutor,StartTime,EndTime,Confidence,Duration,StartTimeSec,EndTimeSec,DurationSec,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
33201,Caleb,4,Parent,143815.0,143840.0,0.422295,25.00,1438.15,1438.40,0.25,...,0.031971,-0.011712,0.108713,4.166667,11.111112,0.030000,0.010000,0.090000,0.000000,-27.213427
23594,AS2,2,Child,21530.0,22855.0,0.887694,13.25,215.30,228.55,0.41,...,0.040975,-0.013121,0.144290,3.096677,3.184231,0.108333,0.080797,0.194000,0.183614,-29.933510
36400,CC,5,Parent,89675.0,89942.0,0.775888,2.67,896.75,899.42,1.06,...,0.029356,0.000888,0.084954,0.751880,1.532567,0.435000,0.078262,0.266667,0.215149,-26.710680
101236,Lester,1,Child,26424.0,26456.0,0.678608,32.00,264.24,264.56,0.32,...,0.000000,0.000000,0.000000,6.451613,4.000000,0.240000,0.000000,0.000000,0.000000,-19.272778
145527,TC,5,Child,47229.0,47310.0,0.977865,81.00,472.29,473.10,0.81,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,Milo,3,Child,120301.0,120315.0,0.880139,14.00,1203.01,1203.15,0.14,...,0.027927,-0.024868,0.157776,0.000000,0.000000,0.000000,0.000000,0.070000,0.000000,-35.544720
103694,Lester,6,Child,100287.0,100368.0,0.963919,81.00,1002.87,1003.68,0.81,...,0.074630,-0.027076,0.990070,6.250000,2.702703,0.350000,0.310000,0.000000,0.000000,-19.679186
131932,RR,5,Parent,11219.0,11292.0,0.531600,73.00,112.19,112.92,0.73,...,0.006223,-0.000660,0.132007,11.111112,4.477612,0.053333,0.028674,0.110000,0.088600,-33.940086
146867,Tim,2,Parent,100869.0,100916.0,0.445618,47.00,1008.69,1009.16,0.47,...,-0.017512,0.004907,0.103895,4.347826,5.000000,0.145000,0.115000,0.060000,0.000000,-20.559052


In [62]:
features.shape

(158886, 145)

In [63]:
# save data
def save_data(df, filename):
    filepath = os.path.join("..", 
                            "in", 
                            filename) 
    df.to_csv(filepath, index=False) 
    print(f"Data saved to {filepath}")

In [64]:
shuffled_na_rows = features[features['jitterLocal_sma3nz_amean'].isna()].sample(frac=1, random_state=42)

round(shuffled_na_rows.describe(),2)
save_data(shuffled_na_rows, "na_rows.csv")
shuffled_na_rows.shape

Data saved to ../in/na_rows.csv


(7831, 145)

In [65]:
shuffled_non_na_rows = features[features['jitterLocal_sma3nz_amean'].notna()].sample(frac=1, random_state=42)
round(shuffled_non_na_rows.describe(),2)
save_data(shuffled_non_na_rows, "non_na_rows.csv")

Data saved to ../in/non_na_rows.csv


In [66]:
shuffled_non_na_rows.shape

(151055, 145)

In [67]:
features.columns.to_list()

['Participant',
 'Visit',
 'Interlocutor',
 'StartTime',
 'EndTime',
 'Confidence',
 'Duration',
 'StartTimeSec',
 'EndTimeSec',
 'DurationSec',
 'ASD',
 'InterviewerSinceLast',
 'InterTurn',
 'Ethnicity',
 'Diagnosis',
 'ASD_check',
 'ASD2',
 'Gender',
 'Birthdate',
 'Age',
 'Total..Understands...Says.',
 'Total..Understands.',
 'Total.of.Both',
 'Age2',
 'ADOS',
 'CARS',
 'CDI1',
 'VinelandStandardScore',
 'VinelandReceptive',
 'VinelandExpressive',
 'VinelandWritten',
 'DailyLivingSkills',
 'Socialization',
 'MotorSkills',
 'MullenRaw',
 'MullenTScore',
 'MullenAge',
 'FineMotorRaw',
 'FineMotorTScore',
 'FIneMotorAge',
 'ReceptiveLanguageRaw',
 'ReceptiveLanguageTScore',
 'ReceptiveLanguageAge',
 'ExpressiveLangRaw',
 'ExpressiveLangTScore',
 'ExpressiveLangAge',
 'EarlyLearningComposite',
 'ADOS1',
 'Mullen1',
 'EL1',
 'ExpressiveTScore1',
 'ExpressiveAge1',
 'StartTimeSec1',
 'Visit2',
 'StartTimeSec2',
 'Socialization1',
 'wav_path',
 'F0semitoneFrom27.5Hz_sma3nz_amean',
 'F0sem

## Investigation

In [68]:
features.shape # 158886

(158886, 145)

In [69]:
# check for missing values - 5005! 

print(features[features['ASD'] == 0.0].isnull().sum().to_string())
print(features[features['ASD'] == 1.0].isnull().sum().to_string())

Participant                                           0
Visit                                                 0
Interlocutor                                          0
StartTime                                             0
EndTime                                               0
Confidence                                            0
Duration                                              0
StartTimeSec                                          0
EndTimeSec                                            0
DurationSec                                           0
ASD                                                   0
InterviewerSinceLast                              38848
InterTurn                                           187
Ethnicity                                          5167
Diagnosis                                          5167
ASD_check                                          5167
ASD2                                               5167
Gender                                          

In [70]:
# Get summary of the numerical
round(features.describe(),2)

Unnamed: 0,Visit,StartTime,EndTime,Confidence,Duration,StartTimeSec,EndTimeSec,DurationSec,ASD,InterviewerSinceLast,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
count,158886.0,158842.0,158842.0,158842.0,158842.0,158842.0,158842.0,158842.0,158842.0,78937.0,...,151055.0,151055.0,151055.0,151055.0,151055.0,151055.0,151055.0,151055.0,151055.0,151055.0
mean,3.53,90682.8,91027.09,0.77,67.02,906.83,910.27,0.97,0.51,6.92,...,0.03,-0.01,0.22,3.57,2.86,0.2,0.1,0.28,0.26,-27.03
std,1.69,51195.57,51189.63,0.14,90.7,511.96,511.9,0.96,0.5,9.77,...,0.02,0.01,0.23,2.84,2.27,0.18,0.11,0.62,0.54,6.82
min,1.0,6.0,49.0,0.1,0.33,0.06,0.49,0.01,0.0,0.01,...,-0.08,-0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-54.88
25%,2.0,47507.25,47812.25,0.68,7.8,475.07,478.12,0.39,0.0,1.77,...,0.02,-0.01,0.08,2.11,1.55,0.1,0.0,0.08,0.0,-31.44
50%,4.0,89977.5,90290.5,0.79,40.0,899.78,902.9,0.74,1.0,3.82,...,0.03,-0.01,0.15,3.08,2.44,0.16,0.08,0.16,0.07,-26.61
75%,5.0,134858.0,135189.75,0.88,96.0,1348.58,1351.9,1.27,1.0,8.25,...,0.04,-0.0,0.28,4.35,3.64,0.24,0.15,0.32,0.28,-21.95
max,6.0,235005.0,235400.0,1.0,12249.0,2350.05,2354.0,122.49,1.0,357.41,...,0.13,0.06,5.51,36.36,33.33,4.52,2.1,163.01,27.97,-2.92


In [71]:
# remove parents:
features = features[features['Interlocutor'] == 'Child']

# get the names of all the participants. 
particiants = features['Participant'].unique() # all the names 
print(f"Number of unique participants in the study: {len(particiants)}") # 68 participants - only supposed to be 67

# see how many rows per person
print(features["Participant"].value_counts().to_string()) # 6 children with less than 10 rows - should be excluded (maybe also gabriel?)


# We need to add the gender and age for two childen (AD and AR)
# This is for the first visit - we could increment with 4 months per visit, but for now, we don't need it. 
updates = {
    ('AD', 1): (40.13, 1.0),  # Age, Gender
    ('AR', 1): (20.1, 1.0)    # Age, Gender
}

# use loc
for (participant, visit), (age, gender) in updates.items():
    features.loc[(features['Participant'] == participant) & (features['Visit'] == visit), ['Age', 'Gender']] = (age, gender)


print(features.head().to_string())

Number of unique participants in the study: 68
Participant
Marius     2196
Milo       1891
Eduardo    1827
JB         1742
Annie      1709
JP         1652
Anthony    1648
DH         1580
CC         1522
AR         1507
John       1452
MC         1433
Lester     1370
CD         1344
Albert     1323
LL         1323
CH         1313
Todd       1307
KM         1304
TC         1293
LD         1276
DK         1247
HG         1243
JG         1239
MD         1238
Johan      1236
Jerry      1234
SA         1217
Ryder      1214
Caleb      1208
VC         1186
Charles    1181
Dirk       1180
Vick       1168
SB         1166
Alfie      1164
MM         1140
BC         1126
Witt       1114
Jason      1100
Kara       1098
AH         1095
DS         1085
AD         1079
SE         1075
RR         1054
Bernie     1049
ES         1045
Luis       1019
Frankie     988
JG2         987
AS2         976
Kevin       972
AZ          939
MJ          917
Jack        911
ST          885
Adam        861
Tina        8

In [72]:
# find amount of participants per visit - only 67 participants
unique_counts = features.groupby(['Visit', 'ASD'])['Participant'].nunique() # 421 visits in total - should be 395 
print(unique_counts)
print(f"total number of visits: {unique_counts.sum()}")
print(features.groupby(['Visit', 'ASD']).size())
print(features.groupby(['ASD']).size())


Visit  ASD
1      0.0    30
       1.0    32
2      0.0    32
       1.0    34
3      0.0    30
       1.0    31
4      0.0    32
       1.0    31
5      0.0    32
       1.0    31
6      0.0    31
       1.0    31
Name: Participant, dtype: int64
total number of visits: 377
Visit  ASD
1      0.0    5486
       1.0    6054
2      0.0    6410
       1.0    8678
3      0.0    6675
       1.0    6374
4      0.0    6590
       1.0    6033
5      0.0    7023
       1.0    6849
6      0.0    6477
       1.0    6864
dtype: int64
ASD
0.0    38661
1.0    40852
dtype: int64


In [73]:
print(features['ASD'].unique())

#How many of each?
#35 td children, 32 autistic 
print(features.groupby('ASD')['Participant'].nunique()) # is ASD 0.0?? Should be 32 ASD and 35 TD

features.shape

[0. 1.]
ASD
0.0    32
1.0    36
Name: Participant, dtype: int64


(79513, 145)

## Cleaning

What needs to be done?: 
- Parents are to be removed (Interlocutor - Parent)
- Riccardo: "If a feature has a 0 value (unless it's pause count or pause duration) the utterance is most likely unanalyzable, so that row should be excluded."
- Remove the 6 participants with few visits. 


In [74]:
from sklearn.preprocessing import LabelEncoder

def clean(df):
    '''
    This function renames the values in the ASD column, and removes all the rows containing parent utterances. 
    Remove the participants with fewer than 10 rows. 
    It makes 'Participant' anonymous and numeric
    Lastly it removes all the rows where jitter is NA. 
    '''
    # rename the asd and td column values
    #value_map = {0.0: 'ASD', 1.0: 'TD'}
    #df["ASD"] = df["ASD"].replace(value_map)

    # remove rows where Interlocutor is 'Parent' or NA
    df = df[df['Interlocutor'] == 'Child']

    # try to just remove the rows with NA in one feature (if NA in this, it has NA alot!):
    df = df.dropna(subset=['jitterLocal_sma3nz_amean']) # 
    df = df.dropna(subset=['ASD'])

    # remove rows where a feature has 0.0 - unanalyzable. (StddevUnvoicedSegmentLength represents the rest) 
    df = df[df['StddevUnvoicedSegmentLength'] != 0.0]
    
    # making 'Participant' anonymous and numeric - use the LabelEncoder
    label_encoder = LabelEncoder()
    df['Participant'] = label_encoder.fit_transform(df['Participant'])
    
    # drop columns from StartTime to F0semitoneFrom27.5Hz_sma3nz_amean - except: ASD, Gender, Age, ExpressiveLangRaw
    columns_to_keep = ['ASD', 'Gender', 'Age', 'ExpressiveLangRaw', 'Duration']
    df_cleaned = df.loc[:, ~df.columns.isin(df.loc[:, 'Interlocutor':'wav_path'].columns) | df.columns.isin(columns_to_keep)]
    
    # make a column whith the ExpressiveLangRaw values from visit 1 (most only has 1 value)
    first_visit = df_cleaned[df_cleaned['Visit'] == 1].groupby('Participant')['ExpressiveLangRaw'].first()
    df_cleaned['ExpressiveLangRaw1'] = df_cleaned['Participant'].map(first_visit)


    print("Shape after removing unnecessary columns:", df_cleaned.shape)    
    return df_cleaned

In [75]:
clean_features = clean(features)  

Shape after removing unnecessary columns: (59377, 96)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['ExpressiveLangRaw1'] = df_cleaned['Participant'].map(first_visit)


In [76]:
unique_c = clean_features.groupby(['Visit', 'ASD'])['Participant'].nunique() 
print(unique_c)
print(f"Number of total visits: {unique_c.sum()}")
print(clean_features.groupby(['Visit', 'ASD']).size())
print(clean_features.groupby(['ASD']).size())
print(clean_features.groupby(['Gender', 'ASD'])['Participant'].nunique())

Visit  ASD
1      0.0    30
       1.0    32
2      0.0    32
       1.0    31
3      0.0    29
       1.0    30
4      0.0    32
       1.0    30
5      0.0    31
       1.0    29
6      0.0    28
       1.0    30
Name: Participant, dtype: int64
Number of total visits: 364
Visit  ASD
1      0.0    4362
       1.0    4617
2      0.0    5157
       1.0    6130
3      0.0    5175
       1.0    4606
4      0.0    5348
       1.0    4507
5      0.0    5422
       1.0    4589
6      0.0    4652
       1.0    4812
dtype: int64
ASD
0.0    30116
1.0    29261
dtype: int64
Gender  ASD
1.0     0.0    26
        1.0    31
2.0     0.0     6
        1.0     4
Name: Participant, dtype: int64


In [77]:
clean_features

Unnamed: 0,Participant,Visit,Duration,ASD,Gender,Age,ExpressiveLangRaw,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,...,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,ExpressiveLangRaw1
7,0,1,23.74,0.0,1.0,40.13,,40.659519,0.131604,35.850307,...,-0.002900,0.088063,1.390645,1.821262,0.158837,0.131736,0.378333,0.672536,-26.992199,
9,0,1,8.15,0.0,1.0,40.13,,38.676754,0.172724,31.886988,...,-0.006561,0.143007,1.719902,1.856436,0.170667,0.185812,0.348000,0.623690,-25.599806,
11,0,1,2.83,0.0,1.0,40.13,,45.456615,0.051397,43.772179,...,-0.001511,0.067914,4.964539,1.805054,0.066000,0.016248,0.468000,0.635623,-38.131786,
13,0,1,3.96,0.0,1.0,40.13,,43.732323,0.146375,35.226803,...,-0.004340,0.046791,2.025316,1.542416,0.073333,0.043076,0.668000,0.885356,-38.917393,
17,0,1,31.76,0.0,1.0,40.13,,42.977459,0.220249,34.966969,...,-0.001777,0.078688,2.078740,1.677215,0.170000,0.183920,0.498372,1.225437,-29.441130,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158876,66,6,3.02,1.0,1.0,62.33,31.0,34.108238,0.122650,33.502441,...,-0.002640,0.109798,2.657807,3.040540,0.153333,0.130128,0.153333,0.227547,-28.029463,16.0
158878,66,6,17.22,1.0,1.0,62.33,31.0,39.396801,0.216041,34.770889,...,-0.005397,0.122169,2.091807,1.631702,0.126786,0.075074,0.482593,0.979704,-29.263926,16.0
158880,66,6,10.34,1.0,1.0,62.33,31.0,37.399639,0.152843,34.794445,...,-0.007668,0.099432,2.129719,1.070039,0.205455,0.150838,0.709091,1.135945,-30.744387,16.0
158882,66,6,57.22,1.0,1.0,62.33,31.0,35.548386,0.144766,33.447212,...,-0.004979,0.100879,0.926412,0.734780,0.110714,0.102153,1.259512,2.467880,-31.588963,16.0


In [78]:
print(clean_features.isnull().sum().to_string()) # now all the extracted features with NA is removed

Participant                                           0
Visit                                                 0
Duration                                              0
ASD                                                   0
Gender                                             1608
Age                                                2469
ExpressiveLangRaw                                 41775
F0semitoneFrom27.5Hz_sma3nz_amean                     0
F0semitoneFrom27.5Hz_sma3nz_stddevNorm                0
F0semitoneFrom27.5Hz_sma3nz_percentile20.0            0
F0semitoneFrom27.5Hz_sma3nz_percentile50.0            0
F0semitoneFrom27.5Hz_sma3nz_percentile80.0            0
F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2              0
F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope           0
F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope         0
F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope          0
F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope        0
loudness_sma3_amean                             

In [95]:
# add missing gender and age and expresseive lang

# so, we add the mean age for the two groups - all boys
clean_features['Age'] = clean_features['Age'].fillna(clean_features['ASD'].apply(lambda x: 21.24 if x == 0.0 else 32.33))

# all boys
clean_features['Gender'] = clean_features['Gender'].fillna(1.0)


clean_features['ExpressiveLangRaw1'] = clean_features['ExpressiveLangRaw1'].fillna(clean_features['ASD'].apply(lambda x: 20.43 if x == 0.0 else 18.78))


In [96]:
print(clean_features.isnull().sum().to_string()) # now all the extracted features with NA is removed

Participant                                           0
Visit                                                 0
Duration                                              0
ASD                                                   0
Gender                                                0
Age                                                   0
ExpressiveLangRaw                                 41775
F0semitoneFrom27.5Hz_sma3nz_amean                     0
F0semitoneFrom27.5Hz_sma3nz_stddevNorm                0
F0semitoneFrom27.5Hz_sma3nz_percentile20.0            0
F0semitoneFrom27.5Hz_sma3nz_percentile50.0            0
F0semitoneFrom27.5Hz_sma3nz_percentile80.0            0
F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2              0
F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope           0
F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope         0
F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope          0
F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope        0
loudness_sma3_amean                             

In [97]:
participants_with_nans = clean_features[clean_features['ExpressiveLangRaw1'].isna()]['Participant'].unique()
participants_with_nans

array([], dtype=int64)

In [98]:

save_data(clean_features, "clean_features.csv")

Data saved to ../in/clean_features.csv


# Visualize participants

In [82]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd

In [83]:
clean_features = pd.read_csv('../in/clean_features.csv')

## Visualize age and gender distribution with PrettyTable

In [84]:
from prettytable import PrettyTable


# Map Gender
#clean_features['Gender'] = clean_features['Gender'].map({2.0: 'Female', 1.0: 'Male'})

# Step 1: Filter for Visit 1 only
visit1_data = clean_features[clean_features['Visit'] == 1]

# Step 2: Group by ASD and Gender, and calculate the number of unique participants, mean age, and mean ExpressiveLangRaw
grouped_data = visit1_data.groupby(['ASD', 'Gender']).agg(
    Participants=('Participant', 'nunique'),
    Mean_Age=('Age', 'mean'),
    Mean_ExpressiveLangRaw=('ExpressiveLangRaw', 'mean')
).round(2)

# Function to create PrettyTable from grouped data
def create_pretty_table(grouped_data):
    # Create table
    table = PrettyTable()
    
    # Add main headers
    table.field_names = ["ASD Group", "Gender", "Participants", "Mean Age", "Mean ExpressiveLangRaw"]

    # Iterate over the grouped data to fill the table
    for (asd_group, gender), row in grouped_data.iterrows():
        table.add_row([asd_group, gender, int(row['Participants']), round(row['Mean_Age'], 2), round(row['Mean_ExpressiveLangRaw'], 2)])

    # Print the formatted table
    print(table)

# Create the pretty table for both groups
create_pretty_table(grouped_data)


+-----------+--------+--------------+----------+------------------------+
| ASD Group | Gender | Participants | Mean Age | Mean ExpressiveLangRaw |
+-----------+--------+--------------+----------+------------------------+
|    0.0    |  1.0   |      26      |  21.24   |         20.43          |
|    0.0    |  2.0   |      4       |  20.32   |         21.87          |
|    1.0    |  1.0   |      28      |  32.33   |         18.78          |
|    1.0    |  2.0   |      4       |  34.36   |         12.92          |
+-----------+--------+--------------+----------+------------------------+


In [85]:
import scipy.stats as stats
import numpy as np


def mean_ci(data, confidence=0.95):
    return stats.t.interval(confidence, len(data)-1, loc=data.mean(), scale=stats.sem(data))

def calculate_mean_ci_by_asd(df, variable, visit=1, confidence=0.95):
    """
    Calculate the mean and confidence intervals for a specified variable by ASD groups.

    """
    visit_data = df[df['Visit'] == visit].dropna(subset=[variable])
    unique_participants = visit_data[['Participant', 'ASD', variable]].drop_duplicates()

    for asd in [0.0, 1.0]:
        group_data = unique_participants[unique_participants['ASD'] == asd][variable]
        mean_value = group_data.mean()
        ci = mean_ci(group_data, confidence)
        
        print(f"ASD Group {asd}: Mean {variable} = {mean_value:.2f}, 95% CI = ({ci[0]:.2f}, {ci[1]:.2f})")

calculate_mean_ci_by_asd(clean_features, 'Age')
calculate_mean_ci_by_asd(clean_features, 'ExpressiveLangRaw')


ASD Group 0.0: Mean Age = 20.99, 95% CI = (19.52, 22.46)
ASD Group 1.0: Mean Age = 32.72, 95% CI = (30.77, 34.67)
ASD Group 0.0: Mean ExpressiveLangRaw = 20.29, 95% CI = (18.42, 22.15)
ASD Group 1.0: Mean ExpressiveLangRaw = 17.56, 95% CI = (14.91, 20.21)


In [86]:
print(unique_participants.sort_values(by='Participant').to_string())

NameError: name 'unique_participants' is not defined