<a href="https://colab.research.google.com/github/Janina712/RhythmMetrics_Duration/blob/main/3_3_Match_UtteranceType.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **0. Imports & Set-Up**

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import random as random
import os
import math
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Colab-only: mount Google Drive under /content/gdrive.
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
%cd /content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics/2.BreathGroups_Assigned/

/content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics/2.BreathGroups_Assigned


In [None]:
# Load the annotated tables for the reading and frog conditions
# (paths are relative to the current working directory).
reading = pd.read_excel("reading_TextGrid_comb_BG_loop.xlsx")
frog = pd.read_excel("frog_TextGrid_comb_BG_loop.xlsx")

In [None]:
# IDs by condition
IDs_reading = ['24fa']
IDs_frog = ['24fa']

In [None]:
# IDs by group
IDs_pws = ['24fa']
control_IDs = []

# combine both groups
IDs_col = pd.DataFrame(columns =['ID'])
IDs_col['ID'] =['24fa']

# **1. Define Functions**

**1.1 Assign Participant Groups**

In [None]:
def assign_group(df):
  """Prepend a "Group" column and drop silence rows.

  A row is labelled "Control" when its ID contains an underscore and "PWS"
  otherwise. Rows whose ``Type`` is "silence" are removed, the index is
  renumbered from 0 and the spreadsheet artefact column 'Unnamed: 0' is
  dropped when present.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns "ID" and "Type".

  Returns
  -------
  pandas.DataFrame
      New frame with "Group" as first column; the input is not modified.
  """
  # Vectorised labelling: avoids chained assignment (group["Group"][i] = ...),
  # which is unreliable and silently does nothing under pandas copy-on-write,
  # and does not require df to carry a 0..n-1 index.
  is_control = df["ID"].astype(str).str.contains("_", regex=False)
  group = pd.Series(np.where(is_control, "Control", "PWS"),
                    index=df.index, name="Group", dtype=object)

  df_out = pd.concat([group, df], axis=1)
  df_out = df_out[df_out["Type"] != "silence"].reset_index(drop=True)
  # errors="ignore": the column only exists when the sheet was saved with its index
  df_out = df_out.drop(columns=["Unnamed: 0"], errors="ignore")
  return df_out

**1.2 Count Vowels**

In [None]:
def count_vowels(df, condition):
  """Attach the number of vowels in its breath group to every vowel row.

  Parameters
  ----------
  df : pandas.DataFrame
      Sound-level table with the columns "Type", "ID", "Breath.Group" and
      "Group". A pre-existing "Syllables" column is discarded and recomputed.
  condition : str
      "frog" selects the module-level ``IDs_frog`` list; anything else selects
      ``IDs_reading``.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      * the vowel rows (index renumbered from 0) with a trailing "Syllables"
        column holding the vowel count of that row's (ID, Breath.Group);
        rows whose ID is not listed for the condition get NaN;
      * the per-"Group" mean of all numeric columns of that frame.
  """
  if 'Syllables' in df.columns:
    df = df.drop(columns=['Syllables'])
  df_vowels = df[df["Type"] == "vowel"].reset_index(drop=True)

  # reading or interview condition
  IDs_here = IDs_frog if condition == "frog" else IDs_reading

  # Count per (participant, breath group) and broadcast the count back onto the
  # rows by key. The previous implementation built the column separately and
  # glued it on by position, which only lined up when the rows happened to be
  # sorted by participant and breath group (and relied on DataFrame.append,
  # removed in pandas 2.0).
  in_scope = df_vowels["ID"].isin(IDs_here)
  if in_scope.any():
    counts = (df_vowels[in_scope]
              .groupby(["ID", "Breath.Group"])["Breath.Group"]
              .transform("size"))
  else:
    counts = pd.Series(np.nan, index=df_vowels.index)
  df_vowels["Syllables"] = counts  # index-aligned; out-of-scope rows become NaN

  # numeric_only: text columns (ID, Type, ...) cannot be averaged and raise on pandas >= 2
  pre_df_vowel_avg = df_vowels.groupby("Group").mean(numeric_only=True)

  return df_vowels, pre_df_vowel_avg

**1.3 Count Consonants**

In [None]:
def count_consonants(df, condition):
  """Attach the number of consonants in its breath group to every consonant row.

  Parameters
  ----------
  df : pandas.DataFrame
      Sound-level table with the columns "Type", "ID", "Breath.Group" and
      "Group". A pre-existing "Consonants" column is discarded and recomputed.
  condition : str
      "frog" selects the module-level ``IDs_frog`` list; anything else selects
      ``IDs_reading``.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      * the consonant rows (index renumbered from 0) with a trailing
        "Consonants" column holding the consonant count of that row's
        (ID, Breath.Group); rows whose ID is not listed for the condition get NaN;
      * the per-"Group" mean of all numeric columns of that frame.
  """
  if 'Consonants' in df.columns:
    df = df.drop(columns=['Consonants'])
  df_consonants = df[df["Type"] == "consonant"].reset_index(drop=True)

  # reading or interview condition
  IDs_here = IDs_frog if condition == "frog" else IDs_reading

  # Count per (participant, breath group) and broadcast the count back onto the
  # rows by key. The previous implementation attached the counts by position,
  # which only lined up for rows sorted by participant and breath group, and
  # used DataFrame.append (removed in pandas 2.0).
  in_scope = df_consonants["ID"].isin(IDs_here)
  if in_scope.any():
    counts = (df_consonants[in_scope]
              .groupby(["ID", "Breath.Group"])["Breath.Group"]
              .transform("size"))
  else:
    counts = pd.Series(np.nan, index=df_consonants.index)
  df_consonants["Consonants"] = counts  # index-aligned; out-of-scope rows become NaN

  # numeric_only: text columns (ID, Type, ...) cannot be averaged and raise on pandas >= 2
  pre_df_consonant_avg = df_consonants.groupby("Group").mean(numeric_only=True)

  return df_consonants, pre_df_consonant_avg

**1.4 Average Vowel Count Per Participant**

In [None]:
def participant_vowel_avg(df, condition):
  """Average vowels per breath group for each PWS participant and fluency status.

  For every participant that is listed both for the condition
  (``IDs_frog`` / ``IDs_reading``) and in ``IDs_pws``, two rows are produced,
  "Fluent" then "Disfluent". Each value is the mean over that participant's
  breath groups of the per-breath-group mean of "Syllables"; it is NaN when the
  participant has no rows with that fluency status.

  Parameters
  ----------
  df : pandas.DataFrame
      Vowel rows with the columns "ID", "FluencyStatus" ("fluent"/"disfluent"),
      "Breath.Group" and "Syllables".
  condition : str
      "frog" or anything else (treated as reading).

  Returns
  -------
  pandas.DataFrame
      Columns "FluencyStatus", "ID", "Syllables"; empty (same columns) when
      ``df`` is empty or no participant qualifies.

  Notes
  -----
  Fixes relative to the previous version: the fluency subsets are taken from
  the current participant only (not from the whole table), the disfluent value
  is computed from the disfluent rows, missing values are real NaN instead of
  the string 'NAN', and the label list no longer grows quadratically with the
  number of participants. The empty result now uses the same column names as
  the non-empty one.
  """
  columns = ["FluencyStatus", "ID", "Syllables"]
  if len(df) == 0:
    return pd.DataFrame(columns=columns)

  IDs_here = IDs_frog if condition == "frog" else IDs_reading
  IDs_here_pws = [ID for ID in IDs_here if ID in IDs_pws]

  records = []
  for ID in IDs_here_pws:  # loop over participants
    subset_ID = df[df["ID"] == ID]
    for label, status in (("Fluent", "fluent"), ("Disfluent", "disfluent")):
      subset = subset_ID[subset_ID["FluencyStatus"] == status]
      if len(subset) == 0:
        subj_avg = np.nan
      else:
        # mean of per-breath-group means, so every utterance weighs the same
        subj_avg = subset.groupby("Breath.Group")["Syllables"].mean().mean()
      records.append({"FluencyStatus": label, "ID": ID, "Syllables": subj_avg})

  return pd.DataFrame(records, columns=columns)

**1.5 Average Consonant Count Per Participant**

In [None]:
def participant_consonant_avg(df, condition):
  """Average consonants per breath group for each PWS participant and fluency status.

  For every participant that is listed both for the condition
  (``IDs_frog`` / ``IDs_reading``) and in ``IDs_pws``, two rows are produced,
  "Fluent" then "Disfluent". Each value is the mean over that participant's
  breath groups of the per-breath-group mean of "Consonants"; it is NaN when
  the participant has no rows with that fluency status.

  Parameters
  ----------
  df : pandas.DataFrame
      Consonant rows with the columns "ID", "FluencyStatus"
      ("fluent"/"disfluent"), "Breath.Group" and "Consonants".
  condition : str
      "frog" or anything else (treated as reading).

  Returns
  -------
  pandas.DataFrame
      Columns "FluencyStatus", "ID", "Consonants"; empty (same columns) when
      ``df`` is empty or no participant qualifies.

  Notes
  -----
  Fixes relative to the previous version: the fluency subsets are taken from
  the current participant only (not from the whole table), the disfluent value
  is computed from the disfluent rows, missing values are real NaN instead of
  the string 'NAN', and the label list no longer grows quadratically with the
  number of participants. The empty result now uses the same column names as
  the non-empty one (it used to be labelled "Group"/"Syllables").
  """
  columns = ["FluencyStatus", "ID", "Consonants"]
  if len(df) == 0:
    return pd.DataFrame(columns=columns)

  IDs_here = IDs_frog if condition == "frog" else IDs_reading
  IDs_here_pws = [ID for ID in IDs_here if ID in IDs_pws]

  records = []
  for ID in IDs_here_pws:  # loop over participants
    subset_ID = df[df["ID"] == ID]
    for label, status in (("Fluent", "fluent"), ("Disfluent", "disfluent")):
      subset = subset_ID[subset_ID["FluencyStatus"] == status]
      if len(subset) == 0:
        subj_avg = np.nan
      else:
        # mean of per-breath-group means, so every utterance weighs the same
        subj_avg = subset.groupby("Breath.Group")["Consonants"].mean().mean()
      records.append({"FluencyStatus": label, "ID": ID, "Consonants": subj_avg})

  return pd.DataFrame(records, columns=columns)

**1.6 Compare Vowel and Consonant Counts Across Utterance Types**

In [None]:
def compare_groups(df_vowels_fluent, df_vowels_disfluent, df_consonants_fluent, df_consonants_disfluent):
  """Compare mean vowel and consonant counts of fluent vs. disfluent utterances.

  Parameters
  ----------
  df_vowels_fluent, df_vowels_disfluent : pandas.DataFrame
      Participant-level averages with the columns "FluencyStatus" and "Syllables".
  df_consonants_fluent, df_consonants_disfluent : pandas.DataFrame
      Participant-level averages with the columns "FluencyStatus" and "Consonants".

  Returns
  -------
  tuple
      ``(string1, string2, string3, string4, short_v, short_c, difference_v,
      difference_c, longer, shorter)``. The four strings are printable
      summaries; ``short_*`` are the means of the shorter utterance type and
      ``difference_*`` the absolute mean differences. When all four inputs are
      empty, the numeric/label entries are empty lists and ``string1`` says
      that there is no data.

  Notes
  -----
  "longer"/"shorter" are decided on the vowel counts only and are reused for
  the consonant sentence. If only the fluent mean is NaN, disfluent is reported
  as longer; in every other undecidable case fluent is.
  """

  def _mean_count(df, column):
    # Mean of the per-FluencyStatus means of `column`; NaN when unavailable.
    # Values are coerced to numbers first: the averages may arrive as an
    # object column (possibly holding the text 'NAN'), and averaging the whole
    # frame would also hit the text "ID" column, which raises on pandas >= 2.
    if len(df) == 0 or column not in df.columns or "FluencyStatus" not in df.columns:
      return np.nan
    values = pd.to_numeric(df[column], errors="coerce")
    return values.groupby(df["FluencyStatus"]).mean().mean()

  if len(df_vowels_fluent) + len(df_vowels_disfluent) + len(df_consonants_fluent) + len(df_consonants_disfluent) == 0:
    short_v = []
    difference_v = []
    short_c = []
    difference_c = []
    longer = []
    shorter = []
    string1 = (f"There is no data for this condition.")
    string2 = ("")
    string3 = ("")
    string4 = ("")
  else:
    fluent_v = _mean_count(df_vowels_fluent, "Syllables")
    disfluent_v = _mean_count(df_vowels_disfluent, "Syllables")
    difference_v = abs(fluent_v - disfluent_v)

    fluent_c = _mean_count(df_consonants_fluent, "Consonants")
    disfluent_c = _mean_count(df_consonants_disfluent, "Consonants")
    difference_c = abs(fluent_c - disfluent_c)

    # Comparisons with NaN are False, so a missing fluent mean is checked explicitly.
    if disfluent_v > fluent_v or np.isnan(fluent_v):
      longer = 'disfluent'
      shorter = 'fluent'
      short_v = fluent_v
      short_c = fluent_c
    else:
      longer = 'fluent'
      shorter = 'disfluent'
      short_v = disfluent_v
      short_c = disfluent_c

    string1 = (f"PWS produced on average {round(fluent_v,2)} syllables in fluent utterance and {round(disfluent_v,2)} syllables in disflunt utterances.")
    string2 = (f"This means that on average {longer} utterances were {round(difference_v,2)} syllables longer than {shorter} utterances.")
    string3 = (f"\nPWS produced on average {round(fluent_c,2)} consonants in fluent utterance and {round(disfluent_c,2)} consonants in disfluent utterances.")
    string4 = (f"This means that on average {longer} utterances were {round(difference_c,2)} consonants longer than {shorter} utterances.")

  return(string1, string2, string3, string4, short_v, short_c, difference_v, difference_c, longer, shorter)

**1.7 Match Number of Vowels Across Utterance Types**

In [None]:
def match_vowels(df_vowels, short_group, cut_v, condition):
  """Shorten the utterances of the longer fluency type by ``cut_v`` vowels.

  Parameters
  ----------
  df_vowels : pandas.DataFrame
      Vowel rows with the columns "FluencyStatus" ("fluent"/"disfluent"),
      "ID" and "Breath.Group".
  short_group : str
      Fluency type with the shorter utterances. The *other* type is trimmed:
      "disfluent" trims the fluent rows, anything else trims the disfluent rows.
  cut_v : float
      Number of vowels to remove from the end of each breath group (rounded).
  condition : str
      "frog" selects ``IDs_frog``; anything else ``IDs_reading``. Only IDs that
      are also in ``IDs_pws`` are processed.

  Returns
  -------
  pandas.DataFrame
      The trimmed rows of the longer fluency type only (the caller adds the
      untouched shorter type back). Breath groups with fewer than ``cut_v``
      vowels are kept unchanged. If ``df_vowels`` does not contain both fluency
      types, a message is printed and ``df_vowels`` is returned as is.

  Notes
  -----
  Fixes relative to the previous version: the ``else`` branch was attached to
  the participant ``for`` loop instead of the ``if short_group`` test, so the
  "fluent is shorter" case returned an empty frame; rows were selected on the
  "Group" column (PWS/Control) although the values compared are fluency labels;
  DataFrame.append (removed in pandas 2.0) is replaced by a single concat.
  """
  IDs_here = IDs_frog if condition == 'frog' else IDs_reading
  IDs_here_pws = [ID for ID in IDs_here if ID in IDs_pws]

  if not {"fluent", "disfluent"} <= set(df_vowels['FluencyStatus']):
    print(f"Not enough data in {condition} condition. Matching not possible.")
    return df_vowels

  cut_v = int(round(cut_v))
  long_group = "fluent" if short_group == "disfluent" else "disfluent"
  df_long = df_vowels[df_vowels["FluencyStatus"] == long_group]

  trimmed = []
  for ID in IDs_here_pws:                                   # participant
    df_long_ID = df_long[df_long["ID"] == ID]
    for BG in df_long_ID["Breath.Group"].unique():          # breath group
      df_long_ID_BG = df_long_ID[df_long_ID["Breath.Group"] == BG]
      if len(df_long_ID_BG) >= cut_v:
        # drop the last cut_v vowels; slicing avoids an in-place drop on a view
        df_long_ID_BG = df_long_ID_BG.iloc[:len(df_long_ID_BG) - cut_v]
      trimmed.append(df_long_ID_BG)

  if not trimmed:
    # keep the column layout so downstream counting still finds its columns
    return df_long.iloc[0:0].reset_index(drop=True)
  return pd.concat(trimmed, ignore_index=True)

**1.8 Match Number of Consonants Across Utterance Types**

In [None]:
def match_consonants(df_consonants, short_group, cut_c, condition):
  """Shorten the utterances of the longer fluency type by ``cut_c`` consonants.

  Parameters
  ----------
  df_consonants : pandas.DataFrame
      Consonant rows with the columns "FluencyStatus" ("fluent"/"disfluent"),
      "ID" and "Breath.Group".
  short_group : str
      Fluency type with the shorter utterances. The *other* type is trimmed:
      "disfluent" trims the fluent rows, anything else trims the disfluent rows.
  cut_c : float
      Number of consonants to remove from the end of each breath group (rounded).
  condition : str
      "frog" selects ``IDs_frog``; anything else ``IDs_reading``. Only IDs that
      are also in ``IDs_pws`` are processed.

  Returns
  -------
  pandas.DataFrame
      The trimmed rows of the longer fluency type only (the caller adds the
      untouched shorter type back). Breath groups with fewer than ``cut_c``
      consonants are kept unchanged. If ``df_consonants`` does not contain both
      fluency types, a message is printed and ``df_consonants`` is returned as is.

  Notes
  -----
  Fixes relative to the previous version: rows were selected on the "Group"
  column (PWS/Control) although the values compared are fluency labels; the
  "disfluent is shorter" branch iterated the empty ``control_IDs`` list and
  re-appended the accumulated frame once per participant; the other branch
  ignored the condition-specific ID list; DataFrame.append (removed in
  pandas 2.0) is replaced by a single concat.
  """
  IDs_here = IDs_frog if condition == 'frog' else IDs_reading
  IDs_here_pws = [ID for ID in IDs_here if ID in IDs_pws]

  if not {"fluent", "disfluent"} <= set(df_consonants['FluencyStatus']):
    print(f"Not enough data in {condition} condition. Matching not possible.")
    return df_consonants

  cut_c = int(round(cut_c))
  long_group = "fluent" if short_group == "disfluent" else "disfluent"
  df_long = df_consonants[df_consonants["FluencyStatus"] == long_group]

  trimmed = []
  for ID in IDs_here_pws:                                   # participant
    df_long_ID = df_long[df_long["ID"] == ID]
    for BG in df_long_ID["Breath.Group"].unique():          # breath group
      df_long_ID_BG = df_long_ID[df_long_ID["Breath.Group"] == BG]
      if len(df_long_ID_BG) >= cut_c:
        # drop the last cut_c consonants; slicing avoids an in-place drop on a view
        df_long_ID_BG = df_long_ID_BG.iloc[:len(df_long_ID_BG) - cut_c]
      trimmed.append(df_long_ID_BG)

  if not trimmed:
    # keep the column layout so downstream counting still finds its columns
    return df_long.iloc[0:0].reset_index(drop=True)
  return pd.concat(trimmed, ignore_index=True)

**1.9 Count Utterances**

In [None]:
def countUtterances(df):
  """Return how many utterances ``df`` holds.

  An utterance is one distinct "Breath.Group" value within one "ID"; the
  per-participant counts are added up over all IDs found in ``df``.
  """
  return sum(
    len(df.loc[df["ID"] == participant, "Breath.Group"].unique())
    for participant in df["ID"].unique()
  )

**1.10 Test if Remaining Vowel Difference is Significant**

In [None]:
def ttest_vowelDifference(matched_participant_df, shorter_group):
  """Describe and t-test the vowel-count difference that remains after matching.

  Parameters
  ----------
  matched_participant_df : pandas.DataFrame
      Participant-level averages with the columns "FluencyStatus"
      ("Fluent"/"Disfluent") and "Syllables".
  shorter_group : str
      "fluent" if fluent utterances were the shorter ones before matching;
      anything else is treated as "disfluent is shorter".

  Returns
  -------
  str
      A printable summary with both means, their absolute difference and the
      p-value of an independent two-sample t-test (fluent vs. disfluent), or a
      "no data" message for an empty input.

  Notes
  -----
  The previous version overwrote the disfluent sample with the fluent one and
  tested the fluent sample against itself, so it always reported identical
  means. Values are coerced to numbers and missing ones dropped per sample
  (the averages may contain the text 'NAN'); averaging the whole frame is
  avoided because its text columns raise on pandas >= 2.
  """
  if len(matched_participant_df) == 0:
      string11 = ("There is no data for this condition.")
      return(string11)
  longer_group = "disfluent" if shorter_group == "fluent" else "fluent"

  status = matched_participant_df["FluencyStatus"]
  syllables = pd.to_numeric(matched_participant_df["Syllables"], errors="coerce")
  ttest_fluent = syllables[status == "Fluent"].dropna()
  ttest_disfluent = syllables[status == "Disfluent"].dropna()

  fluent_v = ttest_fluent.mean()
  disfluent_v = ttest_disfluent.mean()

  difference_v_matched = fluent_v - disfluent_v

  df_v_matched_test = stats.ttest_ind(ttest_fluent.to_numpy(dtype=float),
                                      ttest_disfluent.to_numpy(dtype=float))
  # a NaN p-value (too few observations) fails both comparisons -> 'undetermined'
  if df_v_matched_test[1] >= 0.05:
    level = "insignificant"
  elif df_v_matched_test[1] < 0.05:
    level = "significant"
  else:
    level = 'undetermined'
  string11 = (f"{longer_group} utterances that were longer than the mean average length of {shorter_group} utterances were shortened. \nAfter matching, PWS had {round(fluent_v,2)} vowels per fluent utterance, while they had {round(disfluent_v,2)} per disfluent utterance. \nThe difference is reduced to {abs(round(difference_v_matched,2))}, which is statistically {level} (p = {round(df_v_matched_test[1],2)}).")

  return(string11)

**1.11 Test if Remaining Consonant Difference is Significant**

In [None]:
def ttest_conDifference(matched_participant_df, shorter_group):
  """Describe and t-test the consonant-count difference that remains after matching.

  Parameters
  ----------
  matched_participant_df : pandas.DataFrame
      Participant-level averages with the columns "FluencyStatus"
      ("Fluent"/"Disfluent") and "Consonants".
  shorter_group : str
      "fluent" if fluent utterances were the shorter ones before matching;
      anything else is treated as "disfluent is shorter".

  Returns
  -------
  str
      A printable summary with both means, their absolute difference and the
      p-value of an independent two-sample t-test (disfluent vs. fluent), or a
      "no data" message for an empty input.

  Notes
  -----
  Values are coerced to numbers and missing ones dropped per sample: the
  averages may contain the text 'NAN', which ``dropna`` does not remove, and
  averaging the whole frame raises on pandas >= 2 because of its text columns.
  """
  if len(matched_participant_df) == 0:
    string11 = ("There is no data for this condition.")
    return(string11)
  longer_group = "disfluent" if shorter_group == "fluent" else "fluent"

  status = matched_participant_df["FluencyStatus"]
  consonants = pd.to_numeric(matched_participant_df["Consonants"], errors="coerce")
  ttest_disfluent = consonants[status == "Disfluent"].dropna()
  ttest_fluent = consonants[status == "Fluent"].dropna()

  disfluent_c = ttest_disfluent.mean()
  fluent_c = ttest_fluent.mean()

  difference_c_matched = disfluent_c - fluent_c

  df_c_matched_test = stats.ttest_ind(ttest_disfluent.to_numpy(dtype=float),
                                      ttest_fluent.to_numpy(dtype=float))
  # a NaN p-value (too few observations) fails both comparisons -> 'undetermined'
  if df_c_matched_test[1] >= 0.05:
    level = "insignificant"
  elif df_c_matched_test[1] < 0.05:
    level = "significant"
  else:
    level = "undetermined"
  string11 = (f"{longer_group} utterances that were longer than the mean average length of {shorter_group} utterances, were shortened. \nAfter matching, fluent utterances had {round(fluent_c,2)} consonants per utterance, while disfluent utterances had {round(disfluent_c,2)} per utterance. \nThe difference is reduced to {abs(round(difference_c_matched,2))}, which is statistically {level} (p = {round(df_c_matched_test[1],2)}).")

  return(string11)

# **2. Match Across Utterance Types (Fluent vs. Disfluent)**

**2.1 Pre-Matching**

In [None]:
# add new column to dataframe that denotes participant's group membership
frog = assign_group(frog)
reading = assign_group(reading)

In [None]:
# account for case differences in annotation
for i in range (0, len(frog)):
  frog["FluencyStatus"][i] = frog["FluencyStatus"][i].lower().strip()
for i in range (0, len(reading)):
  reading["FluencyStatus"][i] = reading["FluencyStatus"][i].lower().strip()

In [None]:
# exclude control participants from further analysis
frog_pws = frog[frog["Group"] == "PWS"]
frog_pws.index = range(len(frog_pws.index))
reading_pws = reading[reading["Group"] == "PWS"]
reading_pws.index = range(len(reading_pws.index))

In [None]:
# count vowels per utterance
frog_vowels_pws, pre_frog_vowel_avg_across_BG = count_vowels(frog_pws, "frog")
reading_vowels_pws, pre_reading_vowel_avg_across_BG = count_vowels(reading_pws, "reading")

In [None]:
# count consonants per utterance
[frog_consonants_pws, pre_frog_consonant_avg_across_BG]  = count_consonants(frog_pws,'frog')
[reading_consonants_pws, pre_reading_consonants_avg_across_BG] = count_consonants(reading_pws,'reading')

In [None]:
#average # vowels per breath group for each participant
frog_participant_vowel_avg_pws = participant_vowel_avg(frog_vowels_pws,'frog')
reading_participant_vowel_avg_pws = participant_vowel_avg(reading_vowels_pws,'reading')

In [None]:
#average # consonants per breath group for each participant
frog_participant_cons_avg_pws = participant_consonant_avg(frog_consonants_pws,'frog')
reading_participant_cons_avg_pws = participant_consonant_avg(reading_consonants_pws,'reading')

In [None]:
#average # per breath group for each participant and fluency status in frog
if len(frog_participant_vowel_avg_pws) > 1:
  frog_participant_vowels_avg_pws_fluent = frog_participant_vowel_avg_pws[frog_participant_vowel_avg_pws["FluencyStatus"] == "Fluent"]
  frog_participant_vowels_avg_pws_disfluent = frog_participant_vowel_avg_pws[frog_participant_vowel_avg_pws["FluencyStatus"] == "Disfluent"]
  frog_participant_consonants_avg_pws_fluent = frog_participant_cons_avg_pws[frog_participant_cons_avg_pws["FluencyStatus"] == "Fluent"]
  frog_participant_consonants_avg_pws_disfluent = frog_participant_cons_avg_pws[frog_participant_cons_avg_pws["FluencyStatus"] == "Disfluent"]
else:
  frog_participant_vowels_avg_pws_fluent = pd.DataFrame()
  frog_participant_vowels_avg_pws_disfluent = pd.DataFrame()
  frog_participant_consonants_avg_pws_fluent = pd.DataFrame()
  frog_participant_consonants_avg_pws_disfluent = pd.DataFrame()

In [None]:
#average # per breath group for each participant and fluency status in reading
if len(reading_participant_vowel_avg_pws) > 1:
  reading_participant_vowels_avg_pws_fluent = reading_participant_vowel_avg_pws[reading_participant_vowel_avg_pws["FluencyStatus"] == "Fluent"]
  reading_participant_vowels_avg_pws_disfluent = reading_participant_vowel_avg_pws[reading_participant_vowel_avg_pws["FluencyStatus"] == "Disfluent"]
  reading_participant_consonants_avg_pws_fluent = reading_participant_cons_avg_pws[reading_participant_cons_avg_pws["FluencyStatus"] == "Fluent"]
  reading_participant_consonants_avg_pws_disfluent = reading_participant_cons_avg_pws[reading_participant_cons_avg_pws["FluencyStatus"] == "Disfluent"]
else:
  reading_participant_vowels_avg_pws_fluent = pd.DataFrame()
  reading_participant_vowels_avg_pws_disfluent = pd.DataFrame()
  reading_participant_consonants_avg_pws_fluent = pd.DataFrame()
  reading_participant_consonants_avg_pws_disfluent = pd.DataFrame()

In [None]:
# compare group averages of number of vowels per utterance and number of consonants per utterance
# frog
# Pre-matching comparison for the frog condition. Besides the four printable
# sentences this binds the shorter type's means, the absolute differences and
# the longer/shorter labels that the matching cells consume.
string1, string2, string3, string4, short_v_frog_pws, short_c_frog_pws, difference_v_frog_pws, difference_c_frog_pws, long_group_frog, short_group_frog = compare_groups(frog_participant_vowels_avg_pws_fluent, frog_participant_vowels_avg_pws_disfluent, frog_participant_consonants_avg_pws_fluent, frog_participant_consonants_avg_pws_disfluent)

print(string1)
print(string2)
print(string3)
print(string4)

PWS produced on average 6.66 syllables in fluent utterance and nan syllables in disflunt utterances.
This means that on average fluent utterances were nan syllables longer than disfluent utterances.

PWS produced on average 9.95 consonants in fluent utterance and nan consonants in disfluent utterances.
This means that on average fluent utterances were nan consonants longer than disfluent utterances.


In [None]:
# compare group averages of number of vowels per utterance and number of consonants per utterance
# reading
# Pre-matching comparison for the reading condition. Note that string1..string4
# are reused from the frog cell; the remaining names are reading-specific.
# When there is no data, the numeric/label results are empty lists.
string1, string2, string3, string4, short_v_read_pws, short_c_read_pws, difference_v_read_pws, difference_c_read_pws, long_group_read, short_group_read = compare_groups(reading_participant_vowels_avg_pws_fluent, reading_participant_vowels_avg_pws_disfluent, reading_participant_consonants_avg_pws_fluent, reading_participant_consonants_avg_pws_disfluent)

print(string1)
print(string2)
print(string3)
print(string4)

There is no data for this condition.





**2.2. Matching**

In [None]:
# match number of vowels per utterance by cutting the utterances of the longer
# fluency type by the difference in average vowel number between fluent and
# disfluent utterances
matched_vowels_read_pws = match_vowels(reading_vowels_pws, short_group_read,  difference_v_read_pws,'reading')
matched_vowels_frog_pws = match_vowels(frog_vowels_pws, short_group_frog, difference_v_frog_pws,'frog')

Not enough data in reading condition. Matching not possible.
Not enough data in frog condition. Matching not possible.


In [None]:
# match number of consonants per utterance by cutting the utterances of the
# longer fluency type by the difference in average consonant number between
# fluent and disfluent utterances
matched_consonants_read_pws = match_consonants(reading_consonants_pws, short_group_read, difference_c_read_pws,'reading')
matched_consonants_frog_pws = match_consonants(frog_consonants_pws, short_group_frog, difference_c_frog_pws,'frog')

Not enough data in reading condition. Matching not possible.
Not enough data in frog condition. Matching not possible.


In [None]:
def _is_missing(value):
  """True for compare_groups' "no data" placeholder (a list) or a NaN difference."""
  # isinstance instead of `value == []`: comparing a NumPy float with a list
  # yields an empty array, whose truth value is an error on NumPy >= 2.2.
  return isinstance(value, list) or bool(np.isnan(value))

# Replace missing differences/labels by printable defaults before reporting.
if _is_missing(difference_c_read_pws):
  difference_c_read_pws = 0
if _is_missing(difference_c_frog_pws):
  difference_c_frog_pws = 0
if _is_missing(difference_v_read_pws):
  difference_v_read_pws = 0
if _is_missing(difference_v_frog_pws):
  difference_v_frog_pws = 0
if long_group_read == []:
  long_group_read = "No"
if long_group_frog == []:
  long_group_frog = "No"

print(f"{long_group_read} utterances were cut by {round(difference_v_read_pws)} vowels and {round(difference_c_read_pws)} consonants in the reading condition.")
print(f"{long_group_frog} utterances were cut by {round(difference_v_frog_pws)} vowels and {round(difference_c_frog_pws)} consonants in the frog condition.")

No utterances were cut by 0 vowels and 0 consonants in the reading condition.
fluent utterances were cut by 0 vowels and 0 consonants in the frog condition.


**2.3. Post-Matching**

In [None]:
# combine the shortened rows (longer fluency type) with the untouched rows of
# the shorter fluency type
# frog
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: the matched_* names are rebound, so re-running this cell without
# re-running the matching cells adds the shorter type a second time.
if short_group_frog in ("fluent", "disfluent"):
  matched_vowels_frog_pws = pd.concat(
    [matched_vowels_frog_pws, frog_vowels_pws[frog_vowels_pws["FluencyStatus"] == short_group_frog]],
    ignore_index=True)
  matched_consonants_frog_pws = pd.concat(
    [matched_consonants_frog_pws, frog_consonants_pws[frog_consonants_pws["FluencyStatus"] == short_group_frog]],
    ignore_index=True)

In [None]:
# combine the shortened rows (longer fluency type) with the untouched rows of
# the shorter fluency type
# reading
# Fixes: the column key was misspelled ("FluencySTatus" / "FLuencyStatus"),
# which raises KeyError as soon as the branch runs; DataFrame.append was
# removed in pandas 2.0.
# NOTE: the *_read_pws names are rebound, so re-running this cell without
# re-running the matching cells adds the shorter type a second time.
if short_group_read in ("fluent", "disfluent"):
  matched_vowels_reading_pws = pd.concat(
    [matched_vowels_read_pws, reading_vowels_pws[reading_vowels_pws["FluencyStatus"] == short_group_read]],
    ignore_index=True)
  matched_consonants_reading_pws = pd.concat(
    [matched_consonants_read_pws, reading_consonants_pws[reading_consonants_pws["FluencyStatus"] == short_group_read]],
    ignore_index=True)
  # The post-matching cells read the *_read_pws names (as the frog cell rebinds
  # its *_frog_pws names), so point them at the combined frames as well;
  # otherwise the shorter fluency type is missing from the reading analysis.
  matched_vowels_read_pws = matched_vowels_reading_pws
  matched_consonants_read_pws = matched_consonants_reading_pws

In [None]:
# count vowels per utterance after matching
# (count_vowels discards the "Syllables" column carried over from the
# pre-matching pass and recomputes it on the shortened data)
[post_frog_vowels_pws, post_frog_vowel]  = count_vowels(matched_vowels_frog_pws,'frog')
[post_reading_vowels_pws, post_reading_vowel]  = count_vowels(matched_vowels_read_pws,'reading')

In [None]:
# average number of vowels per breath group for each participant after matching,
# reported separately for fluent and disfluent utterances
post_frog_participant_vowels_avg_pws = participant_vowel_avg(post_frog_vowels_pws, 'frog')
post_reading_participant_vowels_avg_pws = participant_vowel_avg(post_reading_vowels_pws,'reading')

In [None]:
# count consonants per utterance after matching
# (count_consonants discards the "Consonants" column carried over from the
# pre-matching pass and recomputes it on the shortened data)
[post_frog_consonants_pws, post_frog_consonant]  = count_consonants(matched_consonants_frog_pws, 'frog')
[post_reading_consonants_pws, post_reading_consonant]  = count_consonants(matched_consonants_read_pws,'reading')

In [None]:
# average number of consonants per breath group for each participant after matching,
# reported separately for fluent and disfluent utterances
post_frog_participant_cons_avg_pws = participant_consonant_avg(post_frog_consonants_pws, 'frog')
post_reading_participant_cons_avg_pws = participant_consonant_avg(post_reading_consonants_pws,'reading')

In [None]:
# average number per breath group for each participant and fluency status in frog
# (post-matching): split the participant-level tables into their "Fluent" and
# "Disfluent" rows, or fall back to four empty frames when the vowel table has
# at most one row.
if len(post_frog_participant_vowels_avg_pws) > 1:
  post_frog_participant_vowels_avg_pws_fluent = post_frog_participant_vowels_avg_pws[post_frog_participant_vowels_avg_pws["FluencyStatus"] == "Fluent"]
  post_frog_participant_vowels_avg_pws_disfluent = post_frog_participant_vowels_avg_pws[post_frog_participant_vowels_avg_pws["FluencyStatus"] == "Disfluent"]
  post_frog_participant_consonants_avg_pws_fluent = post_frog_participant_cons_avg_pws[post_frog_participant_cons_avg_pws["FluencyStatus"] == "Fluent"]
  post_frog_participant_consonants_avg_pws_disfluent = post_frog_participant_cons_avg_pws[post_frog_participant_cons_avg_pws["FluencyStatus"] == "Disfluent"]
else:
  post_frog_participant_vowels_avg_pws_fluent = pd.DataFrame()
  post_frog_participant_vowels_avg_pws_disfluent = pd.DataFrame()
  post_frog_participant_consonants_avg_pws_fluent = pd.DataFrame()
  post_frog_participant_consonants_avg_pws_disfluent = pd.DataFrame()

In [None]:
# Reading condition: split the per-participant averages (vowels and consonants)
# into "Fluent" and "Disfluent" subsets.
# The row count of the vowel-average frame alone decides the branch: with at
# most one row, all four subsets become empty DataFrames.
if len(post_reading_participant_vowels_avg_pws) <= 1:
  post_reading_participant_vowels_avg_pws_fluent = pd.DataFrame()
  post_reading_participant_vowels_avg_pws_disfluent = pd.DataFrame()
  post_reading_participant_consonants_avg_pws_fluent = pd.DataFrame()
  post_reading_participant_consonants_avg_pws_disfluent = pd.DataFrame()
else:
  post_reading_participant_vowels_avg_pws_fluent = post_reading_participant_vowels_avg_pws.loc[
      post_reading_participant_vowels_avg_pws["FluencyStatus"] == "Fluent"
  ]
  post_reading_participant_vowels_avg_pws_disfluent = post_reading_participant_vowels_avg_pws.loc[
      post_reading_participant_vowels_avg_pws["FluencyStatus"] == "Disfluent"
  ]
  post_reading_participant_consonants_avg_pws_fluent = post_reading_participant_cons_avg_pws.loc[
      post_reading_participant_cons_avg_pws["FluencyStatus"] == "Fluent"
  ]
  post_reading_participant_consonants_avg_pws_disfluent = post_reading_participant_cons_avg_pws.loc[
      post_reading_participant_cons_avg_pws["FluencyStatus"] == "Disfluent"
  ]

In [None]:
# Frog condition: compare fluent vs. disfluent group averages for the number of
# vowels and the number of consonants per utterance.
# compare_groups returns ten values: four report strings followed by six
# results that are bound to frog-specific names.
(
    string1, string2, string3, string4,
    short_v_frog_pws, short_c_frog_pws,
    difference_v_frog_pws, difference_c_frog_pws,
    long_group_frog, short_group_frog,
) = compare_groups(
    post_frog_participant_vowels_avg_pws_fluent,
    post_frog_participant_vowels_avg_pws_disfluent,
    post_frog_participant_consonants_avg_pws_fluent,
    post_frog_participant_consonants_avg_pws_disfluent,
)

# One report string per output line.
print(string1, string2, string3, string4, sep="\n")

PWS produced on average 6.66 syllables in fluent utterance and nan syllables in disflunt utterances.
This means that on average fluent utterances were nan syllables longer than disfluent utterances.

PWS produced on average 9.95 consonants in fluent utterance and nan consonants in disfluent utterances.
This means that on average fluent utterances were nan consonants longer than disfluent utterances.


In [None]:
# Reading condition: compare fluent vs. disfluent group averages for the number
# of vowels and the number of consonants per utterance.
# compare_groups returns ten values: four report strings followed by six
# results that are bound to reading-specific names.
(
    string1, string2, string3, string4,
    short_v_read_pws, short_c_read_pws,
    difference_v_read_pws, difference_c_read_pws,
    long_group_read, short_group_read,
) = compare_groups(
    post_reading_participant_vowels_avg_pws_fluent,
    post_reading_participant_vowels_avg_pws_disfluent,
    post_reading_participant_consonants_avg_pws_fluent,
    post_reading_participant_consonants_avg_pws_disfluent,
)

# One report string per output line.
print(string1, string2, string3, string4, sep="\n")

There is no data for this condition.





# **3. Significance After Matching**

In [None]:
# test if average number of vowels per breathgroup is still significantly different between utterance types after matching
# frog
# short_group_frog is the last value returned by compare_groups for the frog condition.
# NOTE(review): string1 is also assigned by the compare_groups cells and by the
# consonant t-test cell, so its value depends on which cell ran last.
string1 = ttest_vowelDifference(post_frog_participant_vowels_avg_pws, short_group_frog)
print(string1)

In [None]:
# test if average number of vowels per breathgroup is still significantly different between utterance types after matching
# reading
# short_group_read is the last value returned by compare_groups for the reading condition.
# NOTE(review): string2 is also assigned by the compare_groups cells and by the
# consonant t-test cell, so its value depends on which cell ran last.
string2 = ttest_vowelDifference(post_reading_participant_vowels_avg_pws, short_group_read)
print(string2)

There is no data for this condition.


In [None]:
# test if average number of consonants per breathgroup is still significantly different between utterance types after matching
# frog
# short_group_frog is the last value returned by compare_groups for the frog condition.
# NOTE(review): this overwrites string1 from the vowel t-test cell.
string1 = ttest_conDifference(post_frog_participant_cons_avg_pws, short_group_frog)
print(string1)

In [None]:
# test if average number of consonants per breathgroup is still significantly different between utterance types after matching
# reading
# short_group_read is the last value returned by compare_groups for the reading condition.
# NOTE(review): this overwrites string2 from the vowel t-test cell.
string2 = ttest_conDifference(post_reading_participant_cons_avg_pws, short_group_read)
print(string2)

There is no data for this condition.


# **5. Count Utterances in Each Category**

In [None]:
# Subset the post-matching vowel-count frames by fluency status, per condition.
# NOTE(review): the labels here are lowercase ("fluent"/"disfluent"), whereas the
# per-participant average frames are filtered on "Fluent"/"Disfluent" — confirm
# both spellings are what the respective helpers produce.
pws_frog_fluent = post_frog_vowels_pws.loc[
    post_frog_vowels_pws["FluencyStatus"] == "fluent"
]
pws_frog_disfluent = post_frog_vowels_pws.loc[
    post_frog_vowels_pws["FluencyStatus"] == "disfluent"
]
pws_read_fluent = post_reading_vowels_pws.loc[
    post_reading_vowels_pws["FluencyStatus"] == "fluent"
]
pws_read_disfluent = post_reading_vowels_pws.loc[
    post_reading_vowels_pws["FluencyStatus"] == "disfluent"
]

In [None]:
# Count the utterances in each condition/fluency subset with countUtterances.
# The four results are interpolated into the summary printed in the next cell.
number_reading_pws_disfluent = countUtterances(pws_read_disfluent)
number_frog_pws_disfluent = countUtterances(pws_frog_disfluent)
number_reading_pws_fluent = countUtterances(pws_read_fluent)
number_frog_pws_fluent = countUtterances(pws_frog_fluent)

In [None]:
# Report the utterance counts per condition and fluency status.
# The leading "\n" in the second message inserts a blank line between the two sentences.
print(f"There are {number_reading_pws_fluent} fluent utterances in the reading condition and {number_reading_pws_disfluent} disfluent utterances.")
print(f"\nThere are {number_frog_pws_fluent} fluent utterances in the frog condition and {number_frog_pws_disfluent} disfluent utterances.")

There are 0 fluent utterances in the reading condition and 0 disfluent utterances.

There are 41 fluent utterances in the frog condition and 0 disfluent utterances.


# **6. Save**

In [None]:
# Change the working directory to the Duration_Metrics project folder on the mounted Drive;
# the output folder in the next cell is created relative to it.
%cd /content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics

/content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics


In [None]:
# Output folder for the matched data, relative to the current working directory.
# Bound to `output_dir` rather than `dir` so the built-in dir() is not shadowed
# for the rest of the kernel session.
output_dir = "3.MLU_Matched"

# exist_ok=True makes the cell safe to re-run and replaces the separate
# existence check, so there is no gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Enter the output folder so the relative file names used by the export cell below are written there.
%cd /content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics/3.MLU_Matched/

/content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics/3.MLU_Matched


In [None]:
# Write the four post-matching count frames to Excel files in the current
# working directory. Files are written in the order listed; to_excel is called
# with its defaults, so the frame index is written as well.
_export_targets = {
    "fluencyMatched_vowels_reading_pws.xlsx": post_reading_vowels_pws,
    "fluencyMatched_consonants_reading_pws.xlsx": post_reading_consonants_pws,
    "fluencyMatched_vowels_frog_pws.xlsx": post_frog_vowels_pws,
    "fluencyMatched_consonants_frog_pws.xlsx": post_frog_consonants_pws,
}
for _export_name, _export_frame in _export_targets.items():
  _export_frame.to_excel(_export_name)