<a href="https://colab.research.google.com/github/Janina712/RhythmMetrics_Duration/blob/main/3_1_Match_Nothing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**0. Imports & Set-Up**

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import random as random
import os
import math
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive into the Colab runtime so that the project files become
# reachable under /content/gdrive. This import only exists inside Google Colab.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Make the project folder on the mounted Drive the working directory; the
# relative file names used by the read_excel calls below resolve against it.
%cd /content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics/

/content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics


In [None]:
# Load one table per speaking condition ("reading" and "frog").
# The code further down reads the columns "ID", "Type", "Breath.Group",
# "FluencyStatus" and drops an "Unnamed: 0" column from these tables.
reading = pd.read_excel("reading_TextGrid_comb_BG_loop.xlsx")
frog = pd.read_excel("frog_TextGrid_comb_BG_loop.xlsx")

In [None]:
# IDs by condition
# Participants to process per condition; the counting/averaging functions pick
# IDs_frog when called with condition == "frog" and IDs_reading otherwise.
IDs_reading = ['24fa']
IDs_frog = ['24fa']

In [None]:
# IDs by group
# Any ID that is not listed in control_IDs is labelled "PWS" by the functions below.
pws_IDs = ['24fa']
control_IDs = []

# combine both groups (PWS first, then controls) into a one-column table.
# Built from the two lists above so the participant table cannot drift out of
# sync with the group definitions.
IDs_col = pd.DataFrame({'ID': pws_IDs + control_IDs})

**1. Define Functions**

**1.1 Assign Participant Group**

In [None]:
def assign_group(df):
  """Label every row with its participant group and drop silent intervals.

  Parameters
  ----------
  df : pandas.DataFrame
      Table with at least the columns "ID" and "Type". An "Unnamed: 0" column
      is removed when present.

  Returns
  -------
  pandas.DataFrame
      New frame with a leading "Group" column ("Control" when the row's ID is
      listed in the module-level `control_IDs`, otherwise "PWS"), without the
      rows whose Type is "silence", re-indexed 0..n-1.
  """
  # Vectorised membership test: labels every row (the previous index loop
  # stopped one row early) and does not depend on the index labels of `df`,
  # so gaps left by an earlier dropna() are harmless.
  is_control = df["ID"].isin(control_IDs)
  group = pd.DataFrame({"Group": np.where(is_control, "Control", "PWS")}, index=df.index)

  # Both frames share df.index, so the column-wise concat stays row-aligned.
  df_out = pd.concat([group, df], axis=1)
  df_out = df_out[df_out.Type != "silence"].reset_index(drop=True)

  # errors='ignore': tolerate inputs that do not carry the helper column.
  df_out = df_out.drop(columns=['Unnamed: 0'], errors='ignore')
  return df_out

**1.2 Count Vowels**

In [None]:
def count_vowels(df, condition):
  """Attach to every vowel row the number of vowels in its breath group.

  Parameters
  ----------
  df : pandas.DataFrame
      Table with at least the columns "ID", "Type", "Breath.Group" and "Group".
  condition : str
      "frog" selects the participants in `IDs_frog`; any other value selects
      `IDs_reading`.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      * the vowel rows of `df` (re-indexed 0..n-1) with a new "Syllables"
        column holding the vowel count of the row's (ID, Breath.Group); rows of
        participants that are not selected for `condition` get NaN. A
        pre-existing "Syllables" column is kept as "Unmached_Vowels".
      * the per-"Group" mean of the numeric columns of that frame. Because each
        breath group's count appears once per vowel, this mean weights a breath
        group by its own size.
  """
  df_vowels = df[df["Type"] == "vowel"].reset_index(drop=True)

  # reading or interview condition
  IDs_here = IDs_frog if condition == "frog" else IDs_reading

  # keep any pre-existing column out of the way of the new count column
  df_vowels = df_vowels.rename(columns={'Syllables': 'Unmached_Vowels'})

  # Count per (participant, breath group) and broadcast the count back onto the
  # rows. transform() keeps row alignment, so the result does not depend on how
  # the rows are ordered, and no frame has to be grown piece by piece.
  per_bg_count = df_vowels.groupby(["ID", "Breath.Group"])["Type"].transform("size")
  df_vowels["Syllables"] = per_bg_count.where(df_vowels["ID"].isin(IDs_here))

  # numeric_only=True: string columns (e.g. "ID", "Type") cannot be averaged
  pre_df_vowel_avg = df_vowels.groupby("Group").mean(numeric_only=True)

  return (df_vowels, pre_df_vowel_avg)

**1.3 Count Consonants**

In [None]:
def count_consonants(df, condition):
  """Attach to every consonant row the number of consonants in its breath group.

  Parameters
  ----------
  df : pandas.DataFrame
      Table with at least the columns "ID", "Type", "Breath.Group" and "Group".
  condition : str
      "frog" selects the participants in `IDs_frog`; any other value selects
      `IDs_reading`.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      * the consonant rows of `df` (re-indexed 0..n-1) with a new "Consonants"
        column holding the consonant count of the row's (ID, Breath.Group);
        rows of participants that are not selected for `condition` get NaN. A
        pre-existing "Consonants" column is kept as "Unmatched_Cons".
      * the per-"Group" mean of the numeric columns of that frame. Because each
        breath group's count appears once per consonant, this mean weights a
        breath group by its own size.
  """
  df_consonants = df[df["Type"] == "consonant"].reset_index(drop=True)

  # reading or interview condition
  IDs_here = IDs_frog if condition == "frog" else IDs_reading

  # keep any pre-existing column out of the way of the new count column
  df_consonants = df_consonants.rename(columns={'Consonants': 'Unmatched_Cons'})

  # Count per (participant, breath group) and broadcast the count back onto the
  # rows. transform() keeps row alignment, so the result does not depend on how
  # the rows are ordered, and no frame has to be grown piece by piece.
  per_bg_count = df_consonants.groupby(["ID", "Breath.Group"])["Type"].transform("size")
  df_consonants["Consonants"] = per_bg_count.where(df_consonants["ID"].isin(IDs_here))

  # numeric_only=True: string columns (e.g. "ID", "Type") cannot be averaged
  pre_df_consonant_avg = df_consonants.groupby("Group").mean(numeric_only=True)

  return (df_consonants, pre_df_consonant_avg)

**1.4. Average Vowel Count Per Participant**

In [None]:
def participant_vowel_avg(df, condition):
  """Average vowel count per breath group for every participant in `IDs_col`.

  Parameters
  ----------
  df : pandas.DataFrame
      Vowel-level table with the columns "ID", "Breath.Group" and "Syllables"
      (the first frame returned by `count_vowels`).
  condition : str
      "frog" selects the participants in `IDs_frog`; any other value selects
      `IDs_reading`.

  Returns
  -------
  pandas.DataFrame
      One row per entry of `IDs_col` with the columns "Group" ("Control" if the
      ID is in `control_IDs`, else "PWS"), "ID" and "Syllables". "Syllables" is
      the mean over breath groups of the per-breath-group mean, or NaN when the
      participant is not selected for `condition` or has no rows in `df`.
  """
  # reading or interview condition
  IDs_here = IDs_frog if condition == "frog" else IDs_reading

  participant_ids = IDs_col["ID"]

  # Group membership is decided from the participant table itself; rows of the
  # interval table `df` are not participants and must not be used for this.
  group_col = pd.DataFrame(
      {"Group": np.where(participant_ids.isin(control_IDs), "Control", "PWS")},
      index=IDs_col.index)

  # mean per (participant, breath group), then mean of those per participant
  per_bg = df.groupby(["ID", "Breath.Group"])["Syllables"].mean()
  per_id = per_bg.groupby(level="ID").mean()

  # Look each participant up by ID so the value lands in the matching row,
  # whatever the order or length of IDs_here.
  averages = [per_id.get(pid, np.nan) if pid in IDs_here else np.nan
              for pid in participant_ids]
  avg_col = pd.DataFrame({"Syllables": averages}, index=IDs_col.index, dtype=float)

  df_participant_vowel_avg = pd.concat([group_col, IDs_col, avg_col], axis=1)

  return df_participant_vowel_avg

**1.5. Average Consonant Count Per Participant**

In [None]:
def participant_consonant_avg(df, condition):
  """Average consonant count per breath group for every participant in `IDs_col`.

  Parameters
  ----------
  df : pandas.DataFrame
      Consonant-level table with the columns "ID", "Breath.Group" and
      "Consonants" (the first frame returned by `count_consonants`).
  condition : str
      "frog" selects the participants in `IDs_frog`; any other value selects
      `IDs_reading`.

  Returns
  -------
  pandas.DataFrame
      One row per entry of `IDs_col` with the columns "Group" ("Control" if the
      ID is in `control_IDs`, else "PWS"), "ID" and "Consonants". "Consonants"
      is the mean over breath groups of the per-breath-group mean, or NaN when
      the participant is not selected for `condition` or has no rows in `df`.
  """
  # reading or interview condition
  IDs_here = IDs_frog if condition == "frog" else IDs_reading

  participant_ids = IDs_col["ID"]

  # Group membership is decided from the participant table itself; rows of the
  # interval table `df` are not participants and must not be used for this.
  group_col = pd.DataFrame(
      {"Group": np.where(participant_ids.isin(control_IDs), "Control", "PWS")},
      index=IDs_col.index)

  # mean per (participant, breath group), then mean of those per participant
  per_bg = df.groupby(["ID", "Breath.Group"])["Consonants"].mean()
  per_id = per_bg.groupby(level="ID").mean()

  # Look each participant up by ID so the value lands in the matching row,
  # whatever the order or length of IDs_here.
  averages = [per_id.get(pid, np.nan) if pid in IDs_here else np.nan
              for pid in participant_ids]
  avg_col = pd.DataFrame({"Consonants": averages}, index=IDs_col.index, dtype=float)

  df_participant_cons_avg = pd.concat([group_col, IDs_col, avg_col], axis=1)

  return df_participant_cons_avg

**1.6. Compare Consonant and Vowel Counts Across Groups**

In [None]:
def _mean_over_participants(df, column, ids):
  """Mean of the per-participant means of `column`, restricted to `ids` (NaN if none match)."""
  values = pd.to_numeric(df[column], errors="coerce")
  per_id = values.groupby(df["ID"]).mean()
  return per_id[per_id.index.isin(ids)].mean()


def compare_groups(df_vowels, df_consonants):
  """Compare the two groups' average vowel and consonant counts per utterance.

  Parameters
  ----------
  df_vowels : pandas.DataFrame
      Participant-level table with the columns "ID" and "Syllables".
  df_consonants : pandas.DataFrame
      Participant-level table with the columns "ID" and "Consonants".

  Returns
  -------
  tuple
      (string1, string2, string3, string4, pws_v, pws_c, difference_v,
      difference_c): four summary sentences, the PWS means for vowels and
      consonants, and the control-minus-PWS differences. A group without
      participants yields NaN.
  """
  # Participants are selected by ID membership in pws_IDs / control_IDs.
  # groupby() sorts its result by ID, so slicing that result by position would
  # only be correct if all PWS IDs happened to sort before all control IDs.
  control_v = _mean_over_participants(df_vowels, "Syllables", control_IDs)
  pws_v = _mean_over_participants(df_vowels, "Syllables", pws_IDs)
  difference_v = control_v - pws_v

  control_c = _mean_over_participants(df_consonants, "Consonants", control_IDs)
  pws_c = _mean_over_participants(df_consonants, "Consonants", pws_IDs)
  difference_c = control_c - pws_c

  string1 = (f"PWS produced on average {round(pws_v,2)} syllables per utterance, while control participants produced {round(control_v,2)} syllables on average.")
  string2 = (f"This means that on average control participants produced {round(difference_v,2)} syllables more per utterance.")
  string3 = (f"\nPWS produced on average {round(pws_c,2)} consonants per utterance, while control participants produced {round(control_c,2)} consonants on average.")
  string4 = (f"This means that on average control participants produced {round(difference_c,2)} consonants more per utterance.")

  return(string1, string2, string3, string4, pws_v, pws_c, difference_v, difference_c)

**2. Prepare DataFrame**

In [None]:
# add new column to dataframe that denotes participant's group membership
# dropna() leaves gaps in the index; reset it so that the row-by-row handling
# inside assign_group sees a contiguous 0..n-1 index.
# NOTE: this cell overwrites `frog` / `reading`; re-run the loading cell before
# running it a second time.
frog = assign_group(frog.dropna().reset_index(drop=True))
reading = assign_group(reading.dropna().reset_index(drop=True))

In [None]:
# account for case differences (and stray whitespace) in annotation
# Vectorised string methods replace the row-by-row chained assignment
# (frame[col][i] = ...), which is not guaranteed to write back into the frame.
frog["FluencyStatus"] = frog["FluencyStatus"].str.lower().str.strip()
reading["FluencyStatus"] = reading["FluencyStatus"].str.lower().str.strip()

In [None]:
# keep only the rows annotated as fluent and renumber them from 0
frog_fluent = frog.loc[frog["FluencyStatus"] == "fluent"].reset_index(drop=True)
reading_fluent = reading.loc[reading["FluencyStatus"] == "fluent"].reset_index(drop=True)

In [None]:
# vowel counts per utterance: row-level table plus per-group averages
frog_vowels_fluent, pre_frog_vowel_avg_fluent = count_vowels(df=frog_fluent, condition="frog")
reading_vowels_fluent, pre_reading_vowel_avg_fluent = count_vowels(df=reading_fluent, condition="reading")

In [None]:
# consonant counts per utterance: row-level table plus per-group averages
frog_consonants_fluent, pre_frog_consonant_avg_fluent = count_consonants(df=frog_fluent, condition='frog')
reading_consonants_fluent, pre_reading_consonants_avg_fluent = count_consonants(df=reading_fluent, condition='reading')

In [None]:
# per participant: average number of vowels per breath group
frog_participant_vowel_avg_fluent = participant_vowel_avg(
    df=frog_vowels_fluent, condition='frog')
reading_participant_vowel_avg_fluent = participant_vowel_avg(
    df=reading_vowels_fluent, condition='reading')

In [None]:
# per participant: average number of consonants per breath group
frog_participant_cons_avg_fluent = participant_consonant_avg(
    df=frog_consonants_fluent, condition='frog')
reading_participant_cons_avg_fluent = participant_consonant_avg(
    df=reading_consonants_fluent, condition='reading')

In [None]:
# frog condition: group comparison of vowels and consonants per utterance
(string1, string2, string3, string4,
 pws_v_frog_fluent, pws_c_frog_fluent,
 difference_v_frog_fluent, difference_c_frog_fluent) = compare_groups(
    df_vowels=frog_participant_vowel_avg_fluent,
    df_consonants=frog_participant_cons_avg_fluent)

# print the four summary sentences in order
for summary_line in (string1, string2, string3, string4):
  print(summary_line)

PWS produced on average 8.91 syllables per utterance, while control participants produced nan syllables on average.
This means that on average control participants produced nan syllables more per utterance.

PWS produced on average 13.97 consonants per utterance, while control participants produced nan consonants on average.
This means that on average control participants produced nan consonants more per utterance.


In [None]:
# reading condition: group comparison of vowels and consonants per utterance
(string5, string6, string7, string8,
 pws_v_read_fluent, pws_c_read_fluent,
 difference_v_read_fluent, difference_c_read_fluent) = compare_groups(
    df_vowels=reading_participant_vowel_avg_fluent,
    df_consonants=reading_participant_cons_avg_fluent)

# print the four summary sentences in order
for summary_line in (string5, string6, string7, string8):
  print(summary_line)

PWS produced on average 6.5 syllables per utterance, while control participants produced nan syllables on average.
This means that on average control participants produced nan syllables more per utterance.

PWS produced on average 9.5 consonants per utterance, while control participants produced nan consonants on average.
This means that on average control participants produced nan consonants more per utterance.


**3. Create Dataframe**

In [None]:
# No matching is applied here: the "matched" tables are the unmatched ones.
# Take copies so that later edits to the matched tables cannot silently change
# the unmatched tables as well (plain assignment would only create aliases).
reading_vowels_matched_fluent = reading_vowels_fluent.copy()
frog_vowels_matched_fluent = frog_vowels_fluent.copy()
reading_consonants_matched_fluent = reading_consonants_fluent.copy()
frog_consonants_matched_fluent = frog_consonants_fluent.copy()

In [None]:
# When a condition has no participants, make sure the count columns still exist
# in the exported tables. A scalar NaN broadcasts to any number of rows, whereas
# assigning an empty list raises a length-mismatch error on a non-empty frame.
if len(IDs_reading) == 0:
  reading_vowels_matched_fluent['Syllables'] = np.nan
  reading_consonants_matched_fluent['Consonants'] = np.nan
if len(IDs_frog) == 0:
  frog_vowels_matched_fluent['Syllables'] = np.nan
  frog_consonants_matched_fluent['Consonants'] = np.nan

**4. Save**

In [None]:
# Return to the project folder so the output directory below is created in it.
%cd /content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics

In [None]:
# Name of the output folder, relative to the current working directory.
# NOTE(review): `dir` shadows the built-in dir(); the name is kept in case
# other cells refer to it.
dir = "3.MLU_Matched"

# Create the folder if it is missing. exist_ok=True makes this a single call
# without a separate existence check, and it raises if the path exists but is
# not a directory instead of silently continuing.
os.makedirs(dir, exist_ok=True)

In [None]:
# Switch into the output folder; the exports below use bare file names.
%cd /content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics/3.MLU_Matched/

/content/gdrive/MyDrive/ATAS_Plus/Duration_Metrics/3.MLU_Matched


In [None]:
# write each table to its own Excel workbook in the current working directory
exports = {
    "matchedVowels_reading_FLUENT.xlsx": reading_vowels_matched_fluent,
    "matchedConsonants_reading_FLUENT.xlsx": reading_consonants_matched_fluent,
    "matchedVowels_frog_FLUENT.xlsx": frog_vowels_matched_fluent,
    "matchedConsonants_frog_FLUENT.xlsx": frog_consonants_matched_fluent,
}
for filename, table in exports.items():
  table.to_excel(filename)