In [None]:
import nltk
import pylangacq as pla #program that reads the .cha files
import pandas as pd #will organize the data into an early readable DataFrame
import os #reads file directory paths

CHA_DIR = "/Users/kennedycameron/Downloads/Hoff/"

files = [os.path.join(CHA_DIR, f) for f in os.listdir(CHA_DIR) if f.endswith(".cha")]
#go to the folder that contains the cha files and collect them

utterances = []
for f in files:
    chat = pla.read_chat(f)
    name = os.path.basename(f)
    # loop through the files

    group = "monolingual" if "(1)" in name else "bilingual"
    #seperating monolingual and bilingual cha files

    for utt in chat.utterances(participants="CHI"):
        lang = getattr(utt, "language", None)
        if lang is None or lang == "eng":  
            text = " ".join(tok.word for tok in utt.tokens)
    #pay attention to when the child is speaking, check if they are speaking english
            utterances.append({
                "file": name,
                "group": group,
                "utterance": text
            })
            # storing each usuable English utterance in a list


df = pd.DataFrame(utterances)
#convert that list to a dataframe for easy viewing

#shows how much data we have captures and the head of the dataframe itself
print("Total English utterances:", len(df))
print(df.head())


Total English utterances: 11843
         file        group utterance
0  028(1).cha  monolingual  better .
1  028(1).cha  monolingual   uhhuh !
2  028(1).cha  monolingual   uhhuh .
3  028(1).cha  monolingual    mmhm .
4  028(1).cha  monolingual         .


In [None]:
import spacy
#spaCy's English language model provides part of speech tagging and depenceny parsing so I used that to detect subejects within sentences.
nlp = spacy.load("en_core_web_sm")

#This function analyzes each utterance by
def analyze_subject(utt):
    doc = nlp(utt)

    has_finite_verb = False
    has_subject = False

#(a) checking for a finite verb
    for token in doc:
        if token.pos_ in {"VERB", "AUX"} and token.morph.get("VerbForm") != ["Inf"]:
            has_finite_verb = True
#(b) checking for a grammatical subject
        if token.dep_ in {"nsubj", "nsubj:pass"}:
            has_subject = True
#(c) reporting the findings
    return pd.Series({
        "has_finite_verb": has_finite_verb,
        "has_subject": has_subject
    })

#This function is applied to every utterance
subject_info = df["utterance"].apply(analyze_subject)
df_analysis = pd.concat([df, subject_info], axis=1)


In [44]:
df_analysis = df_analysis.loc[:, ~df_analysis.columns.duplicated()]

print(df_analysis.dtypes)


file               object
group              object
utterance          object
has_finite_verb      bool
has_subject          bool
dtype: object


In [45]:
finite_df = df_analysis[df_analysis["has_finite_verb"]].copy()
finite_df["subject_omitted"] = ~finite_df["has_subject"]

print("Finite clauses:", len(finite_df))

summary = finite_df.groupby("group")["subject_omitted"].agg(
    total_clauses="count",
    omissions="sum"
)
summary["omission_rate"] = summary["omissions"] / summary["total_clauses"]

print("\nSummary stats:")
print(summary)

Finite clauses: 2146

Summary stats:
             total_clauses  omissions  omission_rate
group                                               
bilingual              760        131       0.172368
monolingual           1386         93       0.067100
