In [2]:
import nltk
import re
import os
from nltk.corpus.reader import CHILDESCorpusReader
from nltk import FreqDist, word_tokenize, bigrams, ConditionalFreqDist
import pylangacq
import pandas as pd

In [None]:
# I had to install pylangacq like this for it to work in Jupyter
#import sys
#!"$sys.executable" -m pip install pylangacq

pylangacq source

@TechReport{lee-et-al-pylangacq:2016,
   Title       = {Working with CHAT transcripts in Python},
   Author      = {Lee, Jackson L. and Burkholder, Ross and Flinn, Gallagher B. and Coppess, Emily R.},
   Institution = {Department of Computer Science, University of Chicago},
   Year        = {2016},
   Number      = {TR-2016-02},
}

In [3]:
# # # Set Up

# Location of the Hoff corpus folder. Set the HOFF_DIR environment variable to
# point at your own copy; otherwise the original local path is used.
HOFF_DIR = os.environ.get(
    'HOFF_DIR',
    '/Users/baeddanhemphill/Desktop/Ling_401/Homeworks/Hoff',
)

# Create the hoff reader
hre = pylangacq.read_chat(HOFF_DIR)
# Only the last expression of a cell is echoed, so print the file count explicitly.
print(f"Loaded {hre.n_files()} files")

# Separate by eng,span,mono
# The metadata for age 2.5 is missing dependent tier data, so focus only on ages 3/3.5
# .filter(keep, remove)
# NOTE(review): both arguments appear to be treated as regular expressions by
# pylangacq, so the '.' in '2.5' would match any character -- confirm this is
# acceptable for the corpus' file naming.
eng_files = hre.filter('biling-eng', '2.5')
span_files = hre.filter('biling-spa', '2.5')
mono_files = hre.filter('mono', '2.5')


# Define pronouns (lower-case subject pronouns, split by number)
eng_pnns_sg = {'i', 'you', 'he', 'she', 'it'}
eng_pnns_pl = {'we', 'they'}
# 'él' was previously stored as the mis-decoded byte sequence 'Ã©l', which can
# never match a correctly decoded token.
span_pnns_sg = {'yo', 'tu', 'él', 'ella', 'usted'}
span_pnns_pl = {'nosotros', 'nosotras', 'ellos', 'ellas', 'ustedes'}

In [None]:
# Check specific tier contents: collect every distinct POS tag and the leading
# part of every morphology string seen in the corpus, in a single pass.
pos_set = set()
mor_set = set()
for utt in hre.utterances():
    for tok in utt.tokens:
        if tok.pos:
            pos_set.add(tok.pos)
        if tok.mor:
            # Text before the first "|" (the whole string when there is no "|").
            mor_set.add(tok.mor.split("|")[0])

# Only a cell's last expression is echoed, so the POS tags must be printed
# explicitly; previously they were computed but never shown.
print("POS tags:", sorted(pos_set))
mor_set

In [None]:
def corpus_to_chi_df_v1(corpus):
    """Build a table of CHI utterances that contain a verb or auxiliary.

    Parameters
    ----------
    corpus : reader object
        Must provide ``file_paths()``, ``filter(pattern)``, ``ages('CHI')`` and
        ``utterances(participants='CHI')``; each utterance needs ``tokens``
        whose items expose ``word`` and ``pos``.

    Returns
    -------
    pandas.DataFrame
        One row per kept utterance with the columns ``age`` (first CHI age
        reported for the utterance's file, or ``None``) and ``utterance``
        (the token words joined by single spaces). The two columns are present
        even when no utterance qualifies.
    """
    columns = ["age", "utterance"]
    rows = []
    for file_path in corpus.file_paths():

        # filter() treats its argument as a regular expression, so the path is
        # escaped: otherwise characters such as "(", "+" or "\" in a folder
        # name would make the file fail to match itself (or raise).
        single_file_reader = corpus.filter(re.escape(file_path))

        # Get ages for CHI in this file
        chi_ages = single_file_reader.ages('CHI')
        age = chi_ages[0] if chi_ages else None

        # Iterate over CHI utterances
        for utt in single_file_reader.utterances(participants='CHI'):

            # Only include the utterances containing a verb
            if not any(tok.pos in ("verb", "aux") for tok in utt.tokens):
                continue

            text = " ".join(tok.word for tok in utt.tokens).strip()
            rows.append({
                "age": age,
                "utterance": text,
            })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# POS tags that count as "verbal" for this analysis.
_VERBAL_POS = ("verb", "aux")

# Column order of the table returned by corpus_to_chi_df_v2.
_CHI_DF_V2_COLUMNS = ["age", "utterance", "pronoun_before_verb", "pron_verb_bigram"]


def _preverbal_pronoun(pos_tags, words):
    """Scan up to the first verb/aux and report any pronoun seen before it.

    Returns ``(found, bigram)``: ``found`` is True when a "pron" tag occurs
    before the first verbal tag; ``bigram`` is ``(pronoun, verb)`` when such a
    pronoun immediately precedes a verbal token, otherwise ``None``.
    """
    seen_pronoun = False
    for i, tag in enumerate(pos_tags):
        if tag in _VERBAL_POS:
            # First verb reached: nothing after it is relevant.
            return seen_pronoun, None
        if tag == "pron":
            seen_pronoun = True
            if i + 1 < len(pos_tags) and pos_tags[i + 1] in _VERBAL_POS:
                return True, (words[i], words[i + 1])
    return seen_pronoun, None


def corpus_to_chi_df_v2(corpus):
    """Build a table of verb-bearing CHI utterances with pronoun-before-verb info.

    Utterances are skipped when they contain no "verb"/"aux" token, or when
    the file path marks the file as ``biling-eng`` / ``biling-spa`` and the CHI
    tier text carries the other language's marker (``[- spa]`` / ``[- eng]``),
    i.e. code-switched utterances are dropped from the bilingual groups.

    Parameters
    ----------
    corpus : reader object
        Must provide ``file_paths()``, ``filter(pattern)``, ``ages('CHI')`` and
        ``utterances(participants='CHI')``; each utterance needs ``tokens``
        (items with ``word`` and ``pos``) and a ``tiers`` mapping with a
        ``'CHI'`` entry.

    Returns
    -------
    pandas.DataFrame
        One row per kept utterance with the columns ``age``, ``utterance``,
        ``pronoun_before_verb`` (bool) and ``pron_verb_bigram`` (a
        ``(pronoun, verb)`` tuple or ``None``). The columns are present even
        when no utterance qualifies.
    """
    rows = []
    for file_path in corpus.file_paths():

        # filter() treats its argument as a regular expression, so the path is
        # escaped: otherwise characters such as "(", "+" or "\" in a folder
        # name would make the file fail to match itself (or raise).
        single_file_reader = corpus.filter(re.escape(file_path))

        # Get ages for CHI in this file
        chi_ages = single_file_reader.ages('CHI')
        age = chi_ages[0] if chi_ages else None

        # Determine language filter based on file path.
        # This will filter out code switching in target language.
        if "biling-eng" in file_path:
            exclude_marker = "[- spa]"
        elif "biling-spa" in file_path:
            exclude_marker = "[- eng]"
        else:
            # Other files (e.g. monolingual): nothing to exclude.
            exclude_marker = None

        # Iterate over CHI utterances
        for utt in single_file_reader.utterances(participants='CHI'):

            tokens = utt.tokens
            pos = [token.pos for token in tokens]

            # Only include utterances containing a verb
            if not any(p in _VERBAL_POS for p in pos):
                continue

            # Only include utterances in target language
            if exclude_marker is not None and exclude_marker in utt.tiers['CHI']:
                continue

            words = [token.word for token in tokens]
            text = " ".join(words).strip()

            pronoun_before_verb, pron_verb_bigram = _preverbal_pronoun(pos, words)

            rows.append({
                "age": age,
                "utterance": text,
                "pronoun_before_verb": pronoun_before_verb,
                "pron_verb_bigram": pron_verb_bigram
            })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=_CHI_DF_V2_COLUMNS)


In [57]:
### PRINT THE RESULTS!

# One table of verb-bearing CHI utterances per participant group, in the same
# order as group_names below.
hoffdataframes = [corpus_to_chi_df_v2(eng_files),
                  corpus_to_chi_df_v2(span_files),
                  corpus_to_chi_df_v2(mono_files)]

group_names = ["Biling-English", "Biling-Spanish", "Mono-English"]

# Row order of every summary table: pronoun-before-verb first, then the rest.
outcome_order = [True, False]

for name, df in zip(group_names, hoffdataframes):
    print(f"--- {name} Group ---")

    # An empty table would make the column lookup below fail, so report it
    # and move on to the next group.
    if df.empty:
        print("No CHI utterances containing a verb were found.")
        print("\n")
        continue

    flags = df['pronoun_before_verb']
    # fill_value keeps an outcome that never occurs at 0 / 0.0% rather than
    # NaN, which would also turn the integer counts into floats.
    counts = flags.value_counts().reindex(outcome_order, fill_value=0)
    percent = flags.value_counts(normalize=True).reindex(outcome_order, fill_value=0.0) * 100

    display_df = pd.DataFrame({
        "Count": counts,
        "Percent": percent.map("{:.1f}%".format)
    })

    print(display_df)
    print("\n")

--- Biling-English Group ---
                     Count Percent
pronoun_before_verb               
True                  4050   65.1%
False                 2170   34.9%


--- Biling-Spanish Group ---
                     Count Percent
pronoun_before_verb               
True                  2544   43.3%
False                 3333   56.7%


--- Mono-English Group ---
                     Count Percent
pronoun_before_verb               
True                  3835   70.1%
False                 1639   29.9%


