In [2]:
import nltk
import re
import os
from nltk.corpus.reader import CHILDESCorpusReader
from nltk import FreqDist, word_tokenize, bigrams, ConditionalFreqDist
import pylangacq
import pandas as pd

In [None]:
# I had to install pylangacq like this for it to work in Jupyter
#import sys
#!"$sys.executable" -m pip install pylangacq

pylangacq source

@TechReport{lee-et-al-pylangacq:2016,
   Title       = {Working with CHAT transcripts in Python},
   Author      = {Lee, Jackson L. and Burkholder, Ross and Flinn, Gallagher B. and Coppess, Emily R.},
   Institution = {Department of Computer Science, University of Chicago},
   Year        = {2016},
   Number      = {TR-2016-02},
}

In [3]:
# # # Set Up

# Location of the Hoff corpus folder. Set the HOFF_DIR environment variable to
# point at your own copy; the fallback is the path this notebook was
# originally written against.
HOFF_DIR = os.environ.get(
    'HOFF_DIR',
    '/Users/baeddanhemphill/Desktop/Ling_401/Homeworks/Hoff',
)

# Create the Hoff reader and report how many files it picked up (a bare
# expression in the middle of a cell is never displayed, so print it).
hre = pylangacq.read_chat(HOFF_DIR)
print(f"Loaded {hre.n_files()} files from {HOFF_DIR}")

# Separate by eng, span, mono.
# The metadata for age 2.5 is missing dependent tier data, so focus only on ages 3/3.5.
# Call shape: .filter(keep, remove)
# NOTE(review): the table displayed for corpus_to_chi_df_v2(eng_files) still
# contains rows with age (2, 6, 0), so confirm that the '2.5' pattern really
# matches the 2.5-year file paths.
eng_files = hre.filter('biling-eng', '2.5')
span_files = hre.filter('biling-spa', '2.5')
mono_files = hre.filter('mono', '2.5')


# Define pronouns (lower-case forms; singular and plural kept apart).
eng_pnns_sg = {'i', 'you', 'he', 'she', 'it'}
eng_pnns_pl = {'we', 'they'}
# NOTE(review): 'tu' is listed without an accent; the subject pronoun is
# usually spelled 'tú' -- check which spelling the transcripts use.
span_pnns_sg = {'yo', 'tu', 'él', 'ella', 'usted'}
span_pnns_pl = {'nosotros', 'nosotras', 'ellos', 'ellas', 'ustedes'}

In [None]:
# Inspect the headers of the loaded CHAT files; the return value is shown as
# this cell's output.
hre.headers()

In [None]:
def pos_list(files):
    """Return the ``pos`` attribute of every CHI token in ``files``.

    ``files`` must provide ``tokens(participants=...)``; the tags are returned
    as a list, in the order the tokens are yielded.
    """
    tags = []
    for chi_token in files.tokens(participants='CHI'):
        tags.append(chi_token.pos)
    return tags

In [None]:

def corpus_to_chi_df_v1(corpus):
    """Build a table of CHI utterances that contain a verb or an auxiliary.

    Parameters
    ----------
    corpus
        Reader-like object providing ``file_paths()`` and ``filter(path)``.
        The reader returned by ``filter`` must provide ``ages('CHI')`` and
        ``utterances(participants='CHI')``.

    Returns
    -------
    pandas.DataFrame
        One row per kept utterance with the columns ``age`` (the first value
        returned by ``ages('CHI')`` for the file, or ``None`` when there is
        none) and ``utterance`` (the token words joined by single spaces).
        Both columns are present even when no utterance qualifies, so
        downstream column access never fails on an empty result.
    """
    columns = ["age", "utterance"]
    verb_tags = ("verb", "aux")
    rows = []

    for file_path in corpus.file_paths():
        # Narrow the reader to this one file so the age belongs to the
        # utterances collected below.
        # NOTE(review): the raw path is handed to filter(); if filter()
        # interprets its argument as a pattern, unusual characters in the
        # path may need escaping -- confirm against the pylangacq docs.
        single_file_reader = corpus.filter(file_path)
        chi_ages = single_file_reader.ages('CHI')
        age = chi_ages[0] if chi_ages else None

        for utt in single_file_reader.utterances(participants='CHI'):
            # Only include the utterances containing a verb (or auxiliary).
            if not any(tok.pos in verb_tags for tok in utt.tokens):
                continue

            text = " ".join(tok.word for tok in utt.tokens).strip()
            rows.append({"age": age, "utterance": text})

    return pd.DataFrame(rows, columns=columns)

# Preview the v1 table for the bilingual-English files (shown as this cell's output).
corpus_to_chi_df_v1(eng_files)

In [None]:
# Build one table of verb-containing CHI utterances per group.
# corpus_to_chi_df_v1 is the builder defined above this cell; the unsuffixed
# name corpus_to_chi_df is not defined anywhere in the notebook and raises
# NameError on a fresh kernel.
eng_df = corpus_to_chi_df_v1(eng_files)
span_df = corpus_to_chi_df_v1(span_files)
mono_df = corpus_to_chi_df_v1(mono_files)

# The dependent fields in 2.5 are broken :(
# The "age" column holds tuples such as (3, 0, 0) -- presumably
# (years, months, days) -- not strings like '3;0', so a string-based isin()
# matches nothing. Compare on (years, months) instead: 3;0 and 3;6.
valid_ages = [(3, 0), (3, 6)]


def keep_valid_ages(df):
    """Return the rows of ``df`` whose age is one of ``valid_ages``."""
    mask = df['age'].map(
        lambda age: isinstance(age, (tuple, list)) and tuple(age[:2]) in valid_ages
    ).astype(bool)  # force a boolean mask even when the frame is empty
    return df[mask]


eng_df = keep_valid_ages(eng_df)
span_df = keep_valid_ages(span_df)
mono_df = keep_valid_ages(mono_df)



In [None]:
# Gather every distinct, non-empty POS tag found on the tokens of the
# utterances returned by hre.utterances() (no participant filter is passed).
pos_set = {
    token.pos
    for utterance in hre.utterances()
    for token in utterance.tokens
    if token.pos
}
pos_set

In [None]:
# Gather the distinct category parts of the tokens' mor strings, i.e. the text
# before the first "|"; tokens with an empty/missing mor are skipped.
mor_set = {
    token.mor.split("|", 1)[0]
    for utterance in hre.utterances()
    for token in utterance.tokens
    if token.mor
}
mor_set

In [21]:
def _pronoun_before_verb(pos, words, verb_tags=("verb", "aux")):
    """Scan the tags left to right, stopping at the first verb/aux.

    Returns ``(pronoun_before_verb, pron_verb_bigram)``: the flag is True when
    a "pron" tag occurs before the first verb/aux; the bigram is the
    ``(pronoun, verb)`` word pair when such a pronoun is immediately followed
    by a verb/aux, otherwise ``None``.
    """
    seen_pronoun = False
    for i, tag in enumerate(pos):
        if tag in verb_tags:
            # Reached the first verb without finding an adjacent pronoun.
            break
        if tag == "pron":
            seen_pronoun = True
            if i + 1 < len(pos) and pos[i + 1] in verb_tags:
                return True, (words[i], words[i + 1])
    return seen_pronoun, None


def corpus_to_chi_df_v2(corpus):
    """Build a table of verb-containing CHI utterances with pronoun flags.

    Parameters
    ----------
    corpus
        Reader-like object providing ``file_paths()`` and ``filter(path)``.
        The reader returned by ``filter`` must provide ``ages('CHI')`` and
        ``utterances(participants='CHI')``.

    Returns
    -------
    pandas.DataFrame
        One row per CHI utterance that has at least one "verb" or "aux" tag,
        with the columns ``age`` (first value of ``ages('CHI')`` for the file,
        or ``None``), ``utterance`` (token words joined by spaces),
        ``pronoun_before_verb`` (bool) and ``pron_verb_bigram`` (a
        ``(pronoun, verb)`` tuple or ``None``). The columns are present even
        when no utterance qualifies, so downstream column access never fails
        on an empty result.
    """
    columns = ["age", "utterance", "pronoun_before_verb", "pron_verb_bigram"]
    verb_tags = ("verb", "aux")
    rows = []

    for file_path in corpus.file_paths():
        # Narrow the reader to this one file so the age belongs to the
        # utterances collected below.
        single_file_reader = corpus.filter(file_path)
        chi_ages = single_file_reader.ages('CHI')
        age = chi_ages[0] if chi_ages else None

        for utt in single_file_reader.utterances(participants='CHI'):
            tokens = utt.tokens
            pos = [token.pos for token in tokens]

            # Only include utterances containing a verb (or auxiliary).
            if not any(tag in verb_tags for tag in pos):
                continue

            words = [token.word for token in tokens]
            pronoun_before_verb, pron_verb_bigram = _pronoun_before_verb(
                pos, words, verb_tags
            )

            rows.append({
                "age": age,
                "utterance": " ".join(words).strip(),
                "pronoun_before_verb": pronoun_before_verb,
                "pron_verb_bigram": pron_verb_bigram,
            })

    return pd.DataFrame(rows, columns=columns)

# Preview the v2 table for the bilingual-English files (shown as this cell's output).
corpus_to_chi_df_v2(eng_files)

Unnamed: 0,age,utterance,pronoun_before_verb,pron_verb_bigram
0,"(2, 6, 0)",a ver los patos .,False,
1,"(2, 6, 0)",llama .,False,
2,"(2, 6, 0)",pon eso .,False,
3,"(2, 6, 0)",take it .,False,
4,"(2, 6, 0)",put it ?,False,
...,...,...,...,...
7093,"(3, 0, 0)",esto se está acabando .,True,"(se, está)"
7094,"(3, 0, 0)",podemos también ?,False,
7095,"(3, 0, 0)",I like /.,True,"(I, like)"
7096,"(3, 0, 0)",esto no se puede bajar .,True,"(se, puede)"
