# Notebook 6
### Kasper Fyhn Jacobsen

A bit like in my last notebook, I have tried to define a class which makes writing the different scripts with different needs easier - at least to some degree, i.e. I will be using the same class through the different scripts, so I won't have to write it again. Another nifty thing is that I will potentially be able to simply expand the class definition to add on more functionality for later projects.

In [1]:
import os
from collections import Counter
from string import punctuation as pnc
   
class Transcript:
    '''This class is a representation of a .cha transcript which can yield
    things like tokens, speaker details and lines from just one speaker.

    Attributes set by the constructor:
        name            the file name as reported by the opened file (or the
                        given filepath if the file could not be opened)
        raw_transcript  the file content with tab and carriage return
                        characters removed
        headers         all lines starting with '@'
        lines           all lines starting with '*' (the utterance lines)
        fully_loaded    True if the file was read, otherwise False
    '''

    def __init__(self, filepath):
        '''Read the file at filepath and split it into header lines and
        utterance lines.

        If the file cannot be opened or decoded as UTF-8, an error message is
        printed, fully_loaded is set to False, and headers/lines are left
        empty so that the other methods still work on the object.'''

        # defaults, so that a transcript that failed to load is still usable
        self.name = filepath
        self.raw_transcript = ''
        self.headers = []
        self.lines = []
        self.fully_loaded = False # flag that transcript is not loaded (yet)

        try:
            # the with statement makes sure the file handle is closed again
            with open(filepath, 'r', encoding='utf-8') as file:
                self.name = file.name
                raw = file.read()
        except (OSError, UnicodeDecodeError) as e:
            print('An error occured when loading:', filepath)
            print('Error message:', e)
            return

        # store the raw transcript, but clean it a little bit
        for item in ('\t', '\r'):
            raw = raw.replace(item, '')
        self.raw_transcript = raw

        # extract headerlines and transcriptlines
        text = raw.split('\n')
        self.headers = [line for line in text if line.startswith('@')]
        self.lines = [line for line in text if line.startswith('*')]
        self.fully_loaded = True # flag that transcript is fully loaded

    def lines_as_tuples(self):
        '''Return a list of tuples of all utterance lines, where tuple[0] is
        the three letter initials for the speaker and tuple[1] is the line.'''

        # with tabs removed a line looks like '*CHI:utterance', so the
        # speaker is at [1:4] and the utterance starts after the colon at [5:]
        return [(line[1:4], line[5:]) for line in self.lines]

    def tokens(self, speakers='all'):
        '''Return a list of tokens uttered by the specified speaker(s). If no
        speakers are specified, return tokens for all speakers.

        speakers is tested with "in", so it can be a single speaker code such
        as 'CHI' or a collection of codes. Tokens are lowercased and stripped
        of punctuation; tokens consisting only of punctuation are dropped.'''

        if speakers == 'all':
            speakers = self.speakers()

        # get tokens from the specified speakers
        words = [word.lower()
                 for spk, utterance in self.lines_as_tuples()
                 if spk in speakers
                 for word in utterance.split()]

        # clean for punctuation in one pass per token
        no_punctuation = str.maketrans('', '', pnc)
        cleaned = (word.translate(no_punctuation) for word in words)

        return [word for word in cleaned if word]

    def types(self, speakers='all'):
        '''Return a set of types uttered by the specified speaker(s). If no
        speakers are specified, return types for all speakers.'''

        return set(self.tokens(speakers=speakers))

    def word_freqs(self, speakers='all'):
        '''Return a Counter object of tokens uttered by the specified
        speaker(s). If no speakers are specified, return a Counter object for
        all speakers.'''

        return Counter(self.tokens(speakers=speakers))

    def speakers(self):
        '''Return a set of all speakers that appear in the transcript'''

        return {line[1:4] for line in self.lines}

    def speaker_details(self):
        '''Return a dictionary of dictionaries containing details about the
        speakers that appear in the transcript, keyed by speaker name. Each
        inner dictionary has the keys 'lang', 'corp', 'name', 'age', 'sex'
        and 'role'. If no info is given in the original transcript file on
        some details, e.g. age or sex, those entries will simply be empty.'''

        present = self.speakers() # only computed once

        details = {}
        for header in self.headers:
            if not header.startswith('@ID'):
                continue

            # skip the '@ID:' prefix and split the '|'-separated fields
            fields = header[4:].split(sep='|')
            # pad short headers so that missing fields become empty strings
            fields += [''] * (8 - len(fields))

            entry = {'lang':fields[0], 'corp':fields[1], 'name':fields[2],
                     'age':fields[3], 'sex':fields[4], 'role':fields[7]}
            if entry['name'] in present:
                details[entry['name']] = entry

        return details

## Adam's word frequencies
Having defined the class above, I can now load all transcripts into a list of Transcript objects and iterate over all of these to extract word frequencies of any given word. The script looks like this:

In [2]:
# Report, for every transcript in one folder, the age of the child (CHI) and
# the proportional frequency of each requested word in the child's speech.

# get the path to the directory containing the transcript files
path = input('Please, provide the path to the folder that you a report on: ')
os.chdir(path)
folder = os.path.basename(os.getcwd())

# load all transcripts from the folder; os.listdir() gives the names in
# arbitrary order, so sort them to get the same row order on every run
trans = [Transcript(file) for file in sorted(os.listdir())]
trans = [trn for trn in trans if trn.fully_loaded] # clean out non-loaded ones

# get a list of words for which the frequencies should be listed
words = input('Enter words in lowercase (separate with ","): ')
words = [word.strip() for word in words.split(',')]

# write the report one level up; the with statement closes the file even if
# something goes wrong while writing
with open(f'../report_{folder}.csv', 'a+') as report:
    # state which folder it is a report for
    print('Report of word frequencies in folder: ' + folder, file=report)

    # print column headers in the .csv file
    print('age', *words, sep=',', file=report)

    # for every file, get age and frequency of each word and report it
    for trn in trans:
        freqs = trn.word_freqs(speakers='CHI')
        total = sum(freqs.values()) # the same for every word, so compute once
        # leave the age empty if the transcript has no details for CHI
        age = trn.speaker_details().get('CHI', {}).get('age', '')
        # a transcript without any tokens from CHI gets 0.0 for every word
        # instead of stopping the script with a division by zero
        word_freqs = [freqs[word] / total if total else 0.0 for word in words]
        word_freqs = [f'{digit:.4}' for digit in word_freqs]
        print(age, *word_freqs, sep=',', file=report)

# let the user know that the program has ended
print('Done. The frequencies have been saved in', report.name)

Please, provide the path to the folder that you a report on: C:\Users\Kasper Fyhn Jacobsen\Dropbox\Child Language Acquisition\Data\Brown\Adam
Enter words in lowercase (separate with ","): i,you, food, mom
Done. The frequencies have been saved in ../report_Adam.csv


## Mom's word frequencies
Knowing that the age of the mom is not given in the transcript, let's kill that line and the age report in the later print statement. Other than that, the only line we need to change is really this line:

`freqs = trn.word_freqs(speakers='CHI')`

Which we will change into:

`freqs = trn.word_freqs(speakers='MOT')`

I'll also append 'mother' to the report file's name to have them separated. Let's see how much she uses words like 'don't', 'no' and 'bad':

In [3]:
# Report, for every transcript in one folder, the proportional frequency of
# each requested word in the mother's (MOT) speech. No age is reported.

# get the path to the directory containing the transcript files
path = input('Please, provide the path to the folder that you a report on: ')
os.chdir(path)
folder = os.path.basename(os.getcwd())

# load all transcripts from the folder; os.listdir() gives the names in
# arbitrary order, so sort them to get the same row order on every run
trans = [Transcript(file) for file in sorted(os.listdir())]
trans = [trn for trn in trans if trn.fully_loaded] # clean out non-loaded ones

# get a list of words for which the frequencies should be listed
words = input('Enter words in lowercase (separate with ","): ')
words = [word.strip() for word in words.split(',')]

# write the report one level up; the with statement closes the file even if
# something goes wrong while writing
with open(f'../report_{folder}_mother.csv', 'a+') as report:
    # state which folder it is a report for
    print('Report of word frequencies in folder: ' + folder, file=report)

    # print column headers in the .csv file; there is no 'age' column here,
    # because the rows below do not contain an age either
    print(*words, sep=',', file=report)

    # for every file, get the frequency of each word and report it
    for trn in trans:
        freqs = trn.word_freqs(speakers='MOT')
        total = sum(freqs.values()) # the same for every word, so compute once
        # a transcript without any tokens from MOT gets 0.0 for every word
        # instead of stopping the script with a division by zero
        word_freqs = [freqs[word] / total if total else 0.0 for word in words]
        word_freqs = [f'{digit:.4}' for digit in word_freqs]
        print(*word_freqs, sep=',', file=report)

# let the user know that the program has ended
print('Done. The frequencies have been saved in', report.name)

Please, provide the path to the folder that you a report on: C:\Users\Kasper Fyhn Jacobsen\Dropbox\Child Language Acquisition\Data\Brown\Adam
Enter words in lowercase (separate with ","): dont, no, bad
Done. The frequencies have been saved in ../report_Adam_mother.csv


## Bonus: one .csv for each child
To do this task, we only need to make one major change to the previous scripts: have the script find the subfolders in the "master folder", i.e. the one called Brown, and do what it did before to each of these. Thus, with this line

`folders = [folder for folder in os.listdir() if os.path.isdir(folder)]`

and a for loop iterating over this list, we're good to go.

In [4]:
# For every sub folder of a master folder, write one report with the age of
# the child (CHI) and the proportional frequency of each requested word.

# get the path to the directory containing the sub folders with data
path = input('Please, provide the path to the master folder of your data: ')
os.chdir(path)
master_path = os.getcwd() # remembered so we can always get back here
master = os.path.basename(master_path)

# make a list of all sub folders (assuming that there are only data dirs);
# sorted, because os.listdir() gives the names in arbitrary order
folders = sorted(folder for folder in os.listdir() if os.path.isdir(folder))

# get a list of words for which the frequencies should be listed
words = input('Enter words in lowercase (separate with ","): ')
words = [word.strip() for word in words.split(',')]

for folder in folders:
    # go into the relevant directory
    os.chdir(folder)

    try:
        # load all transcripts from the folder in a reproducible order
        trans = [Transcript(file) for file in sorted(os.listdir())]
        trans = [trn for trn in trans if trn.fully_loaded] # clean out non-loaded

        # the with statement closes the report file even if writing fails
        with open(f'../report_{folder}.csv', 'a+') as report:
            # state which folder it is a report for
            print('Report of word frequencies in folder: ' + folder,
                  file=report)

            # print column headers in the .csv file
            print('age', *words, sep=',', file=report)

            # for every file, get age and frequency of each word and report it
            for trn in trans:
                freqs = trn.word_freqs(speakers='CHI')
                total = sum(freqs.values()) # the same for every word
                # leave the age empty if there are no details for CHI
                age = trn.speaker_details().get('CHI', {}).get('age', '')
                # no tokens from CHI gives 0.0 instead of a division by zero
                word_freqs = [freqs[word] / total if total else 0.0
                              for word in words]
                word_freqs = [f'{digit:.4}' for digit in word_freqs]
                print(age, *word_freqs, sep=',', file=report)

        print(f'The frequencies for {folder} have been saved in {report.name}.')
    finally:
        # always go back to the master folder, also if something went wrong,
        # so that the next folder name is resolved from the right place
        os.chdir(master_path)

Please, provide the path to the master folder of your data: C:\Users\Kasper Fyhn Jacobsen\Dropbox\Child Language Acquisition\Data\Brown
Enter words in lowercase (separate with ","): doll, dad, me
The frequencies for Adam have been saved in ../report_Adam.csv.
The frequencies for Eve have been saved in ../report_Eve.csv.
The frequencies for Sarah have been saved in ../report_Sarah.csv.


## Extra bonus: All children in one file
I've made something, but I'm really not sure that it is ideal. Basically, the idea is to make frequency lists like we've done so far and then iterate over all these simultaneously up until the end of the longest list. When a list has run out of entries, I've simply had the script report nothing, i.e. nothing between commas, so that the values stay in the right column for each folder.

In [5]:
# Write one report for all sub folders of a master folder: one column per
# folder and one row per visit (transcript number within the folder), holding
# the proportional frequency of a single word in the child's (CHI) speech.
from itertools import zip_longest

# get the path to the directory containing the sub folders with data
path = input('Please, provide the path to the master folder of your data: ')
os.chdir(path)

# make a list of all sub folders (assuming that there are only data dirs);
# sorted, because os.listdir() gives the names in arbitrary order
folders = sorted(folder for folder in os.listdir() if os.path.isdir(folder))

# get a word to get the frequency of
word = input('Enter word in lowercase: ').strip()

# prepare a list to contain lists for each folder
all_trans = []
for folder in folders:
    # load all transcripts from the folder without leaving the master folder;
    # the file names are sorted so that the visit numbers are reproducible
    trans = [Transcript(os.path.join(folder, file))
             for file in sorted(os.listdir(folder))]
    trans = [trn for trn in trans if trn.fully_loaded] # clean out non-loaded
    # add a list of transcripts
    all_trans.append(trans)

# for every folder, make a prop frequency list of the given word
all_freqs = []
for trans in all_trans:
    freq_list = []
    for trn in trans:
        freqs = trn.word_freqs(speakers='CHI') # only counted once per file
        total = sum(freqs.values())
        # no tokens from CHI gives 0.0 instead of a division by zero
        freq_list.append(freqs[word] / total if total else 0.0)
    all_freqs.append(freq_list)

# the with statement closes the report file even if writing fails
with open('report_all.csv', 'a+') as report:
    # state that the report is for all folders
    print('Report of word frequencies in folders: ', folders, file=report)

    # print column headers in the .csv file
    print('visit', *folders, sep=',', file=report)

    # zip_longest walks through all lists at the same time up to the end of
    # the longest one; a list that has run out of entries yields None, which
    # is reported as nothing to keep the columns in place
    for visit, row in enumerate(zip_longest(*all_freqs), start=1):
        line = ['' if freq is None else f'{freq:.4}' for freq in row]
        print(visit, *line, sep=',', file=report)

# let the user know where the report was saved
master = os.path.basename(os.getcwd()) # get the name of the master folder
print(f'The frequencies for {master} have been saved in {report.name}.')

Please, provide the path to the master folder of your data: C:\Users\Kasper Fyhn Jacobsen\Dropbox\Child Language Acquisition\Data\Brown
Enter word in lowercase: eat
The frequencies for Brown have been saved in report_all.csv.
