# Data gathering scripts

### Step 1. Gathering the phonetics data

The data set was initially as follows: 

For a given case (e.g. 2006_04_1350), we have:
-- A text-grid file `2006_04_1350.TextGrid`, found in `/data/Dropbox/Data/Supreme_Court_Audio/Oyez_vowels/FAVE/oyez_full/2006`
This is the breakdown by sentences/paragraphs. It contains a number N of « items » which is indicated at the beginning as the size parameter (here, size = 11). Each item corresponds to one person speaking.
-- Corresponding files in `/data/Dropbox/Data/Supreme_Court_Audio/Oyez_vowels/FAVE/FAVE-extract/`
For any given hearing, there are 4 * M files, where M is less than or equal to N - the number of items (or size) as above. For 2006_04_1350 for example, N is 11 but M is 4. File K corresponds to speaker K. The relevant files for us are those named 
`2006_04_1350_sK_norm.txt`
where K is the index of the speaker. They contain the breakdown of speaker K's speech by word/vowel, together with the relevant phonetic information for each vowel (the duration, the first two formants, and further formant data).

From the first file (the `.TextGrid`), we find the name of speaker K, and we then gather the phonetics data associated to this speaker by going through the corresponding `_norm.txt` file.

In [None]:
####
## Gathers the data from formants, using the names in the TextGrids
####
# encoding: utf-8
import os
import io

# Root folder of the TextGrid files; the script expects one sub-folder per year in it.
TEXTGRIDS_FOLDERS = "/data/Dropbox/Data/Supreme_Court_Audio/Oyez_vowels/FAVE/oyez_full/"
# Root folder of the FAVE-extract output; formant files are looked up in "<year>_vowels" sub-folders.
FORMANTS_FOLDERS = "/data/Dropbox/Data/Supreme_Court_Audio/Oyez_vowels/FAVE/FAVE-extract/"
# Folder in which the result file "wordssyllablesformants.txt" is written.
RESULT_FOLDERS = ""

#######
### Takes a file with formants (the norm.txt ones)
### extracts the words with more than 1 syllable
### and adds (in 'output') a data entry of the type
### word, speaker, [list of the syllables and the formants data for each syllable]
#####
def treat_file(filetotreat, output, speaker):
    """Write the multi-vowel words of one formant file to ``output``.

    The first three lines of ``filetotreat`` are skipped, and only rows made
    of exactly 26 whitespace-separated fields are considered. Consecutive
    rows that carry the same word are grouped together; a group is written
    (through ``add_occurrence``) only when it holds more than one row.

    Args:
        filetotreat: open, readable text file (a ``_norm.txt`` formant file).
        output: open, writable text file receiving the entries.
        speaker: name of the speaker, stored with every entry.

    Returns:
        Always ``True``.
    """
    pending = []            # rows collected for the word currently being read
    previous_word = "--"    # "--" marks "no word seen yet"

    for raw in filetotreat.readlines()[3:]:
        fields = raw.split()
        if len(fields) != 26:
            # Malformed / unexpected row: ignore it
            continue

        entry = collapse_line(fields)
        if previous_word == "--" or entry[0] == previous_word:
            pending.append(entry)
        else:
            # A new word starts: flush the previous one if it had several rows
            if len(pending) > 1:
                add_occurrence(previous_word, pending, speaker, output)
            pending = [entry]
        previous_word = entry[0]

    # The last word of the file has not been flushed by the loop
    if len(pending) > 1:
        add_occurrence(previous_word, pending, speaker, output)

    return True

def add_occurrence(word, occurrence, speaker, output):
    """Append one ``word,speaker,<occurrence>`` line to ``output``.

    ``occurrence`` is serialised with ``str()``; ``word`` and ``speaker``
    must already be strings.
    """
    fields = [word, speaker, str(occurrence)]
    output.write(','.join(fields) + '\n')

def collapse_line(line):
    """Reduce one split formant row to the fields kept downstream.

    Column layout used here (0-based): 0 vowel, 2 word, 3 first formant,
    4 second formant, 8 duration, 17..24 alternating first/second formant
    measurements (presumably taken at 20%, 35%, 50% and 65% of the vowel,
    going by the naming used for these columns).

    Args:
        line: sequence of fields of a row (at least 25 entries).

    Returns:
        ``[word, vowel, F1, F2, duration, [F1 track], [F2 track]]``.
    """
    vowel, word = line[0], line[2]
    first_formant, second_formant = line[3], line[4]
    duration = line[8]
    # Odd columns 17..23 hold the first formant, even columns 18..24 the second
    first_track = [line[col] for col in (17, 19, 21, 23)]
    second_track = [line[col] for col in (18, 20, 22, 24)]
    return [word, vowel, first_formant, second_formant, duration,
            first_track, second_track]



if __name__ == "__main__":
    # For every hearing TextGrid of every year, read the speaker names it
    # declares and append each speaker's multi-vowel words (taken from the
    # matching "_norm.txt" formant file) to a single result file.
    textgrid_suffix = '.TextGrid'

    # Paths are built explicitly instead of relying on os.chdir():
    # os.chdir("") raises when RESULT_FOLDERS is left empty, and changing the
    # working directory inside nested loops makes relative paths fragile.
    result_path = os.path.join(RESULT_FOLDERS, "wordssyllablesformants.txt")

    # Append mode is kept on purpose: re-running adds to the existing file.
    with open(result_path, "a") as output:
        years = os.listdir(TEXTGRIDS_FOLDERS) ### List all the years
        for year in (y for y in years if y.isnumeric()):
            year_dir = os.path.join(TEXTGRIDS_FOLDERS, year)
            formants_dir = os.path.join(FORMANTS_FOLDERS, year + '_vowels')
            list_hearings = os.listdir(year_dir) ### List all the hearings for a given year
            for hearing in (h for h in list_hearings if h.endswith(textgrid_suffix)):
                # str.strip('.TextGrid') removes any of the characters
                # ". T e x t G r i d" from BOTH ends, so it can eat into the
                # case id itself; slice the known suffix off instead.
                case_id = hearing[:-len(textgrid_suffix)]

                with open(os.path.join(year_dir, hearing), 'r', encoding='utf-8') as hearing_textgrid:
                    ### Looking for the speaker names
                    name_lines = [l.strip() for l in hearing_textgrid
                                  if l.strip().startswith("name =")]

                # The K-th "name =" line is matched with formant file "_sKK_"
                for item, line in enumerate(name_lines, start=1):
                    # Drop the leading 8 characters and the last one; this
                    # assumes the line looks like: name = "Speaker Name"
                    speaker = line[8:-1]
                    name_file = '{}_s{:02d}_norm.txt'.format(case_id, item)
                    try:
                        filetotreat = open(os.path.join(formants_dir, name_file)) ### Opening the formant file
                    except FileNotFoundError:
                        # NOTE(review): stops at the first missing speaker file
                        # (original behaviour) - confirm that the formant files
                        # of a hearing are always numbered without gaps.
                        break
                    with filetotreat:
                        treat_file(filetotreat, output, speaker) #### Treating the formant file

### Step 2. Listing words and names

In an exploratory phase, we identify the words most frequently pronounced by our speakers, and we build a list of all the speakers that appear.

The number of speakers is of order 1000.

In [None]:
# Find the list of the words that are pronounced in the hearings.
# Keep only the ones that are pronounced more than MIN_OCCURRENCES times.
from collections import Counter

input_file = "/data/WorkData/ideology_from_audio/RESULTS/wordssyllablesformants.txt"
output_file = "/data/WorkData/ideology_from_audio/RESULTS/listwords.txt"
MIN_OCCURRENCES = 100

# Count the entries per word. The word is the first comma-separated field of
# each line; the file is streamed rather than loaded entirely in memory.
with open(input_file, "r") as listofwords:
    words = Counter(line.split(',')[0] for line in listofwords)

# (count, word) pairs, most frequent first
words_rev = sorted(((count, word) for word, count in words.items()), reverse=True)

# NOTE(review): append mode is kept from the original, so running this cell
# twice duplicates the list in the output file - confirm this is intended.
with open(output_file, "a") as orderedwords:
    for value, word in words_rev:
        if value <= MIN_OCCURRENCES:
            # The list is sorted by decreasing count: nothing left to keep
            break
        orderedwords.write(word+" "+str(value)+"\n")

In [None]:
# encoding: utf-8
# Write down the list of people who say something in the hearing dataset

input_file = "/data/WorkData/ideology_from_audio/RESULTS/wordssyllablesformants.txt"
output_file = "/data/WorkData/ideology_from_audio/RESULTS/listpeople.txt"

# The speaker is the second comma-separated field of each line. Names are
# collected in a set: testing membership in a list would rescan the list for
# every line of a very large file.
with open(input_file, "r") as listofwords:
    names = sorted({line.split(',', 2)[1] for line in listofwords})

# NOTE(review): append mode is kept from the original, so running this cell
# twice duplicates the names in the output file - confirm this is intended.
with open(output_file, "a") as peoples:
    for name in names:
        peoples.write(name+"\n")

### Step 3. Searching one's ideology

We now have to connect speakers with their suspected ideology (Democrat or Republican). As suggested by our advisors, we use the DIME (Database on Ideology, Money in politics and Elections) database, developed by Adam Bonica and available at https://data.stanford.edu/dime. This dataset lists the financial contributions to political campaigns in the USA. Some basic difficulties appear:
- The database is very large, with several dozens of millions of entries, and a naive thorough search takes too much time if it has to be done for ~1000 speakers.
- Some common names may appear several times.
- How to identify a speaker in the donation list?
We also encounter the less obvious issue:
- It is fairly common that a given person will give money to both parties!
Inspired by the previous group, we chose the following approach:
1. We go through the list of donations exactly once, and only keep an entry if its "job/activity" field contains a word relevant to us: lawyer, justice, attorney...
This effectively reduces the size of the list by a factor of 10--100.
2. Then for each speaker, we go through the reduced list and try to match the speaker's first/last name with the first/last name entry of the donation. **Here we make the following bet: there is only one person with a given first & last name who happens to work as a lawyer. A more careful procedure would involve checking the middle name (not always present in both tables) or the addresses and listing the possible collisions.**
3. If a speaker is not present, his/her ideology is set to "Undefined". Otherwise, we take the share of the speaker's donations that went to Republican candidates:
$$
\textbf{Ideology}(\textrm{speaker}) = \frac{\textrm{Donation of the given speaker for Republican candidates}}{\textrm{Donation of the given speaker}}
$$
Hence an ideology of $0$ corresponds to a pure Democrat and ideology of $1$ corresponds to pure Republican.

We run this on two datasets: the 2008 political campaign (including State and Local elections) and the combined Presidential campaigns (all Presidential elections since the mid-70's) and we take the average (which is admittedly not a proper barycenter).

This way, we obtain an ideology label for ~500 speakers out of ~900.

In [None]:
# encoding: utf-8
# Extract the lawyers from the donation dataset, to avoid looping on a huge file
import csv

# Full donation table (input) and its lawyers-only extract (output)
DONATIONS_CSV = "contribDB_president.csv"
DONATIONS_LAWYERS = "contriblawyers_from_presidential.csv"

# Files handed to the csv module must be opened with newline='': otherwise
# newlines embedded in quoted fields are misread, and on platforms that write
# "\r\n" line endings every written row gets an extra "\r".
file_donations=open(DONATIONS_CSV, 'r', newline='')
file_contributions=open(DONATIONS_LAWYERS, 'w', newline='')

def is_lawyer(job):
    """Tell whether the occupation string ``job`` looks law-related.

    The test is a case-sensitive substring search for a fixed list of
    keywords; it returns ``True`` as soon as one of them occurs in ``job``.
    """
    keywords = ("judge", "lawyer", "attorney", "advocate", "law", "justice")
    return any(keyword in job for keyword in keywords)

# Copy to the output file every donation whose occupation field (column 19)
# is recognised by is_lawyer().
contributions = csv.writer(file_contributions)

try:
    for donation in csv.reader(file_donations):
        # Blank or truncated rows have no occupation column: skip them
        # instead of failing with an IndexError in the middle of the file.
        if len(donation) > 19 and is_lawyer(donation[19]):
            contributions.writerow(donation)
finally:
    # Close both files even if reading or writing fails part-way through
    file_donations.close()
    file_contributions.close()

In [None]:
# encoding: utf-8
# Find ideology from a donation dataset
import csv

# Lawyers-only donation extract (input), speaker list (input), result log (output)
DONATIONS_CSV = "contriblawyers_from_presidential.csv"
PEOPLE_FILE = "listpeople.txt"
LOG_FILE = "results_ideologt_from_presidential.txt"

log = open(LOG_FILE, "w")

#### Form the list of speakers
# Each speaker is stored as {'FN': first name, 'LN': last name}, lower-cased.
with open(PEOPLE_FILE, 'r') as file_people:
    list_people = file_people.readlines()

speakers = []

for people in list_people:
    name_components = people.split()
    if len(name_components) < 2:
        # Blank line or single-token name: there is no first/last name pair.
        # Without this guard the previous speaker's names would be reused
        # (or a NameError raised on the very first line).
        continue
    first_name = name_components[0].lower()
    # With two tokens the last name is the second one; with more, the third
    # token is used (e.g. "First Middle Last").
    last_name = name_components[1 if len(name_components) == 2 else 2].lower()
    speakers.append({'FN':first_name, 'LN':last_name})

def is_lawyer(job):
    """Return True when ``job`` contains one of the law-related keywords.

    The search is a plain, case-sensitive substring test.
    """
    for keyword in ("judge", "lawyer", "attorney", "advocate", "law", "justice"):
        if keyword in job:
            return True
    return False

def parcours_donation(speaker):
    """Scan the donation file and compute the Republican share for ``speaker``.

    A row is attributed to the speaker when its first name (column 8) and
    last name (column 7) equal ``speaker['FN']`` / ``speaker['LN']`` and its
    occupation (column 19) passes ``is_lawyer``. The amount (column 3) is
    added to the Democrat total when the party code (column 24) is "100",
    and to the Republican total when it is "200"; other codes are ignored.

    Args:
        speaker: dict with the keys 'FN' (first name) and 'LN' (last name).

    Returns:
        ``sum_rep / (sum_dem + sum_rep)``, or ``-1`` when that total is 0
        (no matching donation to either party).

    Raises:
        ValueError: if the amount of a matching row is not a number.
    """
    sum_dem = 0
    sum_rep = 0
    # newline='' is what the csv module expects; the "with" block closes the
    # file even when a row cannot be parsed.
    with open(DONATIONS_CSV, 'r', newline='') as file_donations:
        for donation in csv.reader(file_donations):
            if len(donation) < 25:
                # Blank or truncated row: it cannot hold the party column
                continue
            first_name = donation[8]
            last_name = donation[7]
            job = donation[19]
            if not (first_name == speaker['FN'] and last_name == speaker['LN'] and is_lawyer(job)):
                continue
            # Only parse the amount for matching rows, so that a malformed
            # amount on an unrelated row cannot abort the whole scan.
            amount = float(donation[3])
            party = donation[24]
            print("FOUND ONE!")
            print("An amount of "+str(amount))
            if party=="100":
                sum_dem += amount
                print("A democrat!")
            elif party=="200":
                sum_rep += amount
                print("A republican !")
    print("End of run!")
    if (sum_dem+sum_rep == 0):
        return -1
    return (sum_rep)/(sum_dem+sum_rep)

#### The results are logged as plain sentences ("<first> <last> is <value>"),
#### which is easy to read but will need parsing again later on.
for speaker in speakers:
    ideology = parcours_donation(speaker)
    full_name = ' '.join((speaker['FN'], speaker['LN']))
    if ideology != -1:
        log.write(full_name+' is '+str(ideology)+' \n')
    else:
        # -1 is the "no usable donation found" marker
        log.write(full_name+' is undefined \n')

log.close()