In [2]:
import pandas as pd
import numpy as np

In [3]:
# get advocate data (1946 - 2014)
# pd.DataFrame.from_csv was deprecated and later removed from pandas;
# pd.read_csv is the documented replacement. index_col=False keeps the
# first CSV column as ordinary data instead of promoting it to the index.
df = pd.read_csv("../data/advocate_data/advocates.csv", index_col=False)

In [4]:
# Normalise the columns later used as join keys: the term becomes a string
# and the lawyer name loses leading/trailing whitespace.
df['term_oyez'] = df['term_oyez'].apply(str)
df['lawyer_name'] = df['lawyer_name'].apply(lambda name: name.strip())

In [5]:
# get additional data (1998 - 2014) from the Stata file of matched names
matched_names_path = "../data/advocate_data/MatchedNames_FromAlex.dta"
a_df = pd.read_stata(matched_names_path)

In [6]:
# A docket_id looks like "1998_01_227": everything before the first "_" is
# the term, and the remainder (with "_" turned into "-") is the docket number.
docket_id_parts = a_df['docket_id'].apply(lambda docket_id: docket_id.split("_", 1))
a_df['term'] = docket_id_parts.apply(lambda parts: parts[0])
a_df['docket'] = docket_id_parts.apply(lambda parts: parts[1].replace("_", "-"))

In [7]:
# The audio file name carries four "_"-separated flags in the 7 characters
# that precede its last 4 characters; split them out into columns a..d
# (flag b = p or r, petitioner/respondent).
audio_flags = [audio_name[-11:-4].split("_") for audio_name in a_df['audio']]
a_df['a'], a_df['b'], a_df['c'], a_df['d'] = zip(*audio_flags)

In [8]:
# this is optional -- it's not used in the join
def bool_num(b):
    """Return 1 if ``b`` compares equal to ``True``, otherwise 0.

    The comparison is ``b == True`` rather than a truthiness test, so
    values such as ``2`` or a non-empty string map to 0.
    """
    return 1 if b == True else 0
    
# 0/1 indicator columns derived from flag b ('p' = petitioner, 'r' = respondent,
# anything else = other). The original used the `<>` operator, which is a
# syntax error on Python 3; `!=` is equivalent and works on Python 2 as well.
# The comparisons are vectorised, so no per-row helper call is needed.
a_df['petitioner'] = (a_df['b'] == 'p').astype(int)
a_df['respondent'] = (a_df['b'] == 'r').astype(int)
a_df['other'] = ((a_df['b'] != 'r') & (a_df['b'] != 'p')).astype(int)

In [9]:
# outer join the two data sources to find holes: rows that exist in only one
# source are kept, with the other source's columns left empty.
oyez_keys = ['term_oyez', 'docket_oyez', 'lawyer_name']
matched_keys = ['term', 'docket', 'full_name']
final = pd.merge(df, a_df, how='outer', left_on=oyez_keys, right_on=matched_keys)

In [10]:
# Coalesce each pair of join keys into a single column: keep the Oyez value
# and fall back to the matched-names value wherever the Oyez side is null.
for merged_col, oyez_col, matched_col in [
    ('term_merged', 'term_oyez', 'term'),
    ('docket_merged', 'docket_oyez', 'docket'),
    ('name_merged', 'lawyer_name', 'full_name'),
]:
    oyez_missing = final[oyez_col].isnull()
    final[merged_col] = np.where(oyez_missing, final[matched_col], final[oyez_col])

In [11]:
# Persist the full joined table (the DataFrame index is written as well,
# since to_csv is called with its defaults).
merged_csv_path = "../data/advocate_data/lawyers_cases_1946-2014.csv"
final.to_csv(merged_csv_path)

In [57]:
# Keep only the coalesced name / docket / term columns for the export below.
merged_columns = ["name_merged", "docket_merged", "term_merged"]
out = final[merged_columns]

In [58]:
# Positional rename of the three selected columns to shorter headers.
short_headers = ['name', 'case', 'Year']
out.columns = short_headers

In [63]:
# Collapse the (name, case, Year) rows into one record per lawyer:
#   {'case': [cases in sorted order], 'year_min': ..., 'year_max': ...}
# Fixes relative to the previous version of this cell:
#   * DataFrame.sort() no longer exists in pandas; sort_values() is its
#     replacement and takes the same list of column names.
#   * the year_min branch assigned to 'year_max' (copy-paste slip), so
#     year_min could never be lowered and year_max could be corrupted.
# Years are compared with the values' own ordering, exactly as before.
lawyers_dict = {}
for name, case, year in out.sort_values(["name", "Year", "case"]).values:
    if name not in lawyers_dict:
        # First row for this lawyer: its year is both the min and the max.
        lawyers_dict[name] = {'case': [case], 'year_max': year, 'year_min': year}
        continue
    record = lawyers_dict[name]
    record['case'].append(case)
    if record['year_max'] < year:
        record['year_max'] = year
    if record['year_min'] > year:
        record['year_min'] = year

In [92]:
# sometimes I get annoyed by pandas/csv writer and I just want to do it myself...
# Each output line is:  "<lawyer>",<year or year_min-year_max>,"<list of cases>"
# Changes relative to the previous version of this cell:
#   * the file is opened in a `with` block so it is closed even if a write fails;
#   * text mode ("w") is used because the rows are str, which a binary-mode
#     file rejects on Python 3;
#   * double quotes inside a quoted field are doubled (standard CSV escaping)
#     so a name or case containing '"' cannot break the row.
with open("../data/image_data/mturk/input.csv", "w") as text_file:
    for lawyer, record in lawyers_dict.items():
        quoted_lawyer = '"' + lawyer.replace('"', '""') + '"'
        quoted_cases = '"' + str(record['case']).replace('"', '""') + '"'
        year_min = str(record['year_min'])
        year_max = str(record['year_max'])
        # A single year is written as-is; a span is written as "min-max".
        if year_max == year_min:
            years = year_min
        else:
            years = year_min + "-" + year_max
        text_file.write(quoted_lawyer + "," + years + "," + quoted_cases + '\n')