In [1]:
import pandas as pd
import re
from pdfminer.high_level import extract_text
import warnings
# Silence every FutureWarning for the rest of the session so that deprecation
# notices do not clutter the cell outputs below.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Read the testimonies file into a DataFrame (path is relative to the notebook)
TESTIMONIES_PATH = "../Data/testimonies.json"
df = pd.read_json(TESTIMONIES_PATH)

In [3]:
# Unpack the "utterances" column so that every utterance gets its own row.
# The index is not renumbered here, so rows that came from the same
# testimony keep sharing that testimony's index label.
df = df.explode(column="utterances")

In [4]:
# Create utterance id: number the utterances within each (id_H, id_T) group,
# starting at 1. cumcount() follows the current row order of the frame.
df["id_U"] = df.groupby(["id_H", "id_T"]).cumcount() + 1

# Count utterance length in whitespace-separated words.
# Missing utterances (e.g. the NaN that explode() produces for an empty list)
# count as 0 words; converting them with str() first would count the literal
# text "nan" as one word.
utterance_text = df["utterances"].fillna("").astype(str)
df["word_count"] = utterance_text.str.split().str.len()

# Create unique utterance id of the form <id_H>_T_<id_T>__U_<id_U>_
# (the trailing underscore is part of the key).
df["global_key"] = (
    df["id_H"].astype(str)
    + "_T_" + df["id_T"].astype(str)
    + "__U_" + df["id_U"].astype(str)
    + "_"
)

In [5]:
# Preview the first five rows of the utterance-level frame
df.head(n=5)

Unnamed: 0,congress,chamber,committee,committee_short,year,date,title,id_H,witness,witness_name,...,subcategory,state_research,contrarian_organisation,contrarian,denialist,utterances,id_T,id_U,word_count,global_key
0,108,Senate,"Committee on Commerce, Science, and Transporta...","Commerce, Science, and Transportation",2004,2004-05-06,The Impacts of Climate Change and States' Actions,108shrg82493,"Colburn, Kenneth A., Executive Director, North...",Colburn,...,Government Officials,0,,Other witness,Other witness,"Thank you, Mr. Chairman. It's a delight to be ...",1,1,10,108shrg82493_T_1__U_1_
0,108,Senate,"Committee on Commerce, Science, and Transporta...","Commerce, Science, and Transportation",2004,2004-05-06,The Impacts of Climate Change and States' Actions,108shrg82493,"Colburn, Kenneth A., Executive Director, North...",Colburn,...,Government Officials,0,,Other witness,Other witness,My name's Ken Colburn. I'm the Executive Direc...,1,2,59,108shrg82493_T_1__U_2_
0,108,Senate,"Committee on Commerce, Science, and Transporta...","Commerce, Science, and Transportation",2004,2004-05-06,The Impacts of Climate Change and States' Actions,108shrg82493,"Colburn, Kenneth A., Executive Director, North...",Colburn,...,Government Officials,0,,Other witness,Other witness,"The biggest concern I have, Senator, is stayin...",1,3,42,108shrg82493_T_1__U_3_
0,108,Senate,"Committee on Commerce, Science, and Transporta...","Commerce, Science, and Transportation",2004,2004-05-06,The Impacts of Climate Change and States' Actions,108shrg82493,"Colburn, Kenneth A., Executive Director, North...",Colburn,...,Government Officials,0,,Other witness,Other witness,"This week is a good example. On Monday, the no...",1,4,51,108shrg82493_T_1__U_4_
0,108,Senate,"Committee on Commerce, Science, and Transporta...","Commerce, Science, and Transportation",2004,2004-05-06,The Impacts of Climate Change and States' Actions,108shrg82493,"Colburn, Kenneth A., Executive Director, North...",Colburn,...,Government Officials,0,,Other witness,Other witness,"Senator, I'm not sure if California has those ...",1,5,16,108shrg82493_T_1__U_5_


In [6]:
# Write the utterance-level table to disk.
# Rows are renumbered 0..n-1 first: after the explode step several rows share
# one index label, and to_json's default orient needs a unique index.
df = df.reset_index(drop=True)
df.to_json("../Data/utterances.json")

In [7]:
# Subset contrarian utterances with at least MIN_WORD_COUNT words
MIN_WORD_COUNT = 10

is_contrarian = df["contrarian"] == "Contrarian"
is_long_enough = df["word_count"] >= MIN_WORD_COUNT

# One combined mask followed by a non-inplace reset_index() gives `con` a
# fresh frame of its own, so adding columns to it later does not write into
# a filtered slice of `df` (the usual source of SettingWithCopyWarning).
con = df[is_contrarian & is_long_enough].reset_index(drop=True)
con.head(3)

Unnamed: 0,congress,chamber,committee,committee_short,year,date,title,id_H,witness,witness_name,...,subcategory,state_research,contrarian_organisation,contrarian,denialist,utterances,id_T,id_U,word_count,global_key
0,108,House,Committee on Energy and Commerce,Energy and Commerce,2003,2003-07-08,The Clear Skies Initiative: A Multipollutant A...,108hhrg88427,"Holmstead, Hon. Jeffrey, Assistant Administrat...",Holmstead,...,Government Officials,0,,Contrarian,Other contrarian,It really is an honor to be here today and to ...,1,2,53,108hhrg88427_T_1__U_2_
1,108,House,Committee on Energy and Commerce,Energy and Commerce,2003,2003-07-08,The Clear Skies Initiative: A Multipollutant A...,108hhrg88427,"Holmstead, Hon. Jeffrey, Assistant Administrat...",Holmstead,...,Government Officials,0,,Contrarian,Other contrarian,I have been looking forward to this opportunit...,1,3,81,108hhrg88427_T_1__U_3_
2,108,House,Committee on Energy and Commerce,Energy and Commerce,2003,2003-07-08,The Clear Skies Initiative: A Multipollutant A...,108hhrg88427,"Holmstead, Hon. Jeffrey, Assistant Administrat...",Holmstead,...,Government Officials,0,,Contrarian,Other contrarian,"As a number of you have mentioned, the air in ...",1,4,76,108hhrg88427_T_1__U_4_


In [8]:
# Prepare contrarian utterances for export to labelbox:
# keep only the utterance text and its key, expose the text as "row_data",
# and tag every row with the constant media type "TEXT".
# (The former "value" column was dropped again by the column selection right
# after it was created, so it is no longer created at all.)
con = (
    con[["utterances", "global_key"]]
    .rename(columns={"utterances": "row_data"})
    .assign(media_type="TEXT")
    .reset_index(drop=True)
)
con.head()

Unnamed: 0,row_data,global_key,media_type
0,It really is an honor to be here today and to ...,108hhrg88427_T_1__U_2_,TEXT
1,I have been looking forward to this opportunit...,108hhrg88427_T_1__U_3_,TEXT
2,"As a number of you have mentioned, the air in ...",108hhrg88427_T_1__U_4_,TEXT
3,"Over the last 30 years, as we have implemented...",108hhrg88427_T_1__U_5_,TEXT
4,A lot of you have mentioned concern about natu...,108hhrg88427_T_1__U_6_,TEXT


In [9]:
# Convert the frame into a list with one dict per row, then show the first two
label_dicts = con.to_dict(orient="records")
label_dicts[:2]

[{'row_data': 'It really is an honor to be here today and to appear in front of you again. I had planned to begin by singing a rousing rendition of Happy Birthday to Mr. Dingell. But now that Chairman Tauzin has already recognized this day, I think I will defer in all of our interests.',
  'global_key': '108hhrg88427_T_1__U_2_',
  'media_type': 'TEXT'},
 {'row_data': "I have been looking forward to this opportunity for quite some time. And at your invitation, I am going to depart from custom a little bit, and rather than reading a statement, what I would like to do is just give a relatively brief presentation and go through some slides in the hope that we can collectively understand some of these issues a little bit better. And I do hope, in particular, that I can begin to overcome Mr. Dingell's skepticism.",
  'global_key': '108hhrg88427_T_1__U_3_',
  'media_type': 'TEXT'}]

In [10]:
# Disabled upload step: creates a Labelbox dataset and sends `label_dicts`
# to it as data rows. Uncomment to run.
#
# NOTE(security): never paste the API key into the notebook. Read it from the
# LABELBOX_API_KEY environment variable instead, and revoke any key that has
# ever been committed in plain text.

# import os
# from labelbox import Client

# client = Client(api_key=os.environ["LABELBOX_API_KEY"])

# dataset = client.create_dataset(name="Cap-and-trade contrarian testimonies")

# task = dataset.create_data_rows(label_dicts)
# task.wait_till_done()
# print(task.errors)