<a href="https://colab.research.google.com/github/KatBCN/PODS-Project/blob/main/MissingActionCodes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import datetime

In [2]:
# Load the raw bill-action records straight from the project's GitHub repository
dfRaw = pd.read_csv(
    "https://raw.githubusercontent.com/KatBCN/PODS-Project/Irene/obtain_data/data/117Congress_BillActions_RAW.csv"
)

# Load the tab-separated action code dictionary, keep only the code and
# action columns, and rename them to match the column names used in dfRaw
actionCode_df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/KatBCN/PODS-Project/Irene/obtain_data/data/actionCode_dict.csv",
        sep='\t',
    )
    .loc[:, ['Code', 'Action']]
    .rename(columns={'Code': 'actionCode', 'Action': 'actionName'})
)

In [3]:
# Show shape of raw data frame
dfRaw.shape

(33020, 11)

In [4]:
# Show column titles and types
dfRaw.dtypes

billTitle            object
billNumber            int64
billType             object
congress              int64
fullDate             object
actionCode           object
actionName           object
type                 object
sourceSystem/name    object
text                 object
billOriginalTitle    object
dtype: object

In [5]:
# Assign data types: bill number and congress are identifiers (not quantities),
# so they are stored as objects; fullDate is parsed into a real datetime.
# The datetime unit is spelled out because pandas >= 2.0 raises a TypeError
# for the unit-less 'datetime64' dtype in astype().
dfRaw = dfRaw.astype({'billNumber': object, 'congress': object, 'fullDate': 'datetime64[ns]'})

In [6]:
# Count fully duplicated rows (0 means every row is unique)
dfRaw.duplicated().sum()

0

In [7]:
# Show first 5 rows of data
dfRaw.head(5)

Unnamed: 0,billTitle,billNumber,billType,congress,fullDate,actionCode,actionName,type,sourceSystem/name,text,billOriginalTitle
0,Proposing an amendment to the Constitution of ...,1,HJRES,117,2021-01-04,1000,Introduced in House,IntroReferral,Library of Congress,Introduced in House,Proposing an amendment to the Constitution of ...
1,Proposing an amendment to the Constitution of ...,1,HJRES,117,2021-01-04,H11100,Referred to the Committee,IntroReferral,House floor actions,Referred to the House Committee on the Judiciary.,Proposing an amendment to the Constitution of ...
2,Proposing an amendment to the Constitution of ...,1,HJRES,117,2021-01-04,Intro-H,,IntroReferral,Library of Congress,Introduced in House,Proposing an amendment to the Constitution of ...
3,Proposing an amendment to the Constitution of ...,1,HJRES,117,2021-03-04,,,Committee,House committee actions,Referred to the Subcommittee on the Constituti...,Proposing an amendment to the Constitution of ...
4,Expressing support for designation of Septembe...,10,HJRES,117,2021-01-04,1000,Introduced in House,IntroReferral,Library of Congress,Introduced in House,Expressing support for designation of Septembe...


In [8]:
# Create function to view summary statistics of each variable.
def mySummary(df):
    """Print the ``describe()`` summary of every column of a data frame.

    For each column label in ``df.columns`` a blank line and the label are
    printed, followed by the output of ``df[label].describe()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised one at a time.

    Returns
    -------
    None
        The summaries are written to standard output only.
    """
    for column in df.columns:
        # Format the label instead of concatenating it, so non-string
        # column labels (e.g. integers) do not raise a TypeError.
        print(f"\n{column}")
        print(df[column].describe())

In [9]:
mySummary(dfRaw)


billTitle
count                                                 33020
unique                                                 7675
top       National Defense Authorization Act for Fiscal ...
freq                                                     43
Name: billTitle, dtype: object

billNumber
count     33020
unique     6065
top         610
freq         41
Name: billNumber, dtype: int64

billType
count     33020
unique        4
top          HR
freq      24976
Name: billType, dtype: object

congress
count     33020
unique        1
top         117
freq      33020
Name: congress, dtype: int64

fullDate
count                   33020
unique                   2706
top       2021-01-28 00:00:00
freq                      587
first     2021-01-03 00:00:00
last      2021-12-10 00:00:00
Name: fullDate, dtype: object

actionCode
count     25921
unique       62
top        1000
freq       6132
Name: actionCode, dtype: object

actionName
count                   18965
unique                     30
top 

  """


In [10]:
# Create variable of unique actionCodes
actionCodes = dfRaw['actionCode'].unique()
actionCodes

array(['1000', 'H11100', 'Intro-H', nan, 'H1L210', 'H1L220', 'H30000',
       'H8D000', 'H35000', '8000', 'H37100', 'H38310', 'B00100', '5500',
       'H12300', 'H30200', '14500', '17000', '28000', 'E20000', '36000',
       'E30000', 'E40000', '5000', 'H12200', 'H12410', 'H36200', 'H8A000',
       'H36210', 'H38900', 'H30300', 'H37220', 'H37300', 'H1B000', '9000',
       'H38800', '19500', 'H41610', 'H41931', 'H12100', 'H40150',
       'H41400', 'H12440', 'H40110', 'H40140', 'H11210', '14000',
       'H36600', 'H36610', 'H17000', 'H12210', 'H12420', 'H30800',
       'H41930', 'H82000', 'H38400', 'H38410', 'H40130', '10000',
       'H14000', 'H15000', '20500', '14900'], dtype=object)

In [11]:
# Show number of unique actionCodes
actionCodes.shape

(63,)

In [12]:
# Create variable of unique actionNames
actionNames = dfRaw['actionName'].unique()
actionNames

array(['Introduced in House', 'Referred to the Committee', nan,
       'Rule provides for consideration of',
       'Rule passed/agreed in House', 'Consideration by House', 'DEBATE',
       'The previous question was ordered pursuant to the rule',
       'Passed/agreed to in House', 'Motion To Reconsider Results',
       'Sponsor introductory remarks on measure',
       'House committee discharged', 'Committee discharged',
       'Senate committee discharged', 'Passed/agreed to in Senate',
       'Presented to President', 'Became Public Law',
       'Signed by President', 'Became Public Law No: 114-47',
       'Reported to House', 'Committee reported',
       'Union Calendar assignment',
       'Final Passage Under Suspension of the Rules Results',
       'Failed of passage/not agreed to in House',
       'Committee report of an original measure', 'Reported to Senate',
       'Motion to Discharge Committee', 'Introduced in Senate',
       'Received in the House', 'Held at the desk',
  

In [13]:
# Show number of unique actionNames
actionNames.shape

(31,)

In order to fill in missing action codes and maintain consistency, we will reference the websites: https://www.congress.gov/help/field-values/action-codes and https://github.com/usgpo/bill-status/blob/master/BILLSTATUS-XML_User_User-Guide.md#3-action-code-element-possible-values

In [14]:
# Create dictionary of published action codes (code -> action name)
actionCode_dict = {
    code: name
    for code, name in zip(actionCode_df['actionCode'], actionCode_df['actionName'])
}
len(actionCode_dict)

93

In [15]:
# Display action code dictionary from published sources
actionCode_dict

{'1000': 'Introduced in House',
 '10000': 'Introduced in Senate',
 '11000': 'Referred to Senate committee',
 '12000': 'Referred to Senate subcommittee',
 '13000': 'Senate committee/subcommittee actions',
 '13100': 'Senate committee/subcommittee hearings',
 '13200': 'Senate committee/subcommittee markups',
 '13900': 'Senate committee time extension',
 '14000': 'Reported to Senate',
 '14500': 'Senate committee discharged',
 '14900': 'Senate committee report filed after reporting',
 '16000': 'Senate floor actions',
 '17000': 'Passed/agreed to in Senate',
 '18000': 'Failed of passage/not agreed to in Senate',
 '19000': 'Resolving differences -- House actions',
 '2000': 'Referred to House committee',
 '20000': 'Resolving differences -- Senate actions',
 '20800': 'Conference committee actions',
 '20900': 'Conference report filed',
 '21000': 'Conference report agreed to in House',
 '22000': 'Conference report disagreed to in House',
 '23000': 'Conference report agreed to in Senate',
 '24000':

Dictionaries of Action Codes should be built per source system because multiple sources may use the same codes for different types of actions. The systems are not coordinated with each other.

In [16]:
# Display unique source system names
dfRaw['sourceSystem/name'].unique()

array(['Library of Congress', 'House floor actions',
       'House committee actions', 'Senate'], dtype=object)

In [17]:
# One subset of the raw data per source system (same rows and index as dfRaw)
dfLOC = dfRaw[dfRaw['sourceSystem/name'].eq('Library of Congress')]
dfHF = dfRaw[dfRaw['sourceSystem/name'].eq('House floor actions')]
dfHC = dfRaw[dfRaw['sourceSystem/name'].eq('House committee actions')]
dfS = dfRaw[dfRaw['sourceSystem/name'].eq('Senate')]

In [18]:
# Build one actionCode -> actionName dictionary per source system from the
# codes and names that actually appear in the data. When a code occurs on
# several rows, the name from the last of those rows is the one kept.
codeNameLOC, codeNameHF, codeNameHC, codeNameS = (
    dict(zip(subset['actionCode'], subset['actionName']))
    for subset in (dfLOC, dfHF, dfHC, dfS)
)

Action Code Dictionary for Library of Congress:

In [19]:
codeNameLOC

{'1000': 'Introduced in House',
 '10000': 'Introduced in Senate',
 '14000': 'Reported to Senate',
 '14500': 'Senate committee discharged',
 '14900': 'Senate committee report filed after reporting',
 '17000': 'Passed/agreed to in Senate',
 '19500': nan,
 '20500': nan,
 '28000': 'Presented to President',
 '36000': 'Became Public Law',
 '5000': 'Reported to House',
 '5500': 'House committee discharged',
 '8000': 'Passed/agreed to in House',
 '9000': 'Failed of passage/not agreed to in House',
 'B00100': 'Sponsor introductory remarks on measure',
 'E30000': 'Signed by President',
 'E40000': 'Became Public Law No: 114-47',
 'Intro-H': nan}

Action Code Dictionary for House of Representatives Floor Actions:

In [20]:
codeNameHF

{'E20000': 'Presented to President',
 'H11100': 'Referred to the Committee',
 'H11210': nan,
 'H12100': 'Committee report of an original measure',
 'H12200': 'Committee reported',
 'H12210': nan,
 'H12300': 'Committee discharged',
 'H12410': 'Union Calendar assignment',
 'H12420': nan,
 'H12440': nan,
 'H14000': 'Received in the House',
 'H15000': 'Held at the desk',
 'H17000': 'Motion to Discharge Committee',
 'H1B000': nan,
 'H1L210': 'Rule provides for consideration of',
 'H1L220': 'Rule passed/agreed in House',
 'H30000': 'Consideration by House',
 'H30200': nan,
 'H30300': nan,
 'H30800': nan,
 'H35000': 'The previous question was ordered pursuant to the rule',
 'H36200': nan,
 'H36210': nan,
 'H36600': nan,
 'H36610': nan,
 'H37100': nan,
 'H37220': nan,
 'H37300': 'Final Passage Under Suspension of the Rules Results',
 'H38310': 'Motion To Reconsider Results',
 'H38400': nan,
 'H38410': nan,
 'H38800': nan,
 'H38900': nan,
 'H40110': nan,
 'H40130': nan,
 'H40140': nan,
 'H40150

Action Code Dictionary for House of Representatives Committee Actions:

In [21]:
codeNameHC

{nan: nan}

Action Code Dictionary for Senate:

In [22]:
codeNameS

{nan: nan}

In [23]:
# Show number of rows which are missing actionCodes in Raw Data
dfRaw.actionCode.isna().sum()

7099

None of the rows from the House committee actions and Senate source systems include action codes in the XML files that were downloaded from the bulk data repository.

In [24]:
# Keep only the rows of the raw data whose actionCode is missing
df = dfRaw.loc[dfRaw['actionCode'].isna()]
df.shape

(7099, 11)

In [25]:
df.head(5)

Unnamed: 0,billTitle,billNumber,billType,congress,fullDate,actionCode,actionName,type,sourceSystem/name,text,billOriginalTitle
3,Proposing an amendment to the Constitution of ...,1,HJRES,117,2021-03-04,,,Committee,House committee actions,Referred to the Subcommittee on the Constituti...,Proposing an amendment to the Constitution of ...
10,Proposing an amendment to the Constitution of ...,11,HJRES,117,2021-03-04,,,Committee,House committee actions,Referred to the Subcommittee on the Constituti...,Proposing an amendment to the Constitution of ...
14,Proposing an amendment to the Constitution of ...,12,HJRES,117,2021-03-04,,,Committee,House committee actions,Referred to the Subcommittee on the Constituti...,Proposing an amendment to the Constitution of ...
18,Proposing a balanced budget amendment to the C...,13,HJRES,117,2021-03-04,,,Committee,House committee actions,Referred to the Subcommittee on the Constituti...,Proposing a balanced budget amendment to the C...
22,Proposing an amendment to the Constitution of ...,14,HJRES,117,2021-03-04,,,Committee,House committee actions,Referred to the Subcommittee on the Constituti...,Proposing an amendment to the Constitution of ...


In [26]:
# Summary of data frame of rows with missing action codes
mySummary(df)


billTitle
count                                              7099
unique                                             5605
top       Promoting Physical Activity for Americans Act
freq                                                  7
Name: billTitle, dtype: object

billNumber
count     7099
unique    4376
top        610
freq         8
Name: billNumber, dtype: int64

billType
count     7099
unique       4
top          S
freq      3540
Name: billType, dtype: object

congress
count     7099
unique       1
top        117
freq      7099
Name: congress, dtype: int64

fullDate
count                    7099
unique                    238
top       2021-06-24 00:00:00
freq                      150
first     2021-01-03 00:00:00
last      2021-12-10 00:00:00
Name: fullDate, dtype: object

actionCode
count       0
unique      0
top       NaN
freq      NaN
Name: actionCode, dtype: object

actionName
count       0
unique      0
top       NaN
freq      NaN
Name: actionName, dtype: object

type
count 

  """


In [27]:

def fillCode(row):
  """
  Infer an actionCode for a single bill-action row from its source system,
  action text and action type.

  This can be used with
  df['actionCode'] = df.apply (lambda row: fillCode(row) if pd.isnull(row['actionCode']) else row['actionCode'], axis=1)

  This has only been tested on rows with null actionCodes.
  If rows with non-null actionCodes are used, this will not work as intended.

  After filling the actionCodes, it is recommended to apply a dictionary
  of actionCodes and actionNames to complete the data.

  Parameters
  ----------
  row : mapping-like (e.g. a pandas Series for one data frame row)
      Must provide 'sourceSystem/name' and 'actionCode'. For rows from the
      "House committee actions" and "Senate" source systems, 'text' and
      'type' are read as well.

  Returns
  -------
  str or the row's existing actionCode
      The inferred code as a string. When the source system is not covered,
      or no rule matches, the row's current actionCode is returned unchanged
      (so a missing code stays missing instead of becoming None).
  """
  source = row['sourceSystem/name']
  if source not in ("House committee actions", "Senate"):
    return row['actionCode']  # do nothing if source system doesn't match rules

  # Lower-case the text once. A missing (non-string) text matches no text
  # rule, so such rows are classified by their type alone.
  raw_text = row['text']
  text = raw_text.lower() if isinstance(raw_text, str) else ''

  if source == "House committee actions":
    if "referred to the subcommittee" in text:
      return '3000'  # Referred to House subcommittee
    if "markup" in text or "mark-up" in text:
      return '4200'  # House committee/subcommittee markups
    if "hearings" in text:
      return '4100'  # House committee/subcommittee hearings
    if "ordered to be reported" in text:
      return '4000'  # House committee/subcommittee actions
    # Generic rule based on type for text not matching earlier rules
    if row['type'] == "Committee":
      return '4000'  # House committee/subcommittee actions
    return row['actionCode']  # no House committee rule matched

  # Setting rules for actionCodes related to the Senate
  if "received in the senate" in text or "introduced in the senate" in text:
    return '10000'  # Introduced in Senate
  if "referred to the committee" in text:
    return '11000'  # Referred to Senate committee
  if "passed senate" in text:
    return '17000'  # Passed/agreed to in Senate
  if all(word in text for word in ("committee", "filed", "report")):
    return '14900'  # Senate committee report filed after reporting

  # Generic rules based on type for text not matching earlier rules
  action_type = row['type']
  if action_type == "IntroReferral":
    return '11000'  # Referred to Senate committee
  if action_type == "ResolvingDifferences":
    return '20000'  # Resolving differences -- Senate actions
  if action_type == "Calendars":
    return 'SenateCal'  # need to define a code for Senate Calendar and add to dictionary
  if action_type == "Committee":
    if "hearings" in text:
      return '13100'  # Senate committee/subcommittee hearings
    if "markup" in text or "mark-up" in text:
      return '13200'  # Senate committee/subcommittee markups
    return '13000'  # Senate committee/subcommittee actions
  if action_type == "Floor":
    if "message on senate action sent to the house" in text:
      return '5000'  # Reported to House
    if "message on house action received in senate" in text:
      return '14000'  # Reported to Senate
    return '16000'  # Senate floor action
  return row['actionCode']  # no Senate rule matched



In [28]:
# Work on a copy so the original raw data stays untouched
df = dfRaw.copy()
# Row by row: keep an existing actionCode as it is and call fillCode only
# for rows whose actionCode is null (fillCode is only meant for those rows).
df['actionCode'] = df.apply(
    lambda row: row['actionCode'] if not pd.isnull(row['actionCode']) else fillCode(row),
    axis=1,
)

In [29]:
# Number of null actionCodes in original data
dfRaw.actionCode.isna().sum()

7099

In [30]:
# Number of null actionCodes in filled data
df.actionCode.isna().sum()

0

The next step should be to make sure that every actionCode has a reasonable actionName, and to standardize the names to those available on this website when possible: https://www.congress.gov/help/field-values/action-codes

In [31]:
pd.set_option('display.max_rows', None)
# Create a data frame showing the frequency of actionCodes.
# The count column is named directly via to_frame(name=...): renaming a
# column called "actionCode" only works on older pandas, because since
# pandas 2.0 the value_counts() result is named "count" instead.
acFreq = df['actionCode'].value_counts().to_frame(name='Frequency')
# The index holds the actionCodes; look up the published name for each one
# (codes missing from actionCode_dict get NaN).
acFreq['actionName'] = acFreq.index.map(actionCode_dict)
acFreq

Unnamed: 0,Frequency,actionName
1000,6132,Introduced in House
Intro-H,6118,
H11100,6109,Referred to the Committee
10000,3735,Introduced in Senate
11000,3348,Referred to Senate committee
3000,2951,Referred to House subcommittee
H30000,485,Consideration by House
H8D000,392,DEBATE
8000,337,Passed/agreed to in House
H30300,281,


In [83]:
# Hand-written action names for codes that appear in the filled data.
# The frequency table above shows 'Intro-H' and 'H30300' with no name from
# actionCode_dict; presumably the other keys are unnamed there as well
# (TODO confirm against the full frequency table).
# 'SenateCal' is the placeholder code returned by fillCode for Senate rows of
# type "Calendars". Entries written as actionCode_dict[...] reuse the
# published name of a closely related code.
# The trailing comment on each entry notes the kind of action the code belongs to.
new_dict = {'Intro-H' : 'Introduced in House', # IntroReferral
            'H30300' : 'Motion to suspend rules and pass bill', # House Floor
            'H37220' : 'Further proceedings postponed', # House Floor
            'H1B000' : 'Proceedings are considered vacated', # House Floor (code appears in the House floor actions dictionary)
            'SenateCal' : 'Placed on Senate Legislative Calendar', # Senate Calendars - placeholder code returned by fillCode
            'E40000' : 'Became Public Law', # President - generalizes the raw name 'Became Public Law No: 114-47'
            'H37100' : 'Passed/agreed to in House', # House Floor
            'H36210' : 'Motion to recommit Failed', # House Floor
            'H36200' : 'Motion to recommit to Committee', # House Floor - Should occur prior to H36210
            'H8A000' : "Motion to recommit ordered", # House Floor
            'H38800' : 'Title of measure amended', # House Floor
            'H38900' : 'Clerk technical correction', # House Floor
            '19500' : actionCode_dict['19000'], # House Floor - same as 19000, but possibly after senate amendment
            'H41931' : 'Motion to Reconsider Agreed', # House Resolving Differences
            'H41610' : actionCode_dict['19000'], # House Resolving Differences - same as 19000, but possibly after senate amendment
            'H30200' : 'Motion to Consider', # House Floor
            'H12420' : 'Placed on House Calendar', # House Calendars
            'H11210' : 'House committee time extension', # House Intro Referral - same as 4900
            'H82000' : 'Motion to table Motion to Reconsider', # House Resolving Differences
            'H36610' : 'Motion to table Motion to Reconsider Agreed', # House Floor
            'H36600' : 'Motion to table Motion to Reconsider', # House Floor - same as H82000
            '20500' : actionCode_dict['20000'], # Senate Resolving Differences - same as 20000, but possibly after house amendment
            'H41400' : actionCode_dict['H35000']} # House Resolving Differences - same as H35000
            





In [82]:
# Show the full, untruncated text of every action coded H35000
pd.set_option('display.max_colwidth', None)
view = df[df['actionCode'].eq('H35000')]
view['text']

36                             The previous question was ordered pursuant to the rule.
280                            The previous question was ordered pursuant to the rule.
614                            The previous question was ordered pursuant to the rule.
1209                           The previous question was ordered pursuant to the rule.
1271                           The previous question was ordered pursuant to the rule.
1726                           The previous question was ordered pursuant to the rule.
1926                           The previous question was ordered pursuant to the rule.
2022                           The previous question was ordered pursuant to the rule.
2545                           The previous question was ordered pursuant to the rule.
3263                           The previous question was ordered pursuant to the rule.
3408                           The previous question was ordered pursuant to the rule.
3506                           The previous