# Project 4: Team 7
## Predicting Congressional Bill Passage
### Extract,Transform, and Load: Congress data for the last 10 years- 113th Congress to 117th Congress

# Import dependecies and read in data:

In [1]:
# Import Dependencies:
import pandas as pd
import glob as glob
import numpy as np

In [2]:
# Read in all .csv files from congress.gov, could only extract 5,000 observations at a time, so there are 15 .csv files to read in and combine into a single df:
# Get CSV files list from a folder
path = "../Resources/all_bills_csvs"
csv_files = glob.glob(path + "/*.csv")

# Read each CSV file into DataFrame
# This creates a list of dataframes
df_list = (pd.read_csv(file) for file in csv_files)

# Concatenate all DataFrames
master_df = pd.concat(df_list, ignore_index=True)

# Save full file to Resources:
# master_df.to_csv('../Resources/all_bills.csv', index=False)

# Read in all_bills from S3 bucket on AWS: 
# master_df = pd.read_csv('https://project-4-team7.s3.ca-central-1.amazonaws.com/all_bills.csv', index_col=[0])
# Split data into raw dfs for the House and Senata data:
df_house = master_df[master_df['Legislation Number'].str.contains("H.J|H.R.")==True]
df_house.reset_index(drop=True)
df_senate = master_df[master_df['Legislation Number'].str.contains("S.J|S.")==True]
df_senate.reset_index(drop=True)

# Select only the columns that will be needed:
df_house = df_house[['Legislation Number', 'Congress', 'Title', 'Sponsor',
       'Date of Introduction', 'Number of Cosponsors', 'Committees',
       'Latest Action', 'Latest Action Date', 'Subject']]
df_senate = df_senate[['Legislation Number', 'Congress', 'Title', 'Sponsor',
       'Date of Introduction', 'Number of Cosponsors', 'Committees',
       'Latest Action', 'Latest Action Date', 'Subject']]

# # Save raw house and senate files to .csv:
# df_house.to_csv('../Resources/house_all.csv', index=False)
# df_senate.to_csv('../Resources/senate_all.csv', index=False)

# # Check master df:
master_df.head()

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,Legislation Number,URL,Congress,Title,Amends Bill,Sponsor,Date Offered,Date of Introduction,Number of Cosponsors,Date Submitted,...,Cosponsor.380,Cosponsor.381,Cosponsor.382,Cosponsor.383,Cosponsor.384,Cosponsor.385,Subject.453,Subject.454,Subject.455,Subject.456
0,S. 3,https://www.congress.gov/bill/116th-congress/s...,116th Congress (2019-2020),Keeping Health Insurance Affordable Act of 2019,,"Cardin, Benjamin L. [Sen.-D-MD]",,1/3/19,0,,...,,,,,,,,,,
1,S. 4,https://www.congress.gov/bill/116th-congress/s...,116th Congress (2019-2020),LIFT (Livable Incomes for Families Today) the ...,,"Harris, Kamala D. [Sen.-D-CA]",,1/3/19,0,,...,,,,,,,,,,
2,S. 20,https://www.congress.gov/bill/116th-congress/s...,116th Congress (2019-2020),Presidential Tax Transparency Act,,"Wyden, Ron [Sen.-D-OR]",,1/3/19,22,,...,,,,,,,,,,
3,S. 22,https://www.congress.gov/bill/116th-congress/s...,116th Congress (2019-2020),Medicare Dental Benefit Act of 2019,,"Cardin, Benjamin L. [Sen.-D-MD]",,1/3/19,4,,...,,,,,,,,,,
4,S. 23,https://www.congress.gov/bill/116th-congress/s...,116th Congress (2019-2020),National Commission on the Federal Response to...,,"Gillibrand, Kirsten E. [Sen.-D-NY]",,1/3/19,5,,...,,,,,,,,,,


### Cleaning Cosponsor data:

In [4]:
# Get list of cosponsor columns
cosponsor_cols = [col for col in master_df.columns if 'Cosponsor' in col]
cosponsor_cols.remove('Number of Cosponsors')
print(cosponsor_cols)
# Create new df with cosponsor columns
cosponsors_df = master_df[cosponsor_cols]
# Add bill and congress for identification, number of cosponsors to ensure party counts total correct
cosponsors_df.insert(0, "Legislation Number", master_df['Legislation Number'])
cosponsors_df.insert(1, "Congress", master_df['Congress'])
cosponsors_df.insert(2, "Number of Cosponsors", master_df['Number of Cosponsors'])
cosponsors_df.head()

['Cosponsor', 'Cosponsor.1', 'Cosponsor.2', 'Cosponsor.3', 'Cosponsor.4', 'Cosponsor.5', 'Cosponsor.6', 'Cosponsor.7', 'Cosponsor.8', 'Cosponsor.9', 'Cosponsor.10', 'Cosponsor.11', 'Cosponsor.12', 'Cosponsor.13', 'Cosponsor.14', 'Cosponsor.15', 'Cosponsor.16', 'Cosponsor.17', 'Cosponsor.18', 'Cosponsor.19', 'Cosponsor.20', 'Cosponsor.21', 'Cosponsor.22', 'Cosponsor.23', 'Cosponsor.24', 'Cosponsor.25', 'Cosponsor.26', 'Cosponsor.27', 'Cosponsor.28', 'Cosponsor.29', 'Cosponsor.30', 'Cosponsor.31', 'Cosponsor.32', 'Cosponsor.33', 'Cosponsor.34', 'Cosponsor.35', 'Cosponsor.36', 'Cosponsor.37', 'Cosponsor.38', 'Cosponsor.39', 'Cosponsor.40', 'Cosponsor.41', 'Cosponsor.42', 'Cosponsor.43', 'Cosponsor.44', 'Cosponsor.45', 'Cosponsor.46', 'Cosponsor.47', 'Cosponsor.48', 'Cosponsor.49', 'Cosponsor.50', 'Cosponsor.51', 'Cosponsor.52', 'Cosponsor.53', 'Cosponsor.54', 'Cosponsor.55', 'Cosponsor.56', 'Cosponsor.57', 'Cosponsor.58', 'Cosponsor.59', 'Cosponsor.60', 'Cosponsor.61', 'Cosponsor.62', 'Co

Unnamed: 0,Legislation Number,Congress,Number of Cosponsors,Cosponsor,Cosponsor.1,Cosponsor.2,Cosponsor.3,Cosponsor.4,Cosponsor.5,Cosponsor.6,...,Cosponsor.376,Cosponsor.377,Cosponsor.378,Cosponsor.379,Cosponsor.380,Cosponsor.381,Cosponsor.382,Cosponsor.383,Cosponsor.384,Cosponsor.385
0,S. 3,116th Congress (2019-2020),0,,,,,,,,...,,,,,,,,,,
1,S. 4,116th Congress (2019-2020),0,,,,,,,,...,,,,,,,,,,
2,S. 20,116th Congress (2019-2020),22,"Klobuchar, Amy [Sen.-D-MN]","Leahy, Patrick J. [Sen.-D-VT]","Carper, Thomas R. [Sen.-D-DE]","Menendez, Robert [Sen.-D-NJ]","Cardin, Benjamin L. [Sen.-D-MD]","Whitehouse, Sheldon [Sen.-D-RI]","Bennet, Michael F. [Sen.-D-CO]",...,,,,,,,,,,
3,S. 22,116th Congress (2019-2020),4,"Blumenthal, Richard [Sen.-D-CT]","Merkley, Jeff [Sen.-D-OR]","Brown, Sherrod [Sen.-D-OH]","Van Hollen, Chris [Sen.-D-MD]",,,,...,,,,,,,,,,
4,S. 23,116th Congress (2019-2020),5,"Schumer, Charles E. [Sen.-D-NY]","Blumenthal, Richard [Sen.-D-CT]","Warren, Elizabeth [Sen.-D-MA]","Markey, Edward J. [Sen.-D-MA]","Harris, Kamala D. [Sen.-D-CA]",,,...,,,,,,,,,,


In [5]:
# Set up cosponsor by party and state count of cosponsor:
# Count cosponsor dems per row and add to df
cosponsor_dems = cosponsors_df.astype(str).apply(lambda x: x.str.contains('-D-')).sum(axis=1)
cosponsors_df['Cosponsor Dems'] = cosponsor_dems
# Count cosponsor Reps per row and add to df
cosponsor_reps = cosponsors_df.astype(str).apply(lambda x: x.str.contains('-R-')).sum(axis=1)
cosponsors_df['Cosponsor Reps'] = cosponsor_reps
# Count cosponsor Independent per row and add to df
cosponsor_ind = cosponsors_df.astype(str).apply(lambda x: x.str.contains('-I-')).sum(axis=1)
cosponsors_df['Cosponsor Ind'] = cosponsor_ind
# get state for each cosponsor
for col in cosponsor_cols:
    cosponsors_df[col].update(cosponsors_df[col].str.split('-').str[2])

# remove any remaining brackets
cosponsors_df[cosponsor_cols] = cosponsors_df[cosponsor_cols].replace({']':''}, regex=True)

#get count of unique states
cosponsor_states = cosponsors_df[cosponsor_cols].nunique(axis=1)
cosponsors_df['Cosponsor States'] = cosponsor_states

# Inspect:
cosponsors_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation:

Unnamed: 0,Legislation Number,Congress,Number of Cosponsors,Cosponsor,Cosponsor.1,Cosponsor.2,Cosponsor.3,Cosponsor.4,Cosponsor.5,Cosponsor.6,...,Cosponsor.380,Cosponsor.381,Cosponsor.382,Cosponsor.383,Cosponsor.384,Cosponsor.385,Cosponsor Dems,Cosponsor Reps,Cosponsor Ind,Cosponsor States
0,S. 3,116th Congress (2019-2020),0,,,,,,,,...,,,,,,,0,0,0,0
1,S. 4,116th Congress (2019-2020),0,,,,,,,,...,,,,,,,0,0,0,0
2,S. 20,116th Congress (2019-2020),22,MN,VT,DE,NJ,MD,RI,CO,...,,,,,,,22,0,0,16
3,S. 22,116th Congress (2019-2020),4,CT,OR,OH,MD,,,,...,,,,,,,4,0,0,4
4,S. 23,116th Congress (2019-2020),5,NY,CT,MA,MA,CA,,,...,,,,,,,5,0,0,4


In [6]:
# Create clean df with cosponsor counts
clean_cosponsor_df = cosponsors_df[['Legislation Number','Congress','Number of Cosponsors','Cosponsor Dems','Cosponsor Reps','Cosponsor Ind', 'Cosponsor States']].reset_index(drop=True)
# Join clean cosponsor df with house df
house_df = pd.merge(clean_cosponsor_df, df_house, how='inner', on=['Legislation Number', 'Congress'])
house_df = house_df.drop(columns='Number of Cosponsors_y').rename(columns={'Number of Cosponsors_x': 'Number of Cosponsors'})
# Join clean cosponsor df with senate df
senate_df = pd.merge(clean_cosponsor_df, df_senate, how='inner', on=['Legislation Number', 'Congress'])
senate_df = senate_df.drop(columns='Number of Cosponsors_y').rename(columns={'Number of Cosponsors_x': 'Number of Cosponsors'})
# Concat house and senate dfs to finish cleaning
frames = [house_df, senate_df]
congress_df = pd.concat(frames).reset_index(drop=True)
congress_df.head()

Unnamed: 0,Legislation Number,Congress,Number of Cosponsors,Cosponsor Dems,Cosponsor Reps,Cosponsor Ind,Cosponsor States,Title,Sponsor,Date of Introduction,Committees,Latest Action,Latest Action Date,Subject
0,H.R. 1,117th Congress (2021-2022),222,222,0,0,41,For the People Act of 2021,"Sarbanes, John P. [Rep.-D-MD-3]",1/4/21,"House - House Administration, Intelligence (Pe...",Received in the Senate.,3/11/21,"Broadcasting, cable, digital technologies"
1,H.R. 3,117th Congress (2021-2022),92,92,0,0,33,Elijah E. Cummings Lower Drug Costs Now Act,"Pallone, Frank, Jr. [Rep.-D-NJ-6]",4/22/21,"House - Energy and Commerce, Ways and Means, E...",Referred to the Subcommittee on Oversight and ...,4/27/21,Health
2,H.R. 4,117th Congress (2021-2022),223,223,0,0,43,John R. Lewis Voting Rights Advancement Act of...,"Sewell, Terri A. [Rep.-D-AL-7]",8/17/21,House - Judiciary,Received in the Senate.,9/14/21,Government Operations and Politics
3,H.R. 5,117th Congress (2021-2022),224,224,0,0,43,Equality Act,"Cicilline, David N. [Rep.-D-RI-1]",2/18/21,"House - Judiciary, Education and Labor, Financ...",Committee on the Judiciary. Hearings held.,3/17/21,"Civil Rights and Liberties, Minority Issues"
4,H.R. 6,117th Congress (2021-2022),175,175,0,0,37,American Dream and Promise Act of 2021,"Roybal-Allard, Lucille [Rep.-D-CA-40]",3/3/21,"House - Judiciary, Education and Labor | Senat...",Committee on the Judiciary. Hearings held.,6/15/21,Immigration


In [7]:
# Set up Bill Type and Congress columns:
# Strip numbers and change Legislation Number to Bill Type
congress_df['Legislation Number'] = congress_df['Legislation Number'].str.replace('\d+', '')
congress_df = congress_df.rename(columns = {"Legislation Number": "Bill Type"})
congress_df['Bill Type'] = congress_df['Bill Type'].str.replace('Res. ', '')
congress_df['Bill Type'] = congress_df['Bill Type'].str.strip()

# Get number of congress only, column 2
congress_df['Congress'] = congress_df['Congress'].str[:3]
# Cast as int64:
congress_df['Congress'] = congress_df['Congress'].astype(int)


In [8]:
# Set up columns for sposor party, title and state
# extract party and state into new column for sponsor
new = congress_df["Sponsor"].str.split("[", n = 1, expand = True)
congress_df['Sponsor Split']= new[1]
congress_df.drop(columns =["Sponsor"], inplace = True)

# Sponsor title, sponsor state, sponsor party in new columns
new2 = congress_df["Sponsor Split"].str.split("-", n = 3, expand = True)
congress_df['Sponsor Title']= new2[0]
congress_df['Sponsor Party']= new2[1]
congress_df['Sponsor State']= new2[2]
congress_df = congress_df.drop(columns={'Sponsor Split'})
# Take out extra brackets in state column:
congress_df['Sponsor State'] = congress_df['Sponsor State'].replace({']':''}, regex=True)

# Create the month of bill introduction:
congress_df['Date of Introduction'] = pd.to_datetime(congress_df['Date of Introduction'])
congress_df['Month Introduced'] = pd.DatetimeIndex(congress_df['Date of Introduction']).month
congress_df = congress_df.drop(columns={'Date of Introduction'})

# Drop unneeded columns:
congress_df = congress_df.drop(columns={'Title', 'Latest Action Date'})

## Save whole congress dataset up to this point:

In [9]:
# Save whole cleaned dataset:
# congress_df.to_csv('../Resources/cleaned_congress.csv')

## Split the data by House and Senate:

In [10]:
# Split into house and senate dfs:
house_cleaned = congress_df[congress_df['Bill Type'].str.contains("H.J|H")==True]
senate_cleaned = congress_df[congress_df['Bill Type'].str.contains("S.J|S.")==True]
senate_cleaned = senate_cleaned.reset_index(drop=True)
house_cleaned.head()

Unnamed: 0,Bill Type,Congress,Number of Cosponsors,Cosponsor Dems,Cosponsor Reps,Cosponsor Ind,Cosponsor States,Committees,Latest Action,Subject,Sponsor Title,Sponsor Party,Sponsor State,Month Introduced
0,H.R.,117,222,222,0,0,41,"House - House Administration, Intelligence (Pe...",Received in the Senate.,"Broadcasting, cable, digital technologies",Rep.,D,MD,1
1,H.R.,117,92,92,0,0,33,"House - Energy and Commerce, Ways and Means, E...",Referred to the Subcommittee on Oversight and ...,Health,Rep.,D,NJ,4
2,H.R.,117,223,223,0,0,43,House - Judiciary,Received in the Senate.,Government Operations and Politics,Rep.,D,AL,8
3,H.R.,117,224,224,0,0,43,"House - Judiciary, Education and Labor, Financ...",Committee on the Judiciary. Hearings held.,"Civil Rights and Liberties, Minority Issues",Rep.,D,RI,2
4,H.R.,117,175,175,0,0,37,"House - Judiciary, Education and Labor | Senat...",Committee on the Judiciary. Hearings held.,Immigration,Rep.,D,CA,3


# House data cleaning:

### Committees:

In [11]:
# Committee column recoding to indicator variables:
# Create a list of committees for the House:
house_committees_lst = ["Agriculture", "Appropriations", "Armed Services", "Budget", "Education and the Workforce", "Energy and Commerce", "Ethics", "Financial Services", 
                        "Foreign Affairs", "Homeland Security", "House Administration", "Judiciary", "Natural Resources", 
                        "Oversight and Accountability", "Rules", "Science, Space, and Technology", "Small Business", "Transportation and Infrastructure", 
                        "Veterans' Affairs", "Ways and Means", "Intelligence", "Printing", "Taxation", "Library", "Economic"]
# Run a for loop to set each committee name to a new column and make a dummy var (case=False makes the str.contains case insensitive)):
for comm in house_committees_lst:
    house_cleaned[comm] = np.where(house_cleaned['Committees'].str.contains(comm, case=False), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


#### Subject:

In [12]:
# Subject column recoding to indicator variables:
# Create a df of the value counts of Subject:
value_counts_house_subject = house_cleaned['Subject'].value_counts(dropna=True, sort=True)
value_counts_house_subject_df = pd.DataFrame(value_counts_house_subject)
value_counts_house_subject_df = value_counts_house_subject_df.reset_index()
value_counts_house_subject_df.columns = ['unique_values', 'counts']
# Create a list of the Subjects with more than 200 appearances:
house_subject_df = value_counts_house_subject_df.loc[value_counts_house_subject_df["counts"]>200]
house_subject_lst = house_subject_df["unique_values"].tolist()
house_subject_lst.sort()
# Run a for loop to set each committee name to a new column and make a dummy var (case=False makes the str.contains case insensitive)):
for sub in house_subject_lst:
    house_cleaned[sub] = np.where(house_cleaned['Subject'].str.contains(sub, case=False), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [13]:
house_subject_lst

['Accounting and auditing',
 'Administrative law and regulatory procedures',
 'Administrative remedies',
 'Advisory bodies',
 'Agriculture and Food',
 'Appropriations',
 'Armed Forces and National Security',
 'Civil actions and liability',
 'Commerce',
 'Congress',
 'Congressional oversight',
 'Congressional tributes',
 'Crime and Law Enforcement',
 'Education',
 'Emergency Management',
 'Energy',
 'Environmental Protection',
 'Finance and Financial Sector',
 'Government Operations and Politics',
 'Health',
 'Housing and Community Development',
 'Immigration',
 'International Affairs',
 'Labor and Employment',
 'Native Americans',
 'Public Lands and Natural Resources',
 'Science, Technology, Communications',
 'Social Welfare',
 'Taxation',
 'Transportation and Public Works']

### Target variable for nn model

In [14]:
# Target for nn model- bill_passed
# Latest Action coding:
# Get "Became Public Law" and name it "bill_passed" set as dummy var:
house_cleaned["bill_passed"] = np.where(house_cleaned['Latest Action'].str.contains("Became Public Law", case=False), 1, 0)
print(house_cleaned["bill_passed"].value_counts())
print(f'For the 113th to 117th congress, the House of Reps passed {(round((1149/37905)*100,2))}% of laws introduced in the House.')

0    37905
1     1149
Name: bill_passed, dtype: int64
For the 113th to 117th congress, the House of Reps passed 3.03% of laws introduced in the House.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


## Save clean house dataset: 

In [15]:
# Save house cleaned dataset:
# house_cleaned.to_csv('../Resources/house_cleaned.csv', index=False)

# Senate Cleaning:

### Committees:

In [16]:
# Committee column recoding to indicator variables:
# Create a list of committees for the Senate:
senate_committees_lst = ["Agriculture, Nutrition, and Forestry", "Appropriations", "Armed Services", "Banking, Housing, and Urban Affairs", "Budget", 
                         "Commerce, Science, and Transportation", "Energy and Natural Resources", "Environment and Public Works", "Finance", 
                         "Foreign Relations", "Health, Education, Labor, and Pensions", "Homeland Security and Governmental Affairs","Judiciary", 
                         "Rules and Administration", "Small Business and Entrepreneurship", "Veterans Affairs", "International Narcotics Control", 
                         "Ethics", "Indian Affairs", "Intelligence", "Printing", "Taxation", "Library", "Economic"]
# Run a for loop to set each committee name to a new column and make a dummy var (case=False makes the str.contains case insensitive)):
for comm in senate_committees_lst:
    senate_cleaned[comm] = np.where(senate_cleaned['Committees'].str.contains(comm, case=False), 1, 0)


### Subject:

In [17]:
# Subject column recoding to indicator variables:
# Create a df of the value counts of Subject:
value_counts_senate_subject = senate_cleaned['Subject'].value_counts(dropna=True, sort=True)
value_counts_senate_subject_df = pd.DataFrame(value_counts_senate_subject)
value_counts_senate_subject_df = value_counts_senate_subject_df.reset_index()
value_counts_senate_subject_df.columns = ['unique_values', 'counts']
# Create a list of the Subjects with more than 200 appearances:
senate_subject_df = value_counts_senate_subject_df.loc[value_counts_senate_subject_df["counts"]>100]
senate_subject_lst = senate_subject_df["unique_values"].tolist()
senate_subject_lst.sort()

# Run a for loop to set each committee name to a new column and make a dummy var (case=False makes the str.contains case insensitive)):
for sub in senate_subject_lst:
    senate_cleaned[sub] = np.where(senate_cleaned['Subject'].str.contains(sub, case=False), 1, 0)

In [18]:
# Display the senate_subject_lst to copy later to the etl for the current bills in the etl_predict_currentbills nb:
senate_subject_lst

['Academic performance and assessments',
 'Accounting and auditing',
 'Administrative law and regulatory procedures',
 'Administrative remedies',
 'Advisory bodies',
 'Agriculture and Food',
 'Alternative and renewable resources',
 'Appropriations',
 'Armed Forces and National Security',
 'Civil actions and liability',
 'Commerce',
 'Congressional oversight',
 'Crime and Law Enforcement',
 'Economics and Public Finance',
 'Education',
 'Emergency Management',
 'Energy',
 'Environmental Protection',
 'Finance and Financial Sector',
 'Foreign Trade and International Finance',
 'Government Operations and Politics',
 'Health',
 'Housing and Community Development',
 'Immigration',
 'International Affairs',
 'Labor and Employment',
 'Native Americans',
 'Public Lands and Natural Resources',
 'Science, Technology, Communications',
 'Social Welfare',
 'Taxation',
 'Transportation and Public Works']

### Target variable for nn model: bill_passed

In [19]:
# Target for nn model- bill_passed
# Latest Action coding:
# Get "bill passed into law"
senate_cleaned["bill_passed"] = np.where(senate_cleaned['Latest Action'].str.contains("Became Public Law", case=False), 1, 0)
print(senate_cleaned["bill_passed"].value_counts())
print(f'For the 113th to 117th Senate passed {(round((614/20511)*100,2))}% of laws introduced in the Senate.')

0    20511
1      614
Name: bill_passed, dtype: int64
For the 113th to 117th Senate passed 2.99% of laws introduced in the Senate.


## Save clean house dataset: 

In [20]:
# Save house cleaned dataset:
# senate_cleaned.to_csv('../Resources/senate_cleaned.csv', index=False)