The postings table has data that requires flattening. Some members switched parties while in office, and each such change needs to be treated as a separate posting covering the duration served with each party.

In [None]:
# Import Dependencies
import pandas as pd
from datetime import datetime
import csv
import re

In [32]:
# Load the postings CSV file into a DataFrame.
# dtype=str reads every column as text, so the date columns arrive as strings here.
# The path uses a forward slash so it resolves on Windows, macOS and Linux alike;
# a backslash is only treated as a path separator on Windows.
postings_df = pd.read_csv('data/postings.csv', dtype=str)
postings_df.head()

Unnamed: 0,bioguide_id,chamber,job_type,congress_number,congress_start_date,congress_end_date,region_type,region_code,party_name,job_start_date,job_end_date,party_start_date,party_end_date
0,A000002,Representative,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,VA,Democrat,,,,
1,A000016,Representative,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,MS,Democrat,,,,
2,A000024,Representative,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,IN,Republican,,,,
3,A000054,Representative,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,NJ,Democrat,,,,
4,A000062,Senator,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,VT,Republican,,,,


In [33]:
# Display the (rows, columns) size of postings_df at this point in the notebook
postings_df.shape

(36714, 13)

In [35]:
# De-duplicate postings_df.
# Flag every row that repeats an earlier row, report how many were flagged,
# then keep only the rows that were not flagged (the first occurrence of each).
repeat_mask = postings_df.duplicated()
duplicates_before = postings_df[repeat_mask]
print(f"Number of duplicate rows before cleaning: {len(duplicates_before)}")
postings_df = postings_df[~repeat_mask]
print(f"Number of rows after removing duplicates: {len(postings_df)}")
postings_df.shape


Number of duplicate rows before cleaning: 18357
Number of rows after removing duplicates: 18357


(18357, 13)

Eighteen members switched parties while in office. New extraction logic was needed to expand each of their postings into separate postings, each with a single party.

In [36]:
# Collect the bioguide_ids whose party_name contains a comma, i.e. lists more than one party.
# This is the check used to confirm that the new extraction logic is working as intended.
# Missing party names (NaN) count as non-matches.
lists_several_parties = postings_df['party_name'].str.contains(',', na=False)
multi_party_bioguide_ids = (
    postings_df.loc[lists_several_parties, 'bioguide_id']
    .unique()
    .tolist()
)
# Show the ids, followed by how many there are
print(multi_party_bioguide_ids)
print(f"Number of bioguide_ids with multiple parties: {len(multi_party_bioguide_ids)}")

[]
Number of bioguide_ids with multiple parties: 0


A data error was found during exploration: for some members, the start date of the first congress they served in was recorded as the start date of their final term.

In [37]:
# Parse every date column into datetime64; values that cannot be parsed become NaT
date_columns = [
    'job_start_date', 'job_end_date',
    'congress_start_date', 'congress_end_date',
    'party_start_date', 'party_end_date',
]
# Parse first, then write the parsed columns back one at a time
parsed_dates = {
    name: pd.to_datetime(postings_df[name], errors='coerce')
    for name in date_columns
}
for name, parsed in parsed_dates.items():
    postings_df[name] = parsed
# Display the dtypes to verify the conversion
postings_df.dtypes

bioguide_id                    object
chamber                        object
job_type                       object
congress_number                object
congress_start_date    datetime64[ns]
congress_end_date      datetime64[ns]
region_type                    object
region_code                    object
party_name                     object
job_start_date         datetime64[ns]
job_end_date           datetime64[ns]
party_start_date       datetime64[ns]
party_end_date         datetime64[ns]
dtype: object

In [38]:
# List the members that have at least one row whose job_start_date precedes congress_start_date
job_starts = pd.to_datetime(postings_df['job_start_date'], errors='coerce')
congress_starts = pd.to_datetime(postings_df['congress_start_date'], errors='coerce')
starts_too_early = job_starts < congress_starts
date_error_bioguide_ids_after = postings_df[starts_too_early]['bioguide_id'].unique().tolist()
print(date_error_bioguide_ids_after)
# Report how many bioguide_ids were flagged
print(f"Number of bioguide_ids with job_start_date earlier than congress_start_date after cleaning: {len(date_error_bioguide_ids_after)}")


['B000223', 'P000152', 'B001251', 'B001270', 'D000482', 'G000582', 'M001165']
Number of bioguide_ids with job_start_date earlier than congress_start_date after cleaning: 7


In [39]:
# Pull every posting that belongs to a flagged member and preview up to 20 of them
flagged_member_rows = postings_df['bioguide_id'].isin(date_error_bioguide_ids_after)
date_error_df_after = postings_df.loc[flagged_member_rows]
date_error_df_after.head(20)

Unnamed: 0,bioguide_id,chamber,job_type,congress_number,congress_start_date,congress_end_date,region_type,region_code,party_name,job_start_date,job_end_date,party_start_date,party_end_date
34,B000223,Representative,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,TN,Democrat,NaT,NaT,NaT,NaT
1153,B000223,Representative,CongressMemberJob,87,1961-01-03,1963-01-03,StateRegion,TN,Democrat,NaT,NaT,NaT,NaT
2283,B000223,Senator,CongressMemberJob,88,1963-01-03,1965-01-03,StateRegion,TN,Democrat,1964-11-04,NaT,NaT,NaT
2284,B000223,Representative,CongressMemberJob,88,1963-01-03,1965-01-03,StateRegion,TN,,1953-01-03,1964-11-03,NaT,NaT
3400,B000223,Senator,CongressMemberJob,89,1965-01-03,1967-01-03,StateRegion,TN,Democrat,NaT,1967-01-03,NaT,NaT
15972,P000152,Representative,CongressMemberJob,100,1987-01-03,1989-01-03,StateRegion,VA,Democrat,NaT,NaT,NaT,NaT
17080,P000152,Representative,CongressMemberJob,101,1989-01-03,1991-01-03,StateRegion,VA,Democrat,NaT,NaT,NaT,NaT
18201,P000152,Representative,CongressMemberJob,102,1991-01-03,1993-01-03,StateRegion,VA,Democrat,NaT,NaT,NaT,NaT
19323,P000152,Representative,CongressMemberJob,103,1993-01-03,1995-01-03,StateRegion,VA,Democrat,NaT,NaT,NaT,NaT
20176,D000482,Representative,CongressMemberJob,104,1995-01-03,1997-01-03,StateRegion,PA,Democrat,NaT,NaT,NaT,NaT


In [19]:
# If job_start_date or job_end_date falls on January 3rd, delete that datapoint.
#
# The date columns are converted to datetime64 earlier in the notebook, and the string
# accessor (.str.endswith) raises AttributeError on datetime columns. The masks are
# therefore built from the month and day of the parsed dates. pd.to_datetime leaves an
# already-parsed column unchanged and also accepts 'YYYY-MM-DD' strings, so the cell works
# in either state. Missing dates never match.
def _is_january_third(dates):
    """Return a boolean Series that is True where `dates` falls on January 3rd."""
    parsed = pd.to_datetime(dates, errors='coerce')
    return (parsed.dt.month == 1) & (parsed.dt.day == 3)


# Count the number of values that are January 3rd before deleting
jan_3_start_mask = _is_january_third(postings_df['job_start_date'])
jan_3_end_mask = _is_january_third(postings_df['job_end_date'])
jan_3_start_count = int(jan_3_start_mask.sum())
jan_3_end_count = int(jan_3_end_mask.sum())
print(f"Number of job_start_date values on January 3rd: {jan_3_start_count}")
print(f"Number of job_end_date values on January 3rd: {jan_3_end_count}")
# Clear those values (None is stored as NaT in a datetime column)
postings_df.loc[jan_3_start_mask, 'job_start_date'] = None
postings_df.loc[jan_3_end_mask, 'job_end_date'] = None
# Count the number of values that are January 3rd after deleting; both should be 0
jan_3_start_count_after = int(_is_january_third(postings_df['job_start_date']).sum())
jan_3_end_count_after = int(_is_january_third(postings_df['job_end_date']).sum())
print(f"Number of job_start_date values on January 3rd after deletion: {jan_3_start_count_after}")
print(f"Number of job_end_date values on January 3rd after deletion: {jan_3_end_count_after}")



Number of job_start_date values on January 3rd: 1010
Number of job_end_date values on January 3rd: 574
Number of job_start_date values on January 3rd after deletion: 0
Number of job_end_date values on January 3rd after deletion: 0


In [24]:
# Remove duplicate rows again to ensure none were reintroduced, then save.
# The duplicates are counted first so the cell still reports what it found, and they are
# dropped BEFORE writing so the working file cannot contain repeated rows.
duplicates_after = postings_df[postings_df.duplicated()]
print(f"Number of duplicate rows after cleaning: {len(duplicates_after)}")
postings_df = postings_df.drop_duplicates()
# Save the cleaned DataFrame to a CSV file. The row index is not written, and
# QUOTE_NONNUMERIC quotes every non-numeric field. The forward slash keeps the
# path valid on Windows, macOS and Linux.
postings_df.to_csv('data/postings_working.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)


Number of duplicate rows after cleaning: 18357


In [29]:
# Tally the duplicate rows, drop them, then tally again to confirm none remain
still_duplicated = postings_df.duplicated()
duplicates_after = postings_df[still_duplicated]
print(f"Number of duplicate rows before dropping: {len(duplicates_after)}")
# Keep only the rows that were not flagged as repeats of an earlier row
postings_df = postings_df[~still_duplicated]
# Re-evaluate on the reduced frame
duplicates_after = postings_df[postings_df.duplicated()]
print(f"Number of duplicate rows after dropping: {len(duplicates_after)}")
postings_df.shape



Number of duplicate rows before dropping: 0
Number of duplicate rows after dropping: 0


(18357, 13)

In [30]:
# Re-run the check for rows whose job_start_date is earlier than congress_start_date
job_starts = pd.to_datetime(postings_df['job_start_date'], errors='coerce')
congress_starts = pd.to_datetime(postings_df['congress_start_date'], errors='coerce')
early_rows = postings_df[job_starts.lt(congress_starts)]
date_error_bioguide_ids_after = early_rows['bioguide_id'].unique().tolist()
print(date_error_bioguide_ids_after)
# Report how many bioguide_ids are still flagged
print(f"Number of bioguide_ids with job_start_date earlier than congress_start_date after cleaning: {len(date_error_bioguide_ids_after)}")


['P000152', 'B001251']
Number of bioguide_ids with job_start_date earlier than congress_start_date after cleaning: 2


In [31]:
# Select every row whose bioguide_id is in date_error_bioguide_ids_after and preview up to 20
in_flagged_list = postings_df['bioguide_id'].isin(date_error_bioguide_ids_after)
date_error_df_after = postings_df[in_flagged_list]
date_error_df_after.head(20)


Unnamed: 0,bioguide_id,chamber,job_type,congress_number,congress_start_date,congress_end_date,region_type,region_code,party_name,job_start_date,job_end_date,party_start_date,party_end_date
15972,P000152,Representative,CongressMemberJob,100,1987-01-03,1989-01-03,StateRegion,VA,Democrat,,,,
17080,P000152,Representative,CongressMemberJob,101,1989-01-03,1991-01-03,StateRegion,VA,Democrat,,,,
18201,P000152,Representative,CongressMemberJob,102,1991-01-03,1993-01-03,StateRegion,VA,Democrat,,,,
19323,P000152,Representative,CongressMemberJob,103,1993-01-03,1995-01-03,StateRegion,VA,Democrat,,,,
20437,P000152,Representative,CongressMemberJob,104,1995-01-03,1997-01-03,StateRegion,VA,Democrat,1988-06-14,,,
24525,B001251,Representative,CongressMemberJob,108,2003-01-03,2005-01-03,StateRegion,NC,Democrat,,,,
25614,B001251,Representative,CongressMemberJob,109,2005-01-03,2007-01-03,StateRegion,NC,Democrat,,,,
26704,B001251,Representative,CongressMemberJob,110,2007-01-03,2009-01-03,StateRegion,NC,Democrat,,,,
27818,B001251,Representative,CongressMemberJob,111,2009-01-03,2011-01-03,StateRegion,NC,Democrat,,,,
28938,B001251,Representative,CongressMemberJob,112,2011-01-03,2013-01-03,StateRegion,NC,Democrat,,,,


In [None]:
# 