The postings table contains data that requires flattening. Some members switched parties while in office, and each such change needs to be treated as a separate posting covering the duration served with each party.

In [1]:
# Import Dependencies
import pandas as pd
from datetime import datetime
import csv
import re

In [2]:
# Load the raw postings CSV file into a DataFrame.
# dtype=str makes pandas read every column as text instead of inferring types.
# The path uses a forward slash so that it resolves on Windows, macOS and Linux;
# a hard-coded backslash is only treated as a directory separator on Windows.
postings_df = pd.read_csv('data/postings_raw.csv', dtype=str)
postings_df.head()

Unnamed: 0,bioguide_id,chamber,job_type,congress_number,congress_start_date,congress_end_date,region_type,region_code,party_name,job_start_date,job_end_date,party_start_date,party_end_date
0,A000002,Representative,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,VA,Democrat,,,,
1,A000016,Representative,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,MS,Democrat,,,,
2,A000024,Representative,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,IN,Republican,,,,
3,A000054,Representative,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,NJ,Democrat,,,,
4,A000062,Senator,CongressMemberJob,86,1959-01-03,1961-01-03,StateRegion,VT,Republican,,,,


In [3]:
# Report the (rows, columns) size of postings_df at this point
postings_df.shape

(36714, 13)

In [4]:
# List the distinct values present in the job_type column
postings_df['job_type'].unique()


array(['CongressMemberJob', 'CongressLeadershipJob', 'OtherJob'],
      dtype=object)

In [5]:
# Drop every row whose job_type is OtherJob or CongressLeadershipJob,
# then report the remaining size
excluded_job_types = ['OtherJob', 'CongressLeadershipJob']
is_excluded = postings_df['job_type'].isin(excluded_job_types)
postings_df = postings_df[~is_excluded]
postings_df.shape

(36456, 13)

In [6]:
# Collect every row that is an exact repeat of an earlier row and count them
is_repeat = postings_df.duplicated()
duplicate_rows = postings_df[is_repeat]
duplicate_rows.shape


(18228, 13)

In [7]:
# Keep only the first occurrence of each fully identical row
postings_df = postings_df[~postings_df.duplicated()]
postings_df.shape

(18228, 13)

In [8]:
# Drop the job_type column, then confirm the new column count
postings_df = postings_df.drop('job_type', axis='columns')
postings_df.shape

(18228, 12)

18 members switched parties while in office. New extraction logic was needed to expand each of their postings into separate postings, each with a single party.

In [9]:
# Find the bioguide_ids whose party_name still contains a comma, i.e. a list of
# more than one party.
# This confirms that the improved extraction logic is working as intended.
# Missing party names (NaN) are treated as non-matching.
has_party_list = postings_df['party_name'].str.contains(',', na=False)
multi_party_bioguide_ids = postings_df.loc[has_party_list, 'bioguide_id'].unique().tolist()
# Show the ids, followed by how many were found
print(multi_party_bioguide_ids)
print(f"Number of bioguide_ids with multiple parties: {len(multi_party_bioguide_ids)}")

[]
Number of bioguide_ids with multiple parties: 0


A data error was found during exploration: for some members, the start date recorded on the final term served is actually the start date of the first congress they served in.

In [10]:
# Parse the date columns and keep only the calendar date (no time component).
# Values that cannot be parsed are coerced to NaT rather than raising.
date_columns = [
    'job_start_date', 'job_end_date',
    'congress_start_date', 'congress_end_date',
    'party_start_date', 'party_end_date',
]
for date_col in date_columns:
    parsed = pd.to_datetime(postings_df[date_col], errors='coerce')
    postings_df[date_col] = parsed.dt.date
postings_df[date_columns].head()

Unnamed: 0,job_start_date,job_end_date,congress_start_date,congress_end_date,party_start_date,party_end_date
0,NaT,NaT,1959-01-03,1961-01-03,NaT,NaT
1,NaT,NaT,1959-01-03,1961-01-03,NaT,NaT
2,NaT,NaT,1959-01-03,1961-01-03,NaT,NaT
3,NaT,NaT,1959-01-03,1961-01-03,NaT,NaT
4,NaT,NaT,1959-01-03,1961-01-03,NaT,NaT


In [11]:
# Preview the first rows of postings_df
postings_df.head()

Unnamed: 0,bioguide_id,chamber,congress_number,congress_start_date,congress_end_date,region_type,region_code,party_name,job_start_date,job_end_date,party_start_date,party_end_date
0,A000002,Representative,86,1959-01-03,1961-01-03,StateRegion,VA,Democrat,NaT,NaT,NaT,NaT
1,A000016,Representative,86,1959-01-03,1961-01-03,StateRegion,MS,Democrat,NaT,NaT,NaT,NaT
2,A000024,Representative,86,1959-01-03,1961-01-03,StateRegion,IN,Republican,NaT,NaT,NaT,NaT
3,A000054,Representative,86,1959-01-03,1961-01-03,StateRegion,NJ,Democrat,NaT,NaT,NaT,NaT
4,A000062,Senator,86,1959-01-03,1961-01-03,StateRegion,VT,Republican,NaT,NaT,NaT,NaT


In [12]:
# Collect the bioguide_ids that have at least one posting whose job_start_date
# is earlier than the congress_start_date on the same row
job_starts = pd.to_datetime(postings_df['job_start_date'])
congress_starts = pd.to_datetime(postings_df['congress_start_date'])
starts_too_early = job_starts < congress_starts
invalid_date_bioguide_ids = postings_df.loc[starts_too_early, 'bioguide_id'].unique().tolist()
# Show the ids, followed by how many were found
print(invalid_date_bioguide_ids)
print(f"Number of bioguide_ids with invalid date ranges: {len(invalid_date_bioguide_ids)}")


['B000223', 'P000152', 'B001251', 'B001270', 'D000482', 'G000582', 'M001165']
Number of bioguide_ids with invalid date ranges: 7


In [13]:
# Repair postings whose job_start_date is earlier than the congress_start_date
# on the same row: each offending row receives the congress_start_date of its
# own row, and no other row is modified.
# The earlier per-member loop copied the congress_start_date of the member's
# first row onto every row of that member, which left the offending row
# unrepaired and moved the start of the member's other postings to that date.
# Rows with a missing date compare as False and are therefore left untouched.
job_starts = pd.to_datetime(postings_df['job_start_date'])
congress_starts = pd.to_datetime(postings_df['congress_start_date'])
starts_before_congress = job_starts < congress_starts
postings_df.loc[starts_before_congress, 'job_start_date'] = postings_df.loc[starts_before_congress, 'congress_start_date']


In [14]:
# Move every congress_end_date back by one day to prevent overlap with the job_start_date.
# NOTE: this reads and rewrites the same column, so running the cell a second
# time subtracts a further day.
one_day = pd.Timedelta(days=1)
postings_df['congress_end_date'] = postings_df['congress_end_date'] - one_day
# Verify the change
postings_df['congress_end_date'].head()


0    1961-01-02
1    1961-01-02
2    1961-01-02
3    1961-01-02
4    1961-01-02
Name: congress_end_date, dtype: object

In [15]:
# Preview the first rows of postings_df
postings_df.head()

Unnamed: 0,bioguide_id,chamber,congress_number,congress_start_date,congress_end_date,region_type,region_code,party_name,job_start_date,job_end_date,party_start_date,party_end_date
0,A000002,Representative,86,1959-01-03,1961-01-02,StateRegion,VA,Democrat,NaT,NaT,NaT,NaT
1,A000016,Representative,86,1959-01-03,1961-01-02,StateRegion,MS,Democrat,NaT,NaT,NaT,NaT
2,A000024,Representative,86,1959-01-03,1961-01-02,StateRegion,IN,Republican,NaT,NaT,NaT,NaT
3,A000054,Representative,86,1959-01-03,1961-01-02,StateRegion,NJ,Democrat,NaT,NaT,NaT,NaT
4,A000062,Senator,86,1959-01-03,1961-01-02,StateRegion,VT,Republican,NaT,NaT,NaT,NaT


In [16]:
# Derive a posting_start_date for each row from the most specific start date available.
# Priority: party_start_date, then job_start_date, then congress_start_date.
def determine_posting_start_date(row):
    """Return the start date to use for a single posting row.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) with the keys
            'party_start_date', 'job_start_date' and 'congress_start_date'.

    Returns:
        The first non-null value among party_start_date and job_start_date,
        in that order; otherwise congress_start_date, returned as-is.
    """
    for preferred_column in ('party_start_date', 'job_start_date'):
        if pd.notnull(row[preferred_column]):
            return row[preferred_column]
    return row['congress_start_date']
    
# Evaluate the start-date rule once per row and store the result as a new column
posting_starts = postings_df.apply(determine_posting_start_date, axis=1)
postings_df['posting_start_date'] = posting_starts
postings_df.head(10)

Unnamed: 0,bioguide_id,chamber,congress_number,congress_start_date,congress_end_date,region_type,region_code,party_name,job_start_date,job_end_date,party_start_date,party_end_date,posting_start_date
0,A000002,Representative,86,1959-01-03,1961-01-02,StateRegion,VA,Democrat,NaT,NaT,NaT,NaT,1959-01-03
1,A000016,Representative,86,1959-01-03,1961-01-02,StateRegion,MS,Democrat,NaT,NaT,NaT,NaT,1959-01-03
2,A000024,Representative,86,1959-01-03,1961-01-02,StateRegion,IN,Republican,NaT,NaT,NaT,NaT,1959-01-03
3,A000054,Representative,86,1959-01-03,1961-01-02,StateRegion,NJ,Democrat,NaT,NaT,NaT,NaT,1959-01-03
4,A000062,Senator,86,1959-01-03,1961-01-02,StateRegion,VT,Republican,NaT,NaT,NaT,NaT,1959-01-03
5,A000073,Representative,86,1959-01-03,1961-01-02,StateRegion,OK,Democrat,NaT,NaT,NaT,NaT,1959-01-03
6,A000094,Representative,86,1959-01-03,1961-01-02,StateRegion,NC,Democrat,NaT,NaT,NaT,NaT,1959-01-03
7,A000105,Representative,86,1959-01-03,1961-01-02,StateRegion,AR,Independent Democrat,NaT,NaT,NaT,NaT,1959-01-03
8,A000106,Representative,86,1959-01-03,1961-01-02,StateRegion,TX,Republican,NaT,NaT,NaT,NaT,1959-01-03
9,A000138,Representative,86,1959-01-03,1961-01-02,DistrictRegion,IL,Republican,1959-01-03,NaT,NaT,NaT,1959-01-03


In [17]:
# Derive a posting_end_date for each row from the most specific end date available.
# Priority: party_end_date, then job_end_date, then congress_end_date.
def determine_posting_end_date(row):
    """Return the end date to use for a single posting row.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) with the keys
            'party_end_date', 'job_end_date' and 'congress_end_date'.

    Returns:
        The first non-null value among party_end_date and job_end_date,
        in that order; otherwise congress_end_date, returned as-is.
    """
    for preferred_column in ('party_end_date', 'job_end_date'):
        if pd.notnull(row[preferred_column]):
            return row[preferred_column]
    return row['congress_end_date']
# Evaluate the end-date rule once per row and store the result as a new column
posting_ends = postings_df.apply(determine_posting_end_date, axis=1)
postings_df['posting_end_date'] = posting_ends
postings_df.head(10)

Unnamed: 0,bioguide_id,chamber,congress_number,congress_start_date,congress_end_date,region_type,region_code,party_name,job_start_date,job_end_date,party_start_date,party_end_date,posting_start_date,posting_end_date
0,A000002,Representative,86,1959-01-03,1961-01-02,StateRegion,VA,Democrat,NaT,NaT,NaT,NaT,1959-01-03,1961-01-02
1,A000016,Representative,86,1959-01-03,1961-01-02,StateRegion,MS,Democrat,NaT,NaT,NaT,NaT,1959-01-03,1961-01-02
2,A000024,Representative,86,1959-01-03,1961-01-02,StateRegion,IN,Republican,NaT,NaT,NaT,NaT,1959-01-03,1961-01-02
3,A000054,Representative,86,1959-01-03,1961-01-02,StateRegion,NJ,Democrat,NaT,NaT,NaT,NaT,1959-01-03,1961-01-02
4,A000062,Senator,86,1959-01-03,1961-01-02,StateRegion,VT,Republican,NaT,NaT,NaT,NaT,1959-01-03,1961-01-02
5,A000073,Representative,86,1959-01-03,1961-01-02,StateRegion,OK,Democrat,NaT,NaT,NaT,NaT,1959-01-03,1961-01-02
6,A000094,Representative,86,1959-01-03,1961-01-02,StateRegion,NC,Democrat,NaT,NaT,NaT,NaT,1959-01-03,1961-01-02
7,A000105,Representative,86,1959-01-03,1961-01-02,StateRegion,AR,Independent Democrat,NaT,NaT,NaT,NaT,1959-01-03,1961-01-02
8,A000106,Representative,86,1959-01-03,1961-01-02,StateRegion,TX,Republican,NaT,NaT,NaT,NaT,1959-01-03,1961-01-02
9,A000138,Representative,86,1959-01-03,1961-01-02,DistrictRegion,IL,Republican,1959-01-03,NaT,NaT,NaT,1959-01-03,1961-01-02


In [18]:
# Relabel DistrictRegion entries in region_type as StateRegion,
# then list the values that remain
region_type_relabels = {'DistrictRegion': 'StateRegion'}
postings_df['region_type'] = postings_df['region_type'].replace(region_type_relabels)
postings_df['region_type'].unique()

array(['StateRegion', 'TerritoryRegion'], dtype=object)

In [19]:
# Drop the six job/congress/party date columns, leaving only the derived
# posting_start_date and posting_end_date as date columns
columns_to_remove = [
    'job_start_date', 'job_end_date',
    'congress_start_date', 'congress_end_date',
    'party_start_date', 'party_end_date',
]
postings_df = postings_df.drop(columns_to_remove, axis='columns')
postings_df.head()

Unnamed: 0,bioguide_id,chamber,congress_number,region_type,region_code,party_name,posting_start_date,posting_end_date
0,A000002,Representative,86,StateRegion,VA,Democrat,1959-01-03,1961-01-02
1,A000016,Representative,86,StateRegion,MS,Democrat,1959-01-03,1961-01-02
2,A000024,Representative,86,StateRegion,IN,Republican,1959-01-03,1961-01-02
3,A000054,Representative,86,StateRegion,NJ,Democrat,1959-01-03,1961-01-02
4,A000062,Senator,86,StateRegion,VT,Republican,1959-01-03,1961-01-02


In [20]:
# Save the cleaned postings as a new CSV file, without writing the DataFrame index.
# The path uses a forward slash so that it resolves on Windows, macOS and Linux;
# a hard-coded backslash is only treated as a directory separator on Windows.
postings_df.to_csv('data/postings_cleaned.csv', index=False)