After cleaning the postings, the profiles need to be cleaned up.

In [1]:
# Import dependencies
import pandas as pd
import csv
import re

In [2]:
# Make a list of all the unique bioguide_ids in postings_cleaned.csv.
# make a list of requied_ids
# open a dataframe using postings_cleaned.csv
postings_df = pd.read_csv('../data/processed/postings_cleaned.csv')
# find the unique bioguide_ids
required_ids = postings_df['bioguide_id'].unique().tolist()
# display the number of unique bioguide_ids
print(f'Number of unique bioguide_ids in postings_cleaned.csv: {len(required_ids)}')


Number of unique bioguide_ids in postings_cleaned.csv: 3084


In [3]:
# open members_profiles_raw.csv and read into a DataFrame
profiles_raw_df = pd.read_csv('../data/raw/member_profiles_raw.csv')
# display the first few rows
profiles_raw_df.head()

Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile
0,A000001,Fred,Aandahl,1897-04-09,1966-04-07,A Representative from North Dakota; born in Li...
1,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...
2,A000003,Joel,Abbot,1776-03-17,1826-11-19,a Representative from Georgia; born in Ridgefi...
3,A000004,Amos,Abbott,1786-09-10,1868-11-02,A Representative from Massachusetts; born in A...
4,A000005,Joseph,Abbott,1840-01-15,1908-02-11,A Representative from Texas; born near Decatur...


In [4]:
# Remove bioguide_ids from profiles_df that are not in required_ids list
profiles_df = profiles_raw_df[profiles_raw_df['bioguide_id'].isin(required_ids)]
# display the number of rows in profiles_df after filtering
# This should match the number of unique bioguide_ids in postings_cleaned.csv
print(f'Number of rows in profiles_df after filtering: {len(profiles_df)}')

Number of rows in profiles_df after filtering: 3084


In [5]:
# check for the correct yyyy-mm-dd format in birth_date column
date_pattern = r'^\d{4}-\d{2}-\d{2}$' 
# count number of invalid birth_date entries
invalid_birth_dates = profiles_df[~profiles_df['birth_date'].str.match(date_pattern, na=False)]
# cound the number of valid birth_date entries
valid_birth_dates = profiles_df[profiles_df['birth_date'].str.match(date_pattern, na=False)]
print(f'Number of valid birth_date entries: {len(valid_birth_dates)}')
print(f'Number of invalid birth_date entries: {len(invalid_birth_dates)}')

Number of valid birth_date entries: 1559
Number of invalid birth_date entries: 1525


In [6]:
# Check for null values in birth_date column
null_birth_dates = profiles_df['birth_date'].isnull().sum()
print(f'Number of null birth_date entries: {null_birth_dates}')

Number of null birth_date entries: 0


In [7]:
# Check for year only values in birth_date column
# hopefully this will equal the number of invalid entries
year_only_pattern = r'^\d{4}$'
year_only_birth_dates = profiles_df[profiles_df['birth_date'].str.match(year_only_pattern, na=False)]
print(f'Number of year-only_birth_date entries: {len(year_only_birth_dates)}')


Number of year-only_birth_date entries: 1525


Most (if not all) the year only birthdates are in the profile column

Let's use regex to extract as may as possible

In [8]:
# Extract that date and confirm the year matches the existing year-only date.
# Create a new column 'extracted_birth_date' to hold the extracted dates.
profiles_df.loc[:, 'extracted_birth_date'] = None



profiles_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  profiles_df.loc[:, 'extracted_birth_date'] = None


Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile,extracted_birth_date
1,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...,
8,A000009,James,Abdnor,1923,2012,A Representative and a Senator from South Dako...,
10,A000011,Homer,Abele,1916,2000,"A Representative from Ohio; born in Wellston, ...",
13,A000014,Neil,Abercrombie,1938-06-26,,"a Representative from Hawaii; born in Buffalo,...",
15,A000016,Thomas,Abernethy,1903-05-16,1998-06-11,A Representative from Mississippi; born in Eup...,


In [9]:
# Define a function to extract the birth date from profile text.
year_only_pattern = r'^\d{4}$'
profile_date_pattern = r'(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, (\d{4})'

# Make a list of bioguide_ids with year-only birth dates for reference
year_only_ids = year_only_birth_dates['bioguide_id'].tolist()

# count the length of year_only_ids
print(f'Number of bioguide_ids with year-only birth dates: {len(year_only_ids)}')

Number of bioguide_ids with year-only birth dates: 1525


In [10]:
# Define a function to extract the birth date from profile text.
for index, row in profiles_df.iterrows():
    bioguide_id = row['bioguide_id']
    profile_text = row['profile']
    birth_date = row['birth_date']
    
    # Only process if the bioguide_id is in year_only_ids
    if bioguide_id in year_only_ids:
        # Search for the full date pattern in the profile text
        match = re.search(profile_date_pattern, profile_text)
        if match:
            extracted_date = match.group(0)  # Get the entire matched date
            profiles_df.at[index, 'extracted_birth_date'] = extracted_date

profiles_df.head()   


Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile,extracted_birth_date
1,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...,
8,A000009,James,Abdnor,1923,2012,A Representative and a Senator from South Dako...,"February 13, 1923"
10,A000011,Homer,Abele,1916,2000,"A Representative from Ohio; born in Wellston, ...","November 21, 1916"
13,A000014,Neil,Abercrombie,1938-06-26,,"a Representative from Hawaii; born in Buffalo,...",
15,A000016,Thomas,Abernethy,1903-05-16,1998-06-11,A Representative from Mississippi; born in Eup...,


In [11]:
# convert the extracted_birth_date to yyyy-mm-dd format
def convert_to_yyyy_mm_dd(date_str):
    try:
        return pd.to_datetime(date_str).strftime('%Y-%m-%d')
    except:
        return None

profiles_df.loc[:, 'extracted_birth_date'] = profiles_df['extracted_birth_date'].apply(convert_to_yyyy_mm_dd)
profiles_df.head()


Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile,extracted_birth_date
1,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...,
8,A000009,James,Abdnor,1923,2012,A Representative and a Senator from South Dako...,1923-02-13
10,A000011,Homer,Abele,1916,2000,"A Representative from Ohio; born in Wellston, ...",1916-11-21
13,A000014,Neil,Abercrombie,1938-06-26,,"a Representative from Hawaii; born in Buffalo,...",
15,A000016,Thomas,Abernethy,1903-05-16,1998-06-11,A Representative from Mississippi; born in Eup...,


In [12]:
# Find the year_only_ids that do not have an extracted_birth_date
missing_extracted_dates = profiles_df[
    (profiles_df['bioguide_id'].isin(year_only_ids)) & 
    (profiles_df['extracted_birth_date'].isnull())
]
print(f'Number of year-only bioguide_ids without extracted birth dates: {len(missing_extracted_dates)}')

Number of year-only bioguide_ids without extracted birth dates: 0


In [13]:
# Combine the yyyy-mm-dd extracted dates with the original birth_date column into a new column 'cleaned_birth_date'.
# Use extracted_birth_date if available, otherwise use birth_date
profiles_df['cleaned_birth_date'] = profiles_df['extracted_birth_date'].fillna(profiles_df['birth_date'])
profiles_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  profiles_df['cleaned_birth_date'] = profiles_df['extracted_birth_date'].fillna(profiles_df['birth_date'])


Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile,extracted_birth_date,cleaned_birth_date
1,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...,,1908-05-21
8,A000009,James,Abdnor,1923,2012,A Representative and a Senator from South Dako...,1923-02-13,1923-02-13
10,A000011,Homer,Abele,1916,2000,"A Representative from Ohio; born in Wellston, ...",1916-11-21,1916-11-21
13,A000014,Neil,Abercrombie,1938-06-26,,"a Representative from Hawaii; born in Buffalo,...",,1938-06-26
15,A000016,Thomas,Abernethy,1903-05-16,1998-06-11,A Representative from Mississippi; born in Eup...,,1903-05-16


In [14]:
# Check for any missing values in cleaned_birth_date
missing_cleaned_dates = profiles_df['cleaned_birth_date'].isnull().sum()
print(f'Number of missing values in cleaned_birth_date: {missing_cleaned_dates}')

Number of missing values in cleaned_birth_date: 0


In [15]:
# # Save as CSV
# # used to review in Excel

# profiles_df.to_csv('../data/processed/member_profiles_cleaning.csv', index=False)

In [16]:
# Print the columns of profiles_df
print(profiles_df.columns)

Index(['bioguide_id', 'first_name', 'last_name', 'birth_date', 'death_date',
       'profile', 'extracted_birth_date', 'cleaned_birth_date'],
      dtype='object')


In [17]:
# Verify the cleaned_birth_date year matches the year only birth_date
mismatched_years = 0
for index, row in profiles_df.iterrows():
    birth_date = row['birth_date']
    cleaned_birth_date = row['cleaned_birth_date']
    
    # Only check for year-only birth dates
    if re.match(year_only_pattern, str(birth_date)):
        birth_year = str(birth_date)
        cleaned_year = str(cleaned_birth_date)[:4]  # Extract the year from cleaned_birth_date
        
        if birth_year != cleaned_year:
            mismatched_years += 1
            print(f'Mismatch for bioguide_id {row["bioguide_id"]}: birth_date year {birth_year} vs cleaned_birth_date year {cleaned_year}')

print(f'Number of mismatched years: {mismatched_years}')

Mismatch for bioguide_id B001104: birth_date year 1897 vs cleaned_birth_date year 1963
Mismatch for bioguide_id C001073: birth_date year 1964 vs cleaned_birth_date year 2008
Mismatch for bioguide_id M001189: birth_date year 1969 vs cleaned_birth_date year 2013
Number of mismatched years: 3


In [18]:
# Manually update any mismatched years before finalizing the cleaned_birth_date column.
# M001189 cleaned_birth_date == 1969-02-27 (from profile text)
# C001073 cleaned_birth_date == 1964-01-17 (from wikipedia)
# B001104 cleaned_birth_date == 1897-01-19 (from profile text)

profiles_df.loc[profiles_df['bioguide_id'] == 'M001189', 'cleaned_birth_date'] = '1969-02-27'
profiles_df.loc[profiles_df['bioguide_id'] == 'C001073', 'cleaned_birth_date'] = '1964-01-17'
profiles_df.loc[profiles_df['bioguide_id'] == 'B001104', 'cleaned_birth_date'] = '1897-01-19'




In [19]:
# Verify the cleaned_birth_date year matches the year only birth_date
mismatched_years = 0
for index, row in profiles_df.iterrows():
    birth_date = row['birth_date']
    cleaned_birth_date = row['cleaned_birth_date']
    
    # Only check for year-only birth dates
    if re.match(year_only_pattern, str(birth_date)):
        birth_year = str(birth_date)
        cleaned_year = str(cleaned_birth_date)[:4]  # Extract the year from cleaned_birth_date
        
        if birth_year != cleaned_year:
            mismatched_years += 1
            print(f'Mismatch for bioguide_id {row["bioguide_id"]}: birth_date year {birth_year} vs cleaned_birth_date year {cleaned_year}')

print(f'Number of mismatched years: {mismatched_years}')

Number of mismatched years: 0


In [20]:
# Remove unnecessary columns
profiles_df = profiles_df.drop(columns=['birth_date',
                                        'death_date',
                                        'profile',
                                        'extracted_birth_date'])
profiles_df.head()

Unnamed: 0,bioguide_id,first_name,last_name,cleaned_birth_date
1,A000002,Watkins,Abbitt,1908-05-21
8,A000009,James,Abdnor,1923-02-13
10,A000011,Homer,Abele,1916-11-21
13,A000014,Neil,Abercrombie,1938-06-26
15,A000016,Thomas,Abernethy,1903-05-16


In [21]:
# rename cleaned_birth_date to birth_date
profiles_df = profiles_df.rename(columns={'cleaned_birth_date': 'birth_date'})
profiles_df.head()

Unnamed: 0,bioguide_id,first_name,last_name,birth_date
1,A000002,Watkins,Abbitt,1908-05-21
8,A000009,James,Abdnor,1923-02-13
10,A000011,Homer,Abele,1916-11-21
13,A000014,Neil,Abercrombie,1938-06-26
15,A000016,Thomas,Abernethy,1903-05-16


In [22]:
# Save to final cleaned CSV
profiles_df.to_csv('../data/processed/profiles_cleaned.csv', index=False)

When the csv is converted to excel for Tableau, the pre-1900 dates are causing an issue. Extra columns will be removed here and the birth_date column will be changed to a string for tableau.

In [23]:
# Convert the birthdate column to a string type if not already
profiles_df['birth_date'] = profiles_df['birth_date'].astype(str)
profiles_df.dtypes

bioguide_id    object
first_name     object
last_name      object
birth_date     object
dtype: object

In [None]:
# Save as a .xlsx file
# Label the sheet 'profiles'
import openpyxl

profiles_df.to_excel('../data/tableau/profiles_cleaned.xlsx', index=False, sheet_name='profiles')