In [1]:
# Import dependencies
import pandas as pd
import csv
import re

In [2]:
# Make a list of the bioguide_ids in the postings_cleaned.csv file

# open a dataframe using postings_cleaned.csv
postings_df = pd.read_csv('data\\postings_cleaned.csv')
# find the unique bioguide_ids
required_ids = postings_df['bioguide_id'].unique().tolist()
# display the number of unique bioguide_ids
print(f'Number of unique bioguide_ids in postings_cleaned.csv: {len(required_ids)}')


Number of unique bioguide_ids in postings_cleaned.csv: 3084


In [3]:
# open profiles.csv and read into a DataFrame
profiles_df = pd.read_csv('data\\profiles.csv')
# display the first few rows
profiles_df.head()

Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile
0,A000001,Fred,Aandahl,1897-04-09,1966-04-07,A Representative from North Dakota; born in Li...
1,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...
2,A000003,Joel,Abbot,1776-03-17,1826-11-19,a Representative from Georgia; born in Ridgefi...
3,A000004,Amos,Abbott,1786-09-10,1868-11-02,A Representative from Massachusetts; born in A...
4,A000005,Joseph,Abbott,1840-01-15,1908-02-11,A Representative from Texas; born near Decatur...


In [4]:
# Create a required_profiles_df that only contains rows from profiles_df where the bioguide_id is in required_ids
required_profiles_df = profiles_df[profiles_df['bioguide_id'].isin(required_ids)]
# display the number of rows in required_profiles_df
print(f'Number of rows in required_profiles_df: {len(required_profiles_df)}')
# save required_profiles_df to a new CSV file
required_profiles_df.to_csv('data\\required_profiles_raw.csv', index=False)


Number of rows in required_profiles_df: 3084


In [6]:
# check for the correct yyyy-mm-dd format in birth_date column
date_pattern = r'^\d{4}-\d{2}-\d{2}$' 
# count number of invalid birth_date entries
invalid_birth_dates = required_profiles_df[~required_profiles_df['birth_date'].str.match(date_pattern, na=False)]
# cound the number of valid birth_date entries
valid_birth_dates = required_profiles_df[required_profiles_df['birth_date'].str.match(date_pattern, na=False)]
print(f'Number of valid birth_date entries: {len(valid_birth_dates)}')
print(f'Number of invalid birth_date entries: {len(invalid_birth_dates)}')

Number of valid birth_date entries: 1559
Number of invalid birth_date entries: 1525


In [7]:
# Check for null values in birth_date column
null_birth_dates = required_profiles_df['birth_date'].isnull().sum()
print(f'Number of null birth_date entries: {null_birth_dates}')

Number of null birth_date entries: 0


In [None]:
# Check for year only values in birth_date column
# hopefully this will equal the number of invalid entries
year_only_pattern = r'^\d{4}$'
year_only_birth_dates = required_profiles_df[required_profiles_df['birth_date'].str.match(year_only_pattern, na=False)]
print(f'Number of year-only birth_date entries: {len(year_only_birth_dates)}')


Number of year-only birth_date entries: 1525


In [10]:
# The missing birth dates can be found in the profile column as text.
# Extract that date and confirm the year matches the existing year-only date.
# Create a new column 'extracted_birth_date' to hold the extracted dates.
# Extract the first date that matches the pattern 'month(as text) date, year' from the profile text.
def extract_birth_date(profile_text):
    if pd.isnull(profile_text):
        return None
    # regex pattern to match 'Month Day, Year'
    pattern = r'([A-Za-z]+ \d{1,2}, \d{4})'
    match = re.search(pattern, profile_text)
    if match:
        return match.group(1)
    return None

# Apply this function to the rows with year-only birth dates
required_profiles_df['extracted_birth_date'] = required_profiles_df.apply(
    lambda row: extract_birth_date(row['profile']) if row['bioguide_id'] in year_only_birth_dates['bioguide_id'].values else None,
    axis=1
)
# display the extracted birth dates
required_profiles_df[['bioguide_id', 'birth_date', 'extracted_birth_date']].head(10)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_profiles_df['extracted_birth_date'] = required_profiles_df.apply(


Unnamed: 0,bioguide_id,birth_date,extracted_birth_date
1,A000002,1908-05-21,
8,A000009,1923,"February 13, 1923"
10,A000011,1916,"November 21, 1916"
13,A000014,1938-06-26,
15,A000016,1903-05-16,
16,A000017,1931-02-24,
17,A000018,1920,"July 24, 1920"
21,A000022,1942-11-19,
23,A000024,1907-12-14,
30,A000031,1927,"January 13, 1927"


In [12]:
# Verify the year from the extracted date matches the year-only birth_dates
def get_year_from_date(date_str):
    if pd.isnull(date_str):
        return None
    try:
        return int(date_str.split()[-1])
    except:
        return None
# Create a new column 'extracted_year' to hold the year from the extracted date
required_profiles_df['extracted_year'] = required_profiles_df['extracted_birth_date'].apply(get_year_from_date)
# Create a new column 'birth_year' to hold the year from the original birth_date
required_profiles_df['birth_year'] = required_profiles_df['birth_date'].apply(lambda x: int(x) if pd.notnull(x) and len(x) == 4 else None)
# Find rows where the years do not match
mismatched_years = required_profiles_df[
    (required_profiles_df['birth_year'].notnull()) & 
    (required_profiles_df['extracted_year'].notnull()) & 
    (required_profiles_df['birth_year'] != required_profiles_df['extracted_year'])
]
print(f'Number of mismatched years: {len(mismatched_years)}')
# display the mismatched rows
mismatched_years[['bioguide_id', 'birth_date', 'extracted_birth_date', 'birth_year', 'extracted_year']]

Number of mismatched years: 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_profiles_df['extracted_year'] = required_profiles_df['extracted_birth_date'].apply(get_year_from_date)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_profiles_df['birth_year'] = required_profiles_df['birth_date'].apply(lambda x: int(x) if pd.notnull(x) and len(x) == 4 else None)


Unnamed: 0,bioguide_id,birth_date,extracted_birth_date,birth_year,extracted_year
2769,C001073,1964,"May 3, 2008",1964.0,2008.0
8527,M001189,1969,"January 3, 2013",1969.0,2013.0


In [None]:
# Read the profile for bioguide_ids with 

In [None]:
# Find the different formats in the birth_date column
unique_formats = profiles_df['birth_date'].dropna().apply(lambda x: re.sub(r'\d', 'D', x)).unique()
print('Unique birth_date formats:')
for fmt in unique_formats:
    print(fmt)
# Based on the unique formats, create a function to standardize the birth_date column to yyyy-mm-dd
def standardize_birth_date(date_str):
    if re.match(r'^\d{4}-\d{2}-\d{2}$', date_str):
        return date_str
    elif re.match(r'^\d{2}/\d{2}/\d{4}$', date_str):
        month, day, year = date_str.split('/')
        return f'{year}-{int(month):02d}-{int(day):02d}'
    elif re.match(r'^\d{2}-\d{2}-\d{4}$', date_str):
        day, month, year = date_str.split('-')
        return f'{year}-{int(month):02d}-{int(day):02d}'
    else:
        return None

In [None]:
# List the bioguide_ids with the 'yyyy' format in birth_date
ids_with_year_only = profiles_df[profiles_df['birth_date'].str.match(r'^\d{4}$', na=False)]['bioguide_id'].tolist()

# count the number of bioguide_ids with the 'yyyy' format
print(f'Number of bioguide_ids with year-only birth_date: {len(ids_with_year_only)}')



In [None]:
# find the first date mentioned in the profile_text for each bioguide_id in ids_with_year_only
for bioguide_id in ids_with_year_only:
    profile_text = profiles_df[profiles_df['bioguide_id'] == bioguide_id]['profile_text'].values[0]
