In [1]:
# Import dependencies
import pandas as pd
import csv
import re

In [22]:
# Make a list of the bioguide_ids in the postings_cleaned.csv file

# open a dataframe using postings_cleaned.csv
postings_df = pd.read_csv('data\\processed\\postings_cleaned.csv')
# find the unique bioguide_ids
required_ids = postings_df['bioguide_id'].unique().tolist()
# display the number of unique bioguide_ids
print(f'Number of unique bioguide_ids in postings_cleaned.csv: {len(required_ids)}')


Number of unique bioguide_ids in postings_cleaned.csv: 3084


In [3]:
# open profiles.csv and read into a DataFrame
profiles_df = pd.read_csv('data\\profiles.csv')
# display the first few rows
profiles_df.head()

Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile
0,A000001,Fred,Aandahl,1897-04-09,1966-04-07,A Representative from North Dakota; born in Li...
1,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...
2,A000003,Joel,Abbot,1776-03-17,1826-11-19,a Representative from Georgia; born in Ridgefi...
3,A000004,Amos,Abbott,1786-09-10,1868-11-02,A Representative from Massachusetts; born in A...
4,A000005,Joseph,Abbott,1840-01-15,1908-02-11,A Representative from Texas; born near Decatur...


In [4]:
# Create a required_profiles_df that only contains rows from profiles_df where the bioguide_id is in required_ids
required_profiles_df = profiles_df[profiles_df['bioguide_id'].isin(required_ids)]
# display the number of rows in required_profiles_df
print(f'Number of rows in required_profiles_df: {len(required_profiles_df)}')
# save required_profiles_df to a new CSV file
required_profiles_df.to_csv('data\\required_profiles_raw.csv', index=False)


Number of rows in required_profiles_df: 3084


In [5]:
# check for the correct yyyy-mm-dd format in birth_date column
date_pattern = r'^\d{4}-\d{2}-\d{2}$' 
# count number of invalid birth_date entries
invalid_birth_dates = required_profiles_df[~required_profiles_df['birth_date'].str.match(date_pattern, na=False)]
# cound the number of valid birth_date entries
valid_birth_dates = required_profiles_df[required_profiles_df['birth_date'].str.match(date_pattern, na=False)]
print(f'Number of valid birth_date entries: {len(valid_birth_dates)}')
print(f'Number of invalid birth_date entries: {len(invalid_birth_dates)}')

Number of valid birth_date entries: 1559
Number of invalid birth_date entries: 1525


In [6]:
# Check for null values in birth_date column
null_birth_dates = required_profiles_df['birth_date'].isnull().sum()
print(f'Number of null birth_date entries: {null_birth_dates}')

Number of null birth_date entries: 0


In [7]:
# Check for year only values in birth_date column
# hopefully this will equal the number of invalid entries
year_only_pattern = r'^\d{4}$'
year_only_birth_dates = required_profiles_df[required_profiles_df['birth_date'].str.match(year_only_pattern, na=False)]
print(f'Number of year-only birth_date entries: {len(year_only_birth_dates)}')


Number of year-only birth_date entries: 1525


In [8]:
# The missing birth dates can be found in the profile column as text.
# Extract that date and confirm the year matches the existing year-only date.
# Create a new column 'extracted_birth_date' to hold the extracted dates.
# Extract the first date that matches the pattern 'month(as text) date, year' from the profile text.
def extract_birth_date(profile_text):
    if pd.isnull(profile_text):
        return None
    # regex pattern to match 'Month Day, Year'
    pattern = r'([A-Za-z]+ \d{1,2}, \d{4})'
    match = re.search(pattern, profile_text)
    if match:
        return match.group(1)
    return None

# Apply this function to the rows with year-only birth dates
required_profiles_df['extracted_birth_date'] = required_profiles_df.apply(
    lambda row: extract_birth_date(row['profile']) if row['bioguide_id'] in year_only_birth_dates['bioguide_id'].values else None,
    axis=1
)
# display the extracted birth dates
required_profiles_df[['bioguide_id', 'birth_date', 'extracted_birth_date']].head(10)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_profiles_df['extracted_birth_date'] = required_profiles_df.apply(


Unnamed: 0,bioguide_id,birth_date,extracted_birth_date
1,A000002,1908-05-21,
8,A000009,1923,"February 13, 1923"
10,A000011,1916,"November 21, 1916"
13,A000014,1938-06-26,
15,A000016,1903-05-16,
16,A000017,1931-02-24,
17,A000018,1920,"July 24, 1920"
21,A000022,1942-11-19,
23,A000024,1907-12-14,
30,A000031,1927,"January 13, 1927"


In [9]:
# Verify the year from the extracted date matches the year-only birth_dates
def get_year_from_date(date_str):
    if pd.isnull(date_str):
        return None
    try:
        return int(date_str.split()[-1])
    except:
        return None

# Create a copy of the dataframe to avoid SettingWithCopyWarning
required_profiles_df = required_profiles_df.copy()

# Create a new column 'extracted_year' to hold the year from the extracted date
required_profiles_df.loc[:, 'extracted_year'] = required_profiles_df['extracted_birth_date'].apply(get_year_from_date)

# Create a new column 'birth_year' to hold the year from the original birth_date
required_profiles_df.loc[:, 'birth_year'] = required_profiles_df['birth_date'].apply(lambda x: int(x) if pd.notnull(x) and len(str(x)) == 4 else None)

# Find rows where the years do not match
mismatched_years = required_profiles_df.loc[
    (required_profiles_df['birth_year'].notnull()) & 
    (required_profiles_df['extracted_year'].notnull()) & 
    (required_profiles_df['birth_year'] != required_profiles_df['extracted_year'])
]
print(f'Number of mismatched years: {len(mismatched_years)}')
# display the mismatched rows
mismatched_years[['bioguide_id', 'birth_date', 'extracted_birth_date', 'birth_year', 'extracted_year']]

Number of mismatched years: 2


Unnamed: 0,bioguide_id,birth_date,extracted_birth_date,birth_year,extracted_year
2769,C001073,1964,"May 3, 2008",1964.0,2008.0
8527,M001189,1969,"January 3, 2013",1969.0,2013.0


In [10]:
# print the profiles for the mismatched years
for index, row in mismatched_years.iterrows():
    print(f"Bioguide ID: {row['bioguide_id']}")
    print(f"Profile Text: {row['profile']}")
    print(f"Original Birth Date: {row['birth_date']}")
    print(f"Extracted Birth Date: {row['extracted_birth_date']}")
    print()
    


Bioguide ID: C001073
Profile Text: A Representative from Louisiana; born in New Roads, Pointe Coupee Parish, La; graduated from Catholic of Pointe Coupee High School, 1982; B.S., Louisiana State University, 1985 ; M.A., Louisiana State University, Baton Rouge, La., 1993; J.D., Georgetown University Law School, Washington, D.C., 1991; lawyer, private practice, 1995-present; assistant district attorney, 1996-2000; Louisiana state house of representatives, 2000-2008; elected as a Democrat to the One Hundred Tenth Congress, by special election, to fill the vacancy caused by the resignation of Representative Richard Baker (May 3, 2008-January 3, 2009); unsuccessful candidate for reelection to the One Hundred Eleventh Congress in 2008.
Original Birth Date: 1964
Extracted Birth Date: May 3, 2008

Bioguide ID: M001189
Profile Text: A Representative from Indiana; born in Evansville, Vanderburgh County, Ind., February, 27, 1969; graduated from Greensburg Community High School, Greensburg, Ind., 

In [11]:
# manually update these two mismatched years
# C001073 birth_date = January 17, 1964 (Source: Wikipedia)
# M001189 birth_date = Februrary 27, 1969 (format mismatch in profile)
# Add this data to the extracted_birth_date column
required_profiles_df.loc[required_profiles_df['bioguide_id'] == 'C001073', 'extracted_birth_date'] = 'January 17, 1964'
required_profiles_df.loc[required_profiles_df['bioguide_id'] == 'M001189', 'extracted_birth_date'] = 'February 27, 1969'
# Remove the birth_year and extracted_year columns
required_profiles_df = required_profiles_df.drop(columns=['birth_year', 'extracted_year'])
required_profiles_df.head()

Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile,extracted_birth_date
1,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...,
8,A000009,James,Abdnor,1923,2012,A Representative and a Senator from South Dako...,"February 13, 1923"
10,A000011,Homer,Abele,1916,2000,"A Representative from Ohio; born in Wellston, ...","November 21, 1916"
13,A000014,Neil,Abercrombie,1938-06-26,,"a Representative from Hawaii; born in Buffalo,...",
15,A000016,Thomas,Abernethy,1903-05-16,1998-06-11,A Representative from Mississippi; born in Eup...,


In [12]:
# Save as a pdf
required_profiles_df.to_csv('data\\profiles_required_cleaning.csv', index=False)

In [13]:
# opend required_profiles_df from the saved CSV
required_profiles_df = pd.read_csv('data\\profiles_required_cleaning.csv')

# Convert the extracted_birth_date to yyyy-mm-dd format
# Add to a new column 'cleaned_birth_date'
from datetime import datetime
def convert_to_yyyy_mm_dd(date_str):
    if pd.isnull(date_str):
        return None
    try:
        dt = datetime.strptime(date_str, '%B %d, %Y')
        return dt.strftime('%Y-%m-%d')
    except:
        return None
required_profiles_df['cleaned_birth_date'] = required_profiles_df['extracted_birth_date'].apply(convert_to_yyyy_mm_dd)
required_profiles_df.head()

Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile,extracted_birth_date,cleaned_birth_date
0,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...,,
1,A000009,James,Abdnor,1923,2012,A Representative and a Senator from South Dako...,"February 13, 1923",1923-02-13
2,A000011,Homer,Abele,1916,2000,"A Representative from Ohio; born in Wellston, ...","November 21, 1916",1916-11-21
3,A000014,Neil,Abercrombie,1938-06-26,,"a Representative from Hawaii; born in Buffalo,...",,
4,A000016,Thomas,Abernethy,1903-05-16,1998-06-11,A Representative from Mississippi; born in Eup...,,


In [14]:
# Add the birth_date to the cleaned_birth_date column where the cleaned_birth_date is null
required_profiles_df['cleaned_birth_date'] = required_profiles_df.apply(
    lambda row: row['birth_date'] if pd.isnull(row['cleaned_birth_date']) else row['cleaned_birth_date'],
    axis=1
)
required_profiles_df.head()

Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile,extracted_birth_date,cleaned_birth_date
0,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...,,1908-05-21
1,A000009,James,Abdnor,1923,2012,A Representative and a Senator from South Dako...,"February 13, 1923",1923-02-13
2,A000011,Homer,Abele,1916,2000,"A Representative from Ohio; born in Wellston, ...","November 21, 1916",1916-11-21
3,A000014,Neil,Abercrombie,1938-06-26,,"a Representative from Hawaii; born in Buffalo,...",,1938-06-26
4,A000016,Thomas,Abernethy,1903-05-16,1998-06-11,A Representative from Mississippi; born in Eup...,,1903-05-16


In [15]:
# verify all birth_date entries are now in yyyy-mm-dd format
date_pattern = r'^\d{4}-\d{2}-\d{2}$' 
# count number of invalid birth_date entries
invalid_dates_count = required_profiles_df[~required_profiles_df['cleaned_birth_date'].str.match(date_pattern, na=False)].shape[0]
invalid_dates_count
# print the number of invalid entries
print(f'Number of invalid birth_date entries after cleaning: {invalid_dates_count}')

Number of invalid birth_date entries after cleaning: 1


In [16]:
# find the invalid entries, if any
invalid_entries = required_profiles_df[~required_profiles_df['cleaned_birth_date'].str.match(date_pattern, na=False)]
invalid_entries

Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile,extracted_birth_date,cleaned_birth_date
287,B001104,Everett,Burkhalter,1897,1975,A Representative from California; born in Hebe...,"Januray 19, 1897",1897


In [17]:
# Manually update the cleaned_birth_date for B001104 to 1897-01-19 (Mispelled in profile text)
required_profiles_df.loc[required_profiles_df['bioguide_id'] == 'B001104', 'cleaned_birth_date'] = '1897-01-19'
# Ru# count number of invalid birth_date entries
invalid_dates_count = required_profiles_df[~required_profiles_df['cleaned_birth_date'].str.match(date_pattern, na=False)].shape[0]
invalid_dates_count
# print the number of invalid entries
print(f'Number of invalid birth_date entries after cleaning: {invalid_dates_count}')

Number of invalid birth_date entries after cleaning: 0


In [18]:
# drop birth_date and extracted_birth_date columns
required_profiles_df = required_profiles_df.drop(columns=['birth_date', 'extracted_birth_date'])
required_profiles_df.head()

Unnamed: 0,bioguide_id,first_name,last_name,death_date,profile,cleaned_birth_date
0,A000002,Watkins,Abbitt,1998-07-13,A Representative from Virginia; born in Lynchb...,1908-05-21
1,A000009,James,Abdnor,2012,A Representative and a Senator from South Dako...,1923-02-13
2,A000011,Homer,Abele,2000,"A Representative from Ohio; born in Wellston, ...",1916-11-21
3,A000014,Neil,Abercrombie,,"a Representative from Hawaii; born in Buffalo,...",1938-06-26
4,A000016,Thomas,Abernethy,1998-06-11,A Representative from Mississippi; born in Eup...,1903-05-16


In [19]:
# rename cleaned_birth_date to birth_date
required_profiles_df = required_profiles_df.rename(columns={'cleaned_birth_date': 'birth_date'})
required_profiles_df.head()

Unnamed: 0,bioguide_id,first_name,last_name,death_date,profile,birth_date
0,A000002,Watkins,Abbitt,1998-07-13,A Representative from Virginia; born in Lynchb...,1908-05-21
1,A000009,James,Abdnor,2012,A Representative and a Senator from South Dako...,1923-02-13
2,A000011,Homer,Abele,2000,"A Representative from Ohio; born in Wellston, ...",1916-11-21
3,A000014,Neil,Abercrombie,,"a Representative from Hawaii; born in Buffalo,...",1938-06-26
4,A000016,Thomas,Abernethy,1998-06-11,A Representative from Mississippi; born in Eup...,1903-05-16


In [20]:
# Save as a pdf
required_profiles_df.to_csv('data\\processed\\profiles_required_clean.csv', index=False)