Step 1: Connect to MongoDB and open the `profiles` collection of the `bioguide` database

In [1]:
# import dependencies
from pymongo import MongoClient
from datetime import datetime
import pandas as pd

# Open a client against the local MongoDB server, then resolve the
# `bioguide` database and its `profiles` collection by name.
client = MongoClient('mongodb://localhost:27017/')
db = client.get_database('bioguide')
collection = db.get_collection('profiles')

List of the features to be extracted from the MongoDB collection:\
Table 1. Bioguide_Table - information about the members of Congress
1. bioguide_id
2. first_name
3. last_name
4. birth_date
5. death_date
6. profile (the `profileText` field)

In [2]:
"""
TABLE 1. Bioguide_Table - information about the members of Congress

1. bioguide_id
2. first_name
3. last_name
4. birth_date
5. death_date
6. profile
"""

# Output column -> source field in the MongoDB profile document.
# Insertion order here is the column order of the resulting DataFrame
# (after bioguide_id).
PROFILE_FIELDS = {
    'first_name': 'givenName',
    'last_name': 'familyName',
    'birth_date': 'birthDate',
    'death_date': 'deathDate',
    'profile': 'profileText',
}

# Make a list of all the bioguide_ids
bioguide_ids = collection.distinct('usCongressBioId')

# Make a table to hold the data
data = []
# append the base information to the data list
for member in bioguide_ids:
    # Fetch the member's document ONCE (the fields were previously looked up
    # with one find_one call each), and only pull the fields we actually use.
    profile_doc = collection.find_one(
        {'usCongressBioId': member},
        projection=list(PROFILE_FIELDS.values()),
    )
    base_info = {'bioguide_id': member}
    for column, field in PROFILE_FIELDS.items():
        # Missing fields become empty strings, as before
        base_info[column] = profile_doc.get(field, '')
    data.append(base_info)

# Convert the data list to a DataFrame
df = pd.DataFrame(data)
# # Convert the birth_date and death_date columns to datetime objects
# df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')
# df['death_date'] = pd.to_datetime(df['death_date'], errors='coerce')
df.head()

Unnamed: 0,bioguide_id,first_name,last_name,birth_date,death_date,profile
0,A000001,Fred,Aandahl,1897-04-09,1966-04-07,A Representative from North Dakota; born in Li...
1,A000002,Watkins,Abbitt,1908-05-21,1998-07-13,A Representative from Virginia; born in Lynchb...
2,A000003,Joel,Abbot,1776-03-17,1826-11-19,a Representative from Georgia; born in Ridgefi...
3,A000004,Amos,Abbott,1786-09-10,1868-11-02,A Representative from Massachusetts; born in A...
4,A000005,Joseph,Abbott,1840-01-15,1908-02-11,A Representative from Texas; born near Decatur...


In [None]:
# Save profiles to CSV.
# Forward slashes are used so the relative path resolves on Windows, macOS and
# Linux alike; a backslash-separated path is only a directory path on Windows.
df.to_csv('data/raw/member_profiles_raw.csv', index=False)

List of the features to be extracted from the MongoDB collection:\
Table 2. Job_Positions - information about all the job positions held by the members of Congress
1. bioguide_id
2. jobPositions.job.name
3. jobPositions.job.jobType
4. jobPositions.congressAffiliation.congress.congressNumber
5. jobPositions.congressAffiliation.congress.startDate
5a. (if applicable) jobPositions.startDate
6. jobPositions.congressAffiliation.congress.endDate
6a. (if applicable) jobPositions.endDate
7. jobPositions.congressAffiliation.represents.regionType
8. jobPositions.congressAffiliation.represents.regionCode
9. jobPositions.congressAffiliation.partyAffiliation.party.name

In [None]:
# NOTE(review): this entire cell is commented out. It looks like an earlier
# version of the "TABLE 2. Postings" cell further down (same pipeline, but
# without the partyAffiliation unwind, and writing data\\postings_v1.csv).
# Confirm it is no longer needed before deleting it.
# """
# TABLE 2. Job_Positions - information about all the job positions held by the members of Congress

# 1. bioguide_id
# 2. jobPositions.job.name
# 3. jobPositions.job.jobType
# 4. jobPositions.congressAffiliation.congress.congressNumber
# 5. jobPositions.congressAffiliation.congress.startDate
# 5a.(if applicable) jobPositions.startDate
# 6. jobPositions.congressAffiliation.congress.endDate
# 6a.(if applicable) jobPositions.endDate
# 7. jobPositions.congressAffiliation.represents.regionType
# 8. jobPositions.congressAffiliation.represents.regionCode
# 9. jobPositions.congressAffiliation.partyAffiliation.party.name
# """

# # iterate through a set of congress numbers to extract job positions
# # range(86, 119) covers congresses 86 through 118 (the stop value is excluded);
# # 86 is just after HI and AK were admitted
# congress_numbers = range(86, 119)

# # Use aggregation to extract job positions for specific congress
# pipeline = [
#     # Match members who served in the congress
#     {"$match": {"jobPositions.congressAffiliation.congress.congressNumber": congress_numbers}},
    
#     # Unwind the jobPositions array to work with individual positions
#     {"$unwind": "$jobPositions"},
    
#     # Filter to only the matching congress
#     {"$match": {"jobPositions.congressAffiliation.congress.congressNumber": congress_numbers}},
    
#     # Project the fields we want
#     {"$project": {
#         "_id": 0,
#         "bioguide_id": "$usCongressBioId",
#         "chamber": "$jobPositions.job.name",
#         "job_type": "$jobPositions.job.jobType",
#         "congress_number": "$jobPositions.congressAffiliation.congress.congressNumber",
#         "congress_start_date": "$jobPositions.congressAffiliation.congress.startDate",
#         "job_start_date": "$jobPositions.startDate",
#         "congress_end_date": "$jobPositions.congressAffiliation.congress.endDate",
#         "job_end_date": "$jobPositions.endDate", 
#         "region_type": "$jobPositions.congressAffiliation.represents.regionType",
#         "region_code": "$jobPositions.congressAffiliation.represents.regionCode",
#         "party_name": "$jobPositions.congressAffiliation.partyAffiliation.party.name"

#     }}
# ]

# # Execute the pipeline through the range of congress numbers

# results = []
# for congress_number in congress_numbers:
#     # Update the pipeline with the current congress number
#     pipeline[0]["$match"]["jobPositions.congressAffiliation.congress.congressNumber"] = congress_number
#     pipeline[2]["$match"]["jobPositions.congressAffiliation.congress.congressNumber"] = congress_number
    
#     # Aggregate results for the current congress number
#     results.extend(list(collection.aggregate(pipeline)))

# # Convert directly to DataFrame
# df = pd.DataFrame(results)
# print(f"Total members found: {len(df)}")
# df.head()
# df.tail()
# # Save to CSV
# df.to_csv('data\\postings_v1.csv', index=False)

Total members found: 36674


Unnamed: 0,bioguide_id,chamber,job_type,congress_number,congress_start_date,congress_end_date,region_type,region_code,party_name,job_start_date,job_end_date
36669,W000828,Representative,CongressMemberJob,118,2023-01-03,2025-01-03,DistrictRegion,NY,[Republican],,
36670,W000829,Representative,CongressMemberJob,118,2023-01-03,2025-01-03,DistrictRegion,WI,[Republican],2024-11-05,
36671,Y000064,Senator,CongressMemberJob,118,2023-01-03,2025-01-03,StateRegion,IN,[Republican],,
36672,Y000067,Representative,CongressMemberJob,118,2023-01-03,2025-01-03,DistrictRegion,IN,[Republican],,
36673,Z000018,Representative,CongressMemberJob,118,2023-01-03,2025-01-03,DistrictRegion,MT,[Republican],,


In [None]:
"""
TABLE 2. Postings

There are 18 members of Congress who have served in more than one party affiliation during their careers. Can this data be extracted?

"""

# Congresses to extract. range() excludes its stop value, so this covers the
# 86th through the 118th Congress (the 86th is just after HI and AK were
# admitted). Raise the stop value to 120 to also include the 119th Congress.
congress_numbers = range(86, 119)

# Dotted path to the congress number inside each jobPositions entry
CONGRESS_NUMBER_PATH = "jobPositions.congressAffiliation.congress.congressNumber"


def build_postings_pipeline(congress_number):
    """Build the aggregation pipeline that extracts postings for one congress.

    Parameters
    ----------
    congress_number : int
        The congress number to filter job positions on.

    Returns
    -------
    list of dict
        Aggregation stages: match members with a position in that congress,
        unwind their positions, keep only the positions of that congress,
        unwind party affiliations, and project the output columns.
    """
    return [
        # Match members who served in the congress
        {"$match": {CONGRESS_NUMBER_PATH: congress_number}},

        # Unwind the jobPositions array to work with individual positions
        {"$unwind": "$jobPositions"},

        # Filter to only the matching congress
        {"$match": {CONGRESS_NUMBER_PATH: congress_number}},

        # Unwind partyAffiliation to handle members who switched parties;
        # positions without a party affiliation are kept rather than dropped
        {"$unwind": {
            "path": "$jobPositions.congressAffiliation.partyAffiliation",
            "preserveNullAndEmptyArrays": True,
        }},

        # Project the fields we want
        {"$project": {
            "_id": 0,
            "bioguide_id": "$usCongressBioId",
            "chamber": "$jobPositions.job.name",
            "job_type": "$jobPositions.job.jobType",
            "congress_number": "$jobPositions.congressAffiliation.congress.congressNumber",
            "congress_start_date": "$jobPositions.congressAffiliation.congress.startDate",
            "party_start_date": "$jobPositions.congressAffiliation.partyAffiliation.startDate",
            "party_end_date": "$jobPositions.congressAffiliation.partyAffiliation.endDate",
            "job_start_date": "$jobPositions.startDate",
            "congress_end_date": "$jobPositions.congressAffiliation.congress.endDate",
            "job_end_date": "$jobPositions.endDate",
            "region_type": "$jobPositions.congressAffiliation.represents.regionType",
            "region_code": "$jobPositions.congressAffiliation.represents.regionCode",
            "party_name": "$jobPositions.congressAffiliation.partyAffiliation.party.name",
        }},
    ]


# Execute the pipeline through the range of congress numbers.
# A fresh pipeline is built per congress instead of mutating a shared one by
# list index, so every query is self-contained and never holds a placeholder.
results = []
for congress_number in congress_numbers:
    pipeline = build_postings_pipeline(congress_number)
    # Aggregate results for the current congress number
    results.extend(collection.aggregate(pipeline))

# Convert directly to DataFrame
df = pd.DataFrame(results)
# Note: each row is one posting (member x congress x party affiliation),
# so this count is the number of rows, not of distinct members.
print(f"Total members found: {len(df)}")

# Save to CSV (forward slashes keep the relative path portable across OSes)
df.to_csv('data/raw/postings_raw.csv', index=False)

# Preview the last rows; this has to be the cell's final expression,
# otherwise the notebook does not render it.
df.tail()

Total members found: 36714
