In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pathlib import Path



In [2]:
# Collect every JSON file under the extraction directory (searched recursively).
# The directory scan behind rglob() does not guarantee any particular order, so
# the paths are sorted: positional picks such as json_files[-50] or
# json_files[:10] then refer to the same files on every run.
extraction_dir = "D:\\bioguides"
json_files = sorted(Path(extraction_dir).rglob('*.json'))
print(f'Number of JSON files found: {len(json_files)}')

Number of JSON files found: 13047


In [3]:
# Read one JSON file as a sample and pretty-print it.
# The encoding is passed explicitly (as the other cells that read these files
# do) so decoding does not depend on the platform's default encoding.
with open(json_files[-50], 'r', encoding='utf-8') as f:
    sample_data = json.load(f)
print(json.dumps(sample_data, indent=4))

{
    "usCongressBioId": "Y000036",
    "familyName": "Young",
    "givenName": "George",
    "middleName": "Morley",
    "honorificPrefix": "Mr.",
    "unaccentedFamilyName": "Young",
    "unaccentedGivenName": "George",
    "unaccentedMiddleName": "Morley",
    "birthDate": "1870-12-11",
    "birthCirca": false,
    "deathDate": "1932-05-27",
    "deathCirca": false,
    "image": [
        {
            "contentUrl": "/bioguide-published/assets/0aaf9052b3b44a92986fb46a77815338.jpg",
            "caption": "Collection of the U.S. House of Representatives"
        }
    ],
    "profileText": "A Representative from North Dakota; born in Lakelet, Huron County, Ontario, Canada, December 11, 1870; when a boy moved to the United States and settled in St. Charles, Mich.; attended the public schools; was graduated from the law department of the University of Minnesota at Minneapolis in 1894; was admitted to the bar the same year and commenced practice in Valley City, N.Dak.; member of the boa

In [4]:
# Peek at the first few files to see which top-level keys they expose
sample_size = 10
sample_files = json_files[:sample_size]

# Parse each sampled file (read as UTF-8 text) into a Python object
sample_data = [
    json.loads(path.read_text(encoding='utf-8'))
    for path in sample_files
]

# Show the top-level keys of every sampled record
print("Checking top-level keys in sample files:\n")
for position, record in enumerate(sample_data):
    print(f"File {position}: {list(record.keys())}")

Checking top-level keys in sample files:

File 0: ['data']
File 1: ['usCongressBioId', 'familyName', 'givenName', 'middleName', 'honorificPrefix', 'unaccentedFamilyName', 'unaccentedGivenName', 'unaccentedMiddleName', 'birthDate', 'birthCirca', 'deathDate', 'deathCirca', 'image', 'profileText', 'relationship', 'asset', 'jobPositions', 'creativeWork', 'researchRecord', 'deleted']
File 2: ['data']
File 3: ['data']
File 4: ['usCongressBioId', 'familyName', 'givenName', 'honorificPrefix', 'nickName', 'unaccentedFamilyName', 'unaccentedGivenName', 'birthDate', 'birthCirca', 'birthDateUnknown', 'deathDate', 'deathCirca', 'image', 'profileText', 'relationship', 'asset', 'jobPositions', 'creativeWork', 'researchRecord', 'deleted']
File 5: ['usCongressBioId', 'familyName', 'givenName', 'middleName', 'unaccentedFamilyName', 'unaccentedGivenName', 'unaccentedMiddleName', 'birthDate', 'birthCirca', 'deathDate', 'deathCirca', 'image', 'profileText', 'relationship', 'asset', 'jobPositions', 'creativ

In [5]:
# Strip the single-key {"data": ...} wrapper from every file that has one and
# save the unwrapped record back to the same path. Files without the wrapper
# are left untouched.
files_modified = 0

for file in json_files:
    with open(file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Only a dict whose sole top-level key is 'data' counts as wrapped
    if isinstance(json_data, dict) and 'data' in json_data and len(json_data) == 1:
        # Extract the data from the wrapper
        unwrapped_data = json_data['data']

        # Write to a sibling temp file first and then swap it into place, so a
        # failure part-way through the write cannot leave the original file
        # truncated. The '.tmp' suffix keeps the temp file out of '*.json' globs.
        temp_file = file.with_name(file.name + '.tmp')
        with open(temp_file, 'w', encoding='utf-8') as f:
            json.dump(unwrapped_data, f, indent=4)
        temp_file.replace(file)

        files_modified += 1

print(f"Files modified: {files_modified}")
print(f"Files unchanged: {len(json_files) - files_modified}")

Files modified: 4556
Files unchanged: 8491


In [6]:
# Load the first few files and list the top-level keys found in each one
sample_size = 10
sample_files = json_files[:sample_size]

# Parse the sampled files one by one
sample_data = []
for sample_path in sample_files:
    with sample_path.open('r', encoding='utf-8') as handle:
        sample_data.append(json.load(handle))

# Report the keys (future columns) per file
print("Checking top-level keys in sample files:\n")
for file_number, contents in enumerate(sample_data):
    key_names = list(contents.keys())
    print(f"File {file_number}: {key_names}")

Checking top-level keys in sample files:

File 0: ['usCongressBioId', 'familyName', 'givenName', 'middleName', 'honorificPrefix', 'unaccentedFamilyName', 'unaccentedGivenName', 'unaccentedMiddleName', 'birthDate', 'birthCirca', 'birthDateUnknown', 'deathDate', 'deathCirca', 'deathDateUnknown', 'image', 'profileText', 'relationship', 'asset', 'jobPositions', 'creativeWork', 'researchRecord', 'deleted']
File 1: ['usCongressBioId', 'familyName', 'givenName', 'middleName', 'honorificPrefix', 'unaccentedFamilyName', 'unaccentedGivenName', 'unaccentedMiddleName', 'birthDate', 'birthCirca', 'deathDate', 'deathCirca', 'image', 'profileText', 'relationship', 'asset', 'jobPositions', 'creativeWork', 'researchRecord', 'deleted']
File 2: ['usCongressBioId', 'familyName', 'givenName', 'honorificPrefix', 'unaccentedFamilyName', 'unaccentedGivenName', 'birthDate', 'birthCirca', 'birthDateUnknown', 'deathDate', 'deathCirca', 'deathDateUnknown', 'profileText', 'relationship', 'asset', 'jobPositions', '

In [None]:
# Build one DataFrame from all JSON files: one row per file, one column per
# top-level key seen in any file (columns appear in first-seen key order).
data_rows = []
for path in json_files:
    with path.open('r', encoding='utf-8') as handle:
        record = json.load(handle)

    # Shallow-copy the record's top-level items into this file's row
    data_rows.append(dict(record.items()))

# Assemble the rows into the DataFrame and report its shape and columns
df = pd.DataFrame(data_rows)
print(f"Dataframe shape: {df.shape}")
print("Dataframe columns:")
print(df.columns.tolist())



Dataframe shape: (13047, 25)
Dataframe columns:
['usCongressBioId', 'familyName', 'givenName', 'middleName', 'honorificPrefix', 'unaccentedFamilyName', 'unaccentedGivenName', 'unaccentedMiddleName', 'birthDate', 'birthCirca', 'birthDateUnknown', 'deathDate', 'deathCirca', 'deathDateUnknown', 'image', 'profileText', 'relationship', 'asset', 'jobPositions', 'creativeWork', 'researchRecord', 'deleted', 'nickName', 'honorificSuffix', 'nameHistory']


In [None]:
# Identify the order of keys to standardize on based on the files where the keys appear