In [107]:
import re
from pathlib import Path

import pandas as pd

In [153]:
# Location of the raw CSV, relative to the notebook's working directory.
file_path = './data/health insurance coverage by age.csv'

# Load the file with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)

In [154]:
# Skip the first column, then keep every other remaining column
# (original positions 1, 3, 5, ...). Assumes column labels are unique,
# which is what read_csv produces by default.
columns_to_keep = df.columns[1::2]

# Take an explicit copy: this frame is modified column-by-column further
# down, and assigning into a bare selection of `df` raises
# SettingWithCopyWarning on classic pandas.
new_df = df[columns_to_keep].copy()

# Display the DataFrame with every other column deleted
print("\nDataFrame with every other column deleted:")
print(new_df)


DataFrame with every other column deleted:
    Block Group 1, Census Tract 702, Montgomery County, New York!!Estimate  \
0                                                 435                        
1                                                  72                        
2                                                  72                        
3                                                  43                        
4                                                   0                        
..                                                ...                        
61                                                  0                        
62                                                  0                        
63                                                  0                        
64                                                 22                        
65                                                  0                        

    Block Group 2, 

In [155]:
# Coerce every column to a numeric dtype; entries that cannot be parsed
# become NaN (errors='coerce'). Rebinding the name to the converted frame
# avoids assigning into a selection one column at a time.
new_df = new_df.apply(pd.to_numeric, errors='coerce', downcast='integer')

# Positional row groups to add up. The three groups are labelled
# "total", "medicaid" and "private" (in this order) in a later cell.
specific_rows = [[1, 17, 33], [6, 22, 38], [3, 4, 19, 20, 35, 36]]

# One row of sums_df per row group, one column per column of new_df.
# DataFrame.sum skips NaN by default, so coerced entries do not contribute;
# the cast keeps the totals as integers, as the original int(...) did.
sums_df = pd.DataFrame(
    [new_df.iloc[row_group].sum() for row_group in specific_rows]
).astype(int)

# Display the sums_df DataFrame
print(sums_df)

  Block Group 1, Census Tract 702, Montgomery County, New York!!Estimate  \
0                                                316                       
1                                                 50                       
2                                                237                       

  Block Group 2, Census Tract 702, Montgomery County, New York!!Estimate  \
0                                                677                       
1                                                533                       
2                                                 75                       

  Block Group 3, Census Tract 702, Montgomery County, New York!!Estimate  \
0                                                467                       
1                                                127                       
2                                                203                       

  Block Group 1, Census Tract 703, Montgomery County, New York!!Estimate  \
0        

In [156]:
# Derive a geo_id string from each column header. Headers are expected to
# look like "Block Group <n>, Census Tract <m>, ..." (see the printed output).
def _first_number(text):
    """Return the first numeric token found in `text` (IndexError if none)."""
    return re.findall(r"[-+]?\d*\.\d+|\d+", text)[0]


geo_ids = []
for header in sums_df.columns:
    pieces = header.split(',')

    # The second comma-separated piece holds the tract, the first the block group.
    tract_number = _first_number(pieces[1])
    group_number = _first_number(pieces[0])

    geo_ids.append('0' + tract_number + '00' + group_number)

# Transpose so that each block group becomes a row
sums_df_flip = sums_df.T

# Name the three summed row groups, then attach the geo_ids as an extra column
sums_df_flip.columns = ["total", "medicaid", "private"]
sums_df_flip["geo_id"] = geo_ids

print(sums_df_flip)


                                                   total medicaid private  \
Block Group 1, Census Tract 702, Montgomery Cou...   316       50     237   
Block Group 2, Census Tract 702, Montgomery Cou...   677      533      75   
Block Group 3, Census Tract 702, Montgomery Cou...   467      127     203   
Block Group 1, Census Tract 703, Montgomery Cou...   796      443      96   
Block Group 2, Census Tract 703, Montgomery Cou...  1013      618     275   
Block Group 1, Census Tract 704, Montgomery Cou...   495       82     266   
Block Group 2, Census Tract 704, Montgomery Cou...  1342      390     668   
Block Group 1, Census Tract 705, Montgomery Cou...  1164      402     702   
Block Group 2, Census Tract 705, Montgomery Cou...   891       99     638   
Block Group 1, Census Tract 706, Montgomery Cou...   936      449     411   
Block Group 2, Census Tract 706, Montgomery Cou...  1171      487     493   
Block Group 1, Census Tract 707, Montgomery Cou...   947      266     530   

In [157]:
# File path where you want to save the CSV file
output_file_path = './data/outputs/health_insurance_estimates.csv'

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

# Export the 'sums_df_flip' DataFrame to a CSV file. index=False drops the
# long block-group row labels; the geo_id column identifies each row instead.
sums_df_flip.to_csv(output_file_path, index=False)

print(f"CSV file '{output_file_path}' has been created.")

CSV file './data/outputs/health_insurance_estimates.csv' has been created.
