# Data preparation for the similarity measure

### Goal: Merge the individual datasets into a single DataFrame with key attributes that will facilitate computing the similarity between counties. 

```markdown
Inputs: 
    - cleaned_countyoutflow.csv: cleaned outlfow migration data, 
    - Zillow_Data_Yearly_FIPS.csv: housing data (used only the last 6 years that had minimal null values to calculate the house index)
    - health_data_clean.csv: cleaned health data
    - Temperature.csv: temperature by county in F

outputs: 
    - merged dataset of shape (3054, 22)

# Imports

In [510]:
import pandas as pd

# Importing data

In [511]:

outflow = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/network_graph/inputs_data/cleaned_countyoutflow.csv')
house_pricing = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/network_graph/inputs_data/zillow_data_raw.csv')
health = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/network_graph/inputs_data/health_data_clean.csv')
temperature = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/network_graph/inputs_data/temperature.csv', on_bad_lines='skip') # There is 1 record with bad lines

## Part 1: Migration data

In [512]:
outflow.head(3)

Unnamed: 0,year,origin_state_fips,origin_county_fips,destination_state_fips,destination_county_fips,y1fips,y2fips,y2_state,y2_countyname,num_returns,num_individuals,adjusted_gross_income
0,16-17,1,1,1,1,1001,1001,AL,Autauga County Non-migrants,17484,39711,1106647
1,16-17,1,21,1,1,1021,1001,AL,Autauga County,83,220,3006
2,16-17,1,47,1,1,1047,1001,AL,Autauga County,82,195,2917


In [513]:
unique_states = outflow['y2_state'].unique()
state_fips_mapping = dict(zip(unique_states, outflow['destination_state_fips'].unique()))
state_fips_mapping = {int(v): k for k, v in state_fips_mapping.items()}

# Remap origin_state_fips to state initials using state_fips_mapping
outflow['origin_state'] = outflow['origin_state_fips'].map(state_fips_mapping)

# Display the updated DataFrame
outflow

Unnamed: 0,year,origin_state_fips,origin_county_fips,destination_state_fips,destination_county_fips,y1fips,y2fips,y2_state,y2_countyname,num_returns,num_individuals,adjusted_gross_income,origin_state
0,16-17,1,1,1,1,1001,1001,AL,Autauga County Non-migrants,17484,39711,1106647,AL
1,16-17,1,21,1,1,1021,1001,AL,Autauga County,83,220,3006,AL
2,16-17,1,47,1,1,1047,1001,AL,Autauga County,82,195,2917,AL
3,16-17,1,51,1,1,1051,1001,AL,Autauga County,535,1155,23666,AL
4,16-17,1,73,1,1,1073,1001,AL,Autauga County,53,108,2488,AL
...,...,...,...,...,...,...,...,...,...,...,...,...,...
447477,20-21,56,45,59,0,56045,59000,DS,Other flows - Different State,79,158,4320,WY
447478,20-21,56,45,59,1,56045,59001,DS,Other flows - Northeast,-1,-1,-1,WY
447479,20-21,56,45,59,3,56045,59003,DS,Other flows - Midwest,43,88,2166,WY
447480,20-21,56,45,59,5,56045,59005,DS,Other flows - South,-1,-1,-1,WY


In [589]:
mapping = {}
for index, row in outflow.iterrows():
    y2fips = row['y2_state'] + '_' + str(row['destination_state_fips'])  
    y2_state = row['y2_state']
    y2_countyname = row['y2_countyname']
    mapping[y2fips] =  y2_state+'_' + y2_countyname

In [595]:
mapping_df = pd.DataFrame.from_dict(mapping, orient='index', columns=['county'])
mapping_df.reset_index(inplace=True)
mapping_df.rename(columns={'index': 'code'}, inplace=True)
mapping_df

Unnamed: 0,code,county
0,AL_1,AL_Madison County
1,AK_2,AK_Anchorage Municipality
2,AZ_4,AZ_Maricopa County
3,AR_5,AR_Benton County
4,CA_6,CA_Santa Barbara County
...,...,...
202,VT_98,VT_Windsor County Total Migration-Foreign
203,WA_98,WA_Thurston County Total Migration-Foreign
204,WI_98,WI_Milwaukee County Total Migration-Foreign
205,WV_98,WV_Wood County Total Migration-Foreign


In [598]:
inflow = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/network_graph/inputs_data/countyinflow16-21_compiled.csv')

In [599]:
inflow.head()

Unnamed: 0,year,y2_statefips,y2_countyfips,y1_statefips,y1_countyfips,y2_fips,y1_fips,y1_state,y1_countyname,n1,n2,agi
0,16-17,1,0,96,0,1000,96000,AL,Total Migration-US and Foreign,130855,273891,7058182
1,16-17,1,0,97,0,1000,97000,AL,Total Migration-US,130235,272345,7022706
2,16-17,1,0,97,1,1000,97001,AL,Total Migration-Same State,71530,148378,3510398
3,16-17,1,0,97,3,1000,97003,AL,Total Migration-Different State,58705,123967,3512308
4,16-17,1,0,98,0,1000,98000,AL,Total Migration-Foreign,620,1546,35476


In [None]:
mapping = {}
for index, row in inflow.iterrows():
    y2fips = row['y1_state'] + '_' + str(row['y1_statefips'])  
    y2_state = row['y2_state']
    y2_countyname = row['y2_countyname']
    mapping[y2fips] =  y2_state+'_' + y2_countyname

In [596]:
mapping_df.to_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/network_graph/inputs_data/mapping_df.csv', index=False)

In [514]:
outflow.y1fips.min()

np.int64(1)

In [515]:
len(outflow)

447482

In [516]:
outflow.year.unique()

array(['16-17', '17-18', '18-19', '19-20', '20-21'], dtype=object)

Grouping outflow by year and to have a single line for each county. The aggregation method is mean across attributes.

In [517]:
cols = ['origin_state', 'origin_county_fips', 'num_returns', 'num_individuals','adjusted_gross_income']

In [518]:
outflow_group  = outflow[cols]
outflow_group = outflow_group .groupby(['origin_state','origin_county_fips']).mean().reset_index()
outflow_group.head(3)

Unnamed: 0,origin_state,origin_county_fips,num_returns,num_individuals,adjusted_gross_income
0,AK,0,16976.9,34483.7,1025512.0
1,AK,13,76.803279,141.491803,3927.951
2,AK,16,178.544118,341.294118,11898.51


In [519]:
print(f"lenght of grouped outflow: ",len(outflow_group))

lenght of grouped outflow:  3195


## Part 2: House pricing

``` markdown
- Clean up the county idenfiers
- Calcule the net change by county between  2000 and 2024 (net_housing_price_change)
- Extract the current house price as 2024 (2024_house_price)
- Merge with migration data

In [520]:
# house_pricing = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/Network_graph_analysis/Zillow_Data_Yearly_FIPS.csv')
house_pricing.head(3)

Unnamed: 0,FIPS,RegionName,State,FIPS.1,2000,2001,2002,2003,2004,2005,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,2020,Anchorage Borough,AK,2020,146144.3161,169973.1478,186475.8294,195969.8382,209724.6331,229868.8594,...,304919.1673,311721.6694,310289.3725,312758.8579,330962.7465,338964.0436,344423.7324,365491.1856,377498.0278,384018.9248
1,2090,Fairbanks North Star Borough,AK,2090,,,,,,188322.2179,...,225121.4489,233876.6726,243537.0955,252751.346,265862.9022,277602.7961,285834.8267,300548.1123,296752.7637,301036.6606
2,2100,Haines Borough,AK,2100,,,,,,,...,,,,,,,284891.8249,315908.794,298008.7765,294193.3742


Use the raw dataset that has all the counties

In [521]:
len(house_pricing)

3074

In [522]:
house_pricing.isna().sum()

FIPS             0
RegionName       0
State            0
FIPS.1           0
2000          2020
2001          1979
2002          1938
2003          1880
2004          1837
2005          1788
2006          1758
2007          1704
2008          1634
2009          1059
2010           985
2011           921
2012           647
2013           597
2014           560
2015           529
2016           101
2017            94
2018            86
2019            68
2020            63
2021            51
2022            12
2023             2
2024             0
dtype: int64

In [523]:
cols_to_drop = ['2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015'] 
cols_to_fillna = ['2016','2017','2018','2019','2020','2021','2022','2023','2024'] 

In [524]:
house_pricing_clean = house_pricing.drop(cols_to_drop, axis=1)
house_pricing_clean[cols_to_fillna] = house_pricing_clean[cols_to_fillna].fillna(house_pricing_clean[cols_to_fillna].mean())

In [525]:
house_pricing_clean.isna().sum()

FIPS          0
RegionName    0
State         0
FIPS.1        0
2016          0
2017          0
2018          0
2019          0
2020          0
2021          0
2022          0
2023          0
2024          0
dtype: int64

Calculating the mean house price and price index

In [526]:
house_pricing_clean['mean_house_price'] = house_pricing_clean[cols_to_fillna].mean(axis=1)

In [527]:
house_pricing_clean['price_index'] = house_pricing_clean['mean_house_price'] / house_pricing_clean['2016'] * 100

In [528]:
house_pricing_clean.head(3)

Unnamed: 0,FIPS,RegionName,State,FIPS.1,2016,2017,2018,2019,2020,2021,2022,2023,2024,mean_house_price,price_index
0,2020,Anchorage Borough,AK,2020,311721.6694,310289.3725,312758.8579,330962.7465,338964.0436,344423.7324,365491.1856,377498.0278,384018.9248,341792.062278,109.646552
1,2090,Fairbanks North Star Borough,AK,2090,233876.6726,243537.0955,252751.346,265862.9022,277602.7961,285834.8267,300548.1123,296752.7637,301036.6606,273089.241744,116.766345
2,2100,Haines Borough,AK,2100,152894.894751,158926.361118,167502.013802,175436.00027,187249.80867,284891.8249,315908.794,298008.7765,294193.3742,226112.427579,147.887494


In [529]:
# defining a function to format the house_pricing data to be the same as the merged migration 
def county_code_reformat(value):
        # ensure the value is a string
        num_str = str(value)
        num_str = num_str[-3:]
            # check if the new first character is zero
        if num_str[0] == '0':
            # remove the zero
            num_str = num_str[1:]
            # repeat
            if num_str[0] == '0':
                num_str = num_str[1:]
                return int(num_str)
            else:
                 return int(num_str)
        else:
             return int(num_str)

In [530]:
house_pricing_clean['short_county_code'] = house_pricing_clean['FIPS'].apply(county_code_reformat) 
null_values = house_pricing_clean['short_county_code'].isnull().sum()
print(f"Number of null values in short_county_code column: {null_values}")

Number of null values in short_county_code column: 0


In [531]:
# cast the typpe for the short_county_code as integer
house_pricing_clean['short_county_code'] = house_pricing_clean['short_county_code'].astype(int)
house_pricing_clean.head(3)


Unnamed: 0,FIPS,RegionName,State,FIPS.1,2016,2017,2018,2019,2020,2021,2022,2023,2024,mean_house_price,price_index,short_county_code
0,2020,Anchorage Borough,AK,2020,311721.6694,310289.3725,312758.8579,330962.7465,338964.0436,344423.7324,365491.1856,377498.0278,384018.9248,341792.062278,109.646552,20
1,2090,Fairbanks North Star Borough,AK,2090,233876.6726,243537.0955,252751.346,265862.9022,277602.7961,285834.8267,300548.1123,296752.7637,301036.6606,273089.241744,116.766345,90
2,2100,Haines Borough,AK,2100,152894.894751,158926.361118,167502.013802,175436.00027,187249.80867,284891.8249,315908.794,298008.7765,294193.3742,226112.427579,147.887494,100


In [532]:
house_attributes = house_pricing_clean[['State','short_county_code', '2024','price_index']]
house_attributes.rename(columns={'2024': 'current_house_price_2024'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  house_attributes.rename(columns={'2024': 'current_house_price_2024'}, inplace=True)


In [533]:
# house_pricing_clean = house_pricing_clean.groupby(['State', 'FIPS']).mean().reset_index()
len(house_pricing)

3074

In [534]:
house_attributes.head(3)

Unnamed: 0,State,short_county_code,current_house_price_2024,price_index
0,AK,20,384018.9248,109.646552
1,AK,90,301036.6606,116.766345
2,AK,100,294193.3742,147.887494


In [535]:
# house_pricing_cleaned = house_pricing[['State','short_county_code', 'house_index', 'housing_avg_value']]
full_dataset = outflow_group.merge(house_attributes, left_on=['origin_state','origin_county_fips'], right_on=['State', 'short_county_code'], how='left')
print(len(full_dataset))
full_dataset.isna().sum()

3195


origin_state                  0
origin_county_fips            0
num_returns                   0
num_individuals               0
adjusted_gross_income         0
State                       121
short_county_code           121
current_house_price_2024    121
price_index                 121
dtype: int64

In [536]:
full_dataset.dropna(inplace=True)

In [537]:
full_dataset

Unnamed: 0,origin_state,origin_county_fips,num_returns,num_individuals,adjusted_gross_income,State,short_county_code,current_house_price_2024,price_index
3,AK,20,1515.256757,3049.256757,128632.540541,AK,20.0,3.840189e+05,109.646552
10,AK,90,1109.310484,2316.608871,79035.072581,AK,90.0,3.010367e+05,116.766345
11,AK,100,116.833333,212.129630,8160.592593,AK,100.0,2.941934e+05,147.887494
13,AK,110,983.977778,1853.744444,82135.311111,AK,110.0,4.841831e+05,112.008456
14,AK,122,1631.415730,3294.910112,119191.123596,AK,122.0,3.251015e+05,118.538734
...,...,...,...,...,...,...,...,...,...
3190,WY,37,855.254098,1889.409836,64669.524590,WY,37.0,2.754624e+05,107.864895
3191,WY,39,857.385417,1534.052083,216688.625000,WY,39.0,2.293970e+06,154.976006
3192,WY,41,635.213333,1509.626667,43552.973333,WY,41.0,3.103467e+05,126.141561
3193,WY,43,323.068966,695.241379,23354.465517,WY,43.0,2.506729e+05,119.822445


# Temperature

In [538]:
temperature.head(3)

Unnamed: 0,ID,Name,State,Value,Anomaly (1901-2000 base period),Rank,1901-2000 Mean
0,AL-001,Autauga County,Alabama,66.8,2.2,102,64.6
1,AL-003,Baldwin County,Alabama,70.7,2.7,109,68.0
2,AL-005,Barbour County,Alabama,67.2,1.6,92,65.6


In [539]:
temperature[['state', 'county']] = temperature['ID'].str.split('-', expand=True)
temperature.drop(columns='ID', inplace=True)

In [540]:
temperature.head(3)

Unnamed: 0,Name,State,Value,Anomaly (1901-2000 base period),Rank,1901-2000 Mean,state,county
0,Autauga County,Alabama,66.8,2.2,102,64.6,AL,1
1,Baldwin County,Alabama,70.7,2.7,109,68.0,AL,3
2,Barbour County,Alabama,67.2,1.6,92,65.6,AL,5


In [541]:
temperature['short_code_county'] = temperature['county'].apply(county_code_reformat)

In [542]:
temperature.head(3)

Unnamed: 0,Name,State,Value,Anomaly (1901-2000 base period),Rank,1901-2000 Mean,state,county,short_code_county
0,Autauga County,Alabama,66.8,2.2,102,64.6,AL,1,1
1,Baldwin County,Alabama,70.7,2.7,109,68.0,AL,3,3
2,Barbour County,Alabama,67.2,1.6,92,65.6,AL,5,5


In [543]:
temperature_attributes = temperature[['state', 'short_code_county', 'Value']]
temperature_attributes.rename(columns={'Value': 'temperature'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temperature_attributes.rename(columns={'Value': 'temperature'}, inplace=True)


In [544]:
full_dataset = full_dataset.merge(temperature_attributes, left_on=['origin_state', 'origin_county_fips'], right_on=['state', 'short_code_county'], how='left')
full_dataset.drop(columns=['state', 'short_code_county'], inplace=True)

In [545]:
full_dataset

Unnamed: 0,origin_state,origin_county_fips,num_returns,num_individuals,adjusted_gross_income,State,short_county_code,current_house_price_2024,price_index,temperature
0,AK,20,1515.256757,3049.256757,128632.540541,AK,20.0,3.840189e+05,109.646552,
1,AK,90,1109.310484,2316.608871,79035.072581,AK,90.0,3.010367e+05,116.766345,
2,AK,100,116.833333,212.129630,8160.592593,AK,100.0,2.941934e+05,147.887494,
3,AK,110,983.977778,1853.744444,82135.311111,AK,110.0,4.841831e+05,112.008456,
4,AK,122,1631.415730,3294.910112,119191.123596,AK,122.0,3.251015e+05,118.538734,
...,...,...,...,...,...,...,...,...,...,...
3069,WY,37,855.254098,1889.409836,64669.524590,WY,37.0,2.754624e+05,107.864895,49.0
3070,WY,39,857.385417,1534.052083,216688.625000,WY,39.0,2.293970e+06,154.976006,41.5
3071,WY,41,635.213333,1509.626667,43552.973333,WY,41.0,3.103467e+05,126.141561,47.4
3072,WY,43,323.068966,695.241379,23354.465517,WY,43.0,2.506729e+05,119.822445,51.6


In [546]:
full_dataset.isna().sum()

origin_state                 0
origin_county_fips           0
num_returns                  0
num_individuals              0
adjusted_gross_income        0
State                        0
short_county_code            0
current_house_price_2024     0
price_index                  0
temperature                 20
dtype: int64

In [547]:
full_dataset.dropna(subset=['temperature'], inplace=True)

In [548]:
len(full_dataset)

3054

In [549]:
full_dataset.isna().sum()

origin_state                0
origin_county_fips          0
num_returns                 0
num_individuals             0
adjusted_gross_income       0
State                       0
short_county_code           0
current_house_price_2024    0
price_index                 0
temperature                 0
dtype: int64

In [550]:
full_dataset = full_dataset.drop(columns=['State', 'short_county_code'])

## Health data

In [551]:
# health = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/network_graph/inputs_data/health_data_clean.csv')

In [552]:
# health = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/Network_graph_analysis/health_data_clean.csv')
health.head(3)

Unnamed: 0,Year,FIPS,State,County,X..Fair.Poor,Physically.Unhealthy.Days,Mentally.Unhealthy.Days,X..Smokers,X..Obese,X..Physically.Inactive,X..Excessive.Drinking,X..Some.College,Population.1,X..Some.College.1,X..Social.Associations,Association.Rate,X..Severe.Housing.Problems,X..Insufficient.Sleep
0,2017,1001,Alabama,Autauga,18,4.2,4.2,17,34,29,15,8617,14440,59.7,76,13.7,15,38
1,2017,1003,Alabama,Baldwin,16,3.7,4.0,18,27,22,15,29788,47367,62.9,231,11.5,15,33
2,2017,1005,Alabama,Barbour,25,4.8,4.8,23,44,32,12,2839,7230,39.3,22,8.2,16,40


In [553]:
health_to_rename = ['X..Fair.Poor', 'Physically.Unhealthy.Days',
       'Mentally.Unhealthy.Days', 'X..Smokers', 'X..Obese',
       'X..Physically.Inactive', 'X..Excessive.Drinking', 'X..Some.College',
       'Population.1', 'X..Some.College.1', 'X..Social.Associations',
       'Association.Rate', 'X..Severe.Housing.Problems',
       'X..Insufficient.Sleep']

health_cols_to_drop = ['X..Some.College.1',
       'Association.Rate', 'X..Insufficient.Sleep']

In [554]:
health.drop(health_cols_to_drop, axis=1, inplace=True)

In [555]:
# rename the only the health_to_rename columns in  health columns to drop the X.. prefix, capital letters and replace . with _
health.rename(columns={col: col.replace('X..', '').replace('.', '_').replace(' ', '_').lower() for col in health_to_rename}, inplace=True)
health


Unnamed: 0,Year,FIPS,State,County,fair_poor,physically_unhealthy_days,mentally_unhealthy_days,smokers,obese,physically_inactive,excessive_drinking,some_college,population_1,social_associations,severe_housing_problems
0,2017,1001,Alabama,Autauga,18,4.2,4.2,17,34,29,15,8617,14440,76,15
1,2017,1003,Alabama,Baldwin,16,3.7,4.0,18,27,22,15,29788,47367,231,15
2,2017,1005,Alabama,Barbour,25,4.8,4.8,23,44,32,12,2839,7230,22,16
3,2017,1007,Alabama,Bibb,20,4.4,4.4,20,40,34,14,3280,6531,23,14
4,2017,1009,Alabama,Blount,19,4.5,4.5,21,35,28,14,7279,14213,49,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15699,2021,56037,Wyoming,Sweetwater,16,3.6,4.0,19,30,24,19,7529,12002,43,11
15700,2021,56039,Wyoming,Teton,11,2.9,3.3,13,11,11,24,6517,8207,40,16
15701,2021,56041,Wyoming,Uinta,17,4.0,4.2,21,35,25,18,2821,5150,6,11
15702,2021,56043,Wyoming,Washakie,17,3.7,4.1,19,29,29,19,1072,1779,12,10


In [556]:
health.isna().sum()

Year                         0
FIPS                         0
State                        0
County                       0
fair_poor                    0
physically_unhealthy_days    0
mentally_unhealthy_days      0
smokers                      0
obese                        0
physically_inactive          0
excessive_drinking           0
some_college                 0
population_1                 0
social_associations          0
severe_housing_problems      0
dtype: int64

In [557]:
health['short_county_code'] = health['FIPS'].apply(county_code_reformat)    

In [558]:
state_to_abbreviation = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR", "California": "CA", 
    "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "Florida": "FL", "Georgia": "GA",
    "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA",
    "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT",
    "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM",
    "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT",
    "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY"
}

In [559]:
health['State_short'] = health['State'].map(state_to_abbreviation)
health['State_short'].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', nan, 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'], dtype=object)

In [560]:
health.drop(columns=['State', 'County', 'FIPS'], inplace=True)
health.rename(columns={'State_short': 'State', 'short_county_code':'FIPS'}, inplace=True)

In [561]:
health.head(3)

Unnamed: 0,Year,fair_poor,physically_unhealthy_days,mentally_unhealthy_days,smokers,obese,physically_inactive,excessive_drinking,some_college,population_1,social_associations,severe_housing_problems,FIPS,State
0,2017,18,4.2,4.2,17,34,29,15,8617,14440,76,15,1,AL
1,2017,16,3.7,4.0,18,27,22,15,29788,47367,231,15,3,AL
2,2017,25,4.8,4.8,23,44,32,12,2839,7230,22,16,5,AL


In [562]:
health_grouped = health.groupby(by=['State', 'FIPS']).mean().reset_index()
health_grouped.drop(columns= 'Year', inplace = True)
len(health_grouped)

3142

In [563]:
health.columns

Index(['Year', 'fair_poor', 'physically_unhealthy_days',
       'mentally_unhealthy_days', 'smokers', 'obese', 'physically_inactive',
       'excessive_drinking', 'some_college', 'population_1',
       'social_associations', 'severe_housing_problems', 'FIPS', 'State'],
      dtype='object')

In [564]:
health_grouped.head(3)

Unnamed: 0,State,FIPS,fair_poor,physically_unhealthy_days,mentally_unhealthy_days,smokers,obese,physically_inactive,excessive_drinking,some_college,population_1,social_associations,severe_housing_problems
0,AK,13,17.6,3.62,2.94,18.2,42.2,26.6,16.8,434.8,1005.6,1.6,13.0
1,AK,16,15.6,3.22,2.78,16.2,41.2,25.8,18.6,987.8,1892.8,2.8,19.0
2,AK,20,13.6,3.48,3.32,15.4,29.2,17.4,20.0,63400.2,89256.8,339.4,16.6


In [565]:
health_grouped.isna().sum()

State                        0
FIPS                         0
fair_poor                    0
physically_unhealthy_days    0
mentally_unhealthy_days      0
smokers                      0
obese                        0
physically_inactive          0
excessive_drinking           0
some_college                 0
population_1                 0
social_associations          0
severe_housing_problems      0
dtype: int64

In [566]:
full_dataset

Unnamed: 0,origin_state,origin_county_fips,num_returns,num_individuals,adjusted_gross_income,current_house_price_2024,price_index,temperature
14,AL,1,1157.678261,2613.026087,73080.478261,2.399009e+05,119.895294,66.8
15,AL,3,2318.377990,4804.588517,165935.966507,3.876260e+05,136.772357,70.7
16,AL,5,488.840426,1026.468085,21870.553191,1.435597e+05,127.522538,67.2
17,AL,7,506.294872,1136.474359,27152.615385,1.883447e+05,123.796660,66.2
18,AL,9,1099.470588,2524.274510,61169.490196,2.245398e+05,128.134665,64.9
...,...,...,...,...,...,...,...,...
3069,WY,37,855.254098,1889.409836,64669.524590,2.754624e+05,107.864895,49.0
3070,WY,39,857.385417,1534.052083,216688.625000,2.293970e+06,154.976006,41.5
3071,WY,41,635.213333,1509.626667,43552.973333,3.103467e+05,126.141561,47.4
3072,WY,43,323.068966,695.241379,23354.465517,2.506729e+05,119.822445,51.6


In [567]:
similarity_attributes = full_dataset.merge(health_grouped, left_on=['origin_state', 'origin_county_fips'], right_on=['State', 'FIPS'], how= 'left')


In [568]:
similarity_attributes.head(3)

Unnamed: 0,origin_state,origin_county_fips,num_returns,num_individuals,adjusted_gross_income,current_house_price_2024,price_index,temperature,State,FIPS,...,physically_unhealthy_days,mentally_unhealthy_days,smokers,obese,physically_inactive,excessive_drinking,some_college,population_1,social_associations,severe_housing_problems
0,AL,1,1157.678261,2613.026087,73080.478261,239900.8779,119.895294,66.8,AL,1,...,4.36,4.48,18.6,34.8,31.4,15.6,8765.4,14370.6,71.6,14.8
1,AL,3,2318.37799,4804.588517,165935.966507,387625.9624,136.772357,70.7,AL,3,...,3.94,4.3,17.6,29.6,24.6,17.2,31604.2,48264.6,225.0,14.0
2,AL,5,488.840426,1026.468085,21870.553191,143559.7134,127.522538,67.2,AL,5,...,5.2,4.96,23.0,43.0,28.6,12.6,2579.8,6918.6,20.6,15.0


In [569]:
similarity_attributes = similarity_attributes.drop(columns=['State','FIPS'])

In [570]:
similarity_attributes

Unnamed: 0,origin_state,origin_county_fips,num_returns,num_individuals,adjusted_gross_income,current_house_price_2024,price_index,temperature,fair_poor,physically_unhealthy_days,mentally_unhealthy_days,smokers,obese,physically_inactive,excessive_drinking,some_college,population_1,social_associations,severe_housing_problems
0,AL,1,1157.678261,2613.026087,73080.478261,2.399009e+05,119.895294,66.8,19.0,4.36,4.48,18.6,34.8,31.4,15.6,8765.4,14370.6,71.6,14.8
1,AL,3,2318.377990,4804.588517,165935.966507,3.876260e+05,136.772357,70.7,17.2,3.94,4.30,17.6,29.6,24.6,17.2,31604.2,48264.6,225.0,14.0
2,AL,5,488.840426,1026.468085,21870.553191,1.435597e+05,127.522538,67.2,27.4,5.20,4.96,23.0,43.0,28.6,12.6,2579.8,6918.6,20.6,15.0
3,AL,7,506.294872,1136.474359,27152.615385,1.883447e+05,123.796660,66.2,20.6,4.54,4.58,20.4,38.2,34.8,15.4,2894.6,6211.4,21.6,11.2
4,AL,9,1099.470588,2524.274510,61169.490196,2.245398e+05,128.134665,64.9,21.0,4.68,4.84,20.6,34.4,29.8,14.8,7531.2,14020.6,49.2,10.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3049,WY,37,855.254098,1889.409836,64669.524590,2.754624e+05,107.864895,49.0,15.4,3.58,3.58,18.2,30.6,25.4,20.2,7459.8,12645.4,43.4,10.2
3050,WY,39,857.385417,1534.052083,216688.625000,2.293970e+06,154.976006,41.5,11.8,3.06,3.18,14.0,12.6,11.6,22.4,6407.0,8052.4,41.6,17.2
3051,WY,41,635.213333,1509.626667,43552.973333,3.103467e+05,126.141561,47.4,16.0,3.78,3.80,18.4,32.6,24.6,17.2,2965.0,5285.8,9.2,11.6
3052,WY,43,323.068966,695.241379,23354.465517,2.506729e+05,119.822445,51.6,15.8,3.64,3.64,17.0,28.6,26.4,16.6,1069.4,1812.0,14.4,10.4


In [571]:
similarity_attributes.rename(columns={'origin_county_fips': 'county', 'origin_state': 'state'}, inplace=True)

In [574]:
# create a unique identifier for each county using both State and FIPS codes
similarity_attributes['id'] = similarity_attributes['state'] + '_' + similarity_attributes['county'].astype(str)
similarity_attributes = similarity_attributes.drop(['state', 'county'], axis=1)
similarity_attributes.head(3)

Unnamed: 0,num_returns,num_individuals,adjusted_gross_income,current_house_price_2024,price_index,temperature,fair_poor,physically_unhealthy_days,mentally_unhealthy_days,smokers,obese,physically_inactive,excessive_drinking,some_college,population_1,social_associations,severe_housing_problems,id
0,1157.678261,2613.026087,73080.478261,239900.8779,119.895294,66.8,19.0,4.36,4.48,18.6,34.8,31.4,15.6,8765.4,14370.6,71.6,14.8,AL_1
1,2318.37799,4804.588517,165935.966507,387625.9624,136.772357,70.7,17.2,3.94,4.3,17.6,29.6,24.6,17.2,31604.2,48264.6,225.0,14.0,AL_3
2,488.840426,1026.468085,21870.553191,143559.7134,127.522538,67.2,27.4,5.2,4.96,23.0,43.0,28.6,12.6,2579.8,6918.6,20.6,15.0,AL_5


In [575]:
similarity_attributes.shape

(3054, 18)

In [576]:
similarity_attributes.to_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/network_graph/inputs_data/similarity_attributes.csv', index=False)