# Graph network analysis and visualization code for CSE6242 group project

In [1]:
import pandas as pd

## Part 1: Migration data

``` markdown
- rename the key attributes to state either inflow or outflow
- Group by county name and sum attributes (returs, individuals and gross income)
- Merge both inflow and outflow as a single dataset: merged_migration

In [2]:
inflow = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/Network_graph_analysis/cleaned_countyinflow.csv')
outflow = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/Network_graph_analysis/cleaned_countyoutflow.csv')

In [3]:
inflow.head(3)

Unnamed: 0,year,destination_state_fips,destination_county_fips,origin_state_fips,origin_county_fips,y2_fips,y1_fips,y1_state,y1_countyname,num_returns,num_individuals,adjusted_gross_income
0,16-17,1,0,96,0,1000,96000,AL,Total Migration-US and Foreign,130855,273891,7058182
1,16-17,1,0,97,0,1000,97000,AL,Total Migration-US,130235,272345,7022706
2,16-17,1,0,97,1,1000,97001,AL,Total Migration-Same State,71530,148378,3510398


In [4]:
outflow.head(3)

Unnamed: 0,year,origin_state_fips,origin_county_fips,destination_state_fips,destination_county_fips,y1fips,y2fips,y2_state,y2_countyname,num_returns,num_individuals,adjusted_gross_income
0,16-17,1,1,1,1,1001,1001,AL,Autauga County Non-migrants,17484,39711,1106647
1,16-17,1,21,1,1,1021,1001,AL,Autauga County,83,220,3006
2,16-17,1,47,1,1,1047,1001,AL,Autauga County,82,195,2917


In [5]:
len(inflow)

447970

``` markdown
From both datasets, the columns carrying information are returns, individuas and gross income.
Renaming those attributes is required to ensure they are distict after merging.

In [6]:
# rename the key attributes to state either inflow or outflow
inflow_attributes_dict = {'num_returns': 'in_return', 'num_individuals': 'in_individuals', 'adjusted_gross_income': 'in_gross_income'}
outflow_attributes_dict = {'num_returns': 'out_return', 'num_individuals': 'out_individuals', 'adjusted_gross_income': 'out_gross_income'}

In [7]:
inflow.rename(columns=inflow_attributes_dict, inplace=True)
outflow.rename(columns=outflow_attributes_dict, inplace=True)

``` markdown
Grouping each dataset to have a single line for each county. The aggregation method is sum across the ckey attributes idenfified earlier.

In [8]:
grouped_inflow = inflow.groupby(by= ['y1_state','origin_county_fips'])[['in_return', 'in_individuals', 'in_gross_income']].sum().reset_index()
grouped_inflow.head(3)

Unnamed: 0,y1_state,origin_county_fips,in_return,in_individuals,in_gross_income
0,AK,0,306337,598585,17297723
1,AK,1,48135,91368,2847809
2,AK,3,102323,201584,5664784


In [9]:
grouped_outflow= outflow.groupby(by= ['y2_state','destination_county_fips'])[['out_return', 'out_individuals', 'out_gross_income']].sum().reset_index()
grouped_outflow.head(3)

Unnamed: 0,y2_state,destination_county_fips,out_return,out_individuals,out_gross_income
0,AK,0,364248,733000,22577044
1,AK,1,48217,91547,2853137
2,AK,3,131234,269101,8294827


In [10]:
print(f"lenght of grouped inflow: ",len(grouped_inflow))
print(f"lenght of grouped outflow: ",len(grouped_outflow))

lenght of grouped inflow:  3209
lenght of grouped outflow:  3209


In [11]:
merged_migration = grouped_inflow.merge(grouped_outflow, left_on=['y1_state', 'origin_county_fips'],  right_on=['y2_state','destination_county_fips'], how='inner')
print(len(merged_migration))
merged_migration.isna().sum()

3209


y1_state                   0
origin_county_fips         0
in_return                  0
in_individuals             0
in_gross_income            0
y2_state                   0
destination_county_fips    0
out_return                 0
out_individuals            0
out_gross_income           0
dtype: int64

In [12]:
cols_to_keep = ['y2_state', 'origin_county_fips', 'in_return', 'in_individuals', 'in_gross_income', 'out_return', 'out_individuals', 'out_gross_income']
col_to_rename = {'y2_state': 'State', 'origin_county_fips': 'FIPS'}

In [13]:
merged_migration = merged_migration[cols_to_keep]
merged_migration.rename(columns=col_to_rename, inplace=True)

## Part 2: House pricing

``` markdown
- Clean up the county idenfiers
- Calcule the net change by county between  2000 and 2024 (net_housing_price_change)
- Extract the current house price as 2024 (2024_house_price)
- Merge with migration data

In [14]:
house_pricing = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/Network_graph_analysis/Zillow_Data_Yearly_FIPS.csv')

In [15]:
house_pricing.head(3)   

Unnamed: 0,FIPS,RegionName,State,FIPS.1,2000,2001,2002,2003,2004,2005,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,2020,Anchorage Borough,AK,2020,146144.3161,169973.1478,186475.8294,195969.8382,209724.6331,229868.8594,...,304919.1673,311721.6694,310289.3725,312758.8579,330962.7465,338964.0436,344423.7324,365491.1856,377498.0278,384018.9248
1,2090,Fairbanks North Star Borough,AK,2090,,,,,,188322.2179,...,225121.4489,233876.6726,243537.0955,252751.346,265862.9022,277602.7961,285834.8267,300548.1123,296752.7637,301036.6606
2,2100,Haines Borough,AK,2100,,,,,,,...,,,,,,,284891.8249,315908.794,298008.7765,294193.3742


In [16]:
house_pricing.isna().sum()

FIPS             0
RegionName       0
State            0
FIPS.1           0
2000          2020
2001          1979
2002          1938
2003          1880
2004          1837
2005          1788
2006          1758
2007          1704
2008          1634
2009          1059
2010           985
2011           921
2012           647
2013           597
2014           560
2015           529
2016           101
2017            94
2018            86
2019            68
2020            63
2021            51
2022            12
2023             2
2024             0
dtype: int64

In [17]:
cols_to_fillna = ['2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020','2021','2022','2023','2024'] 
n = int(len(cols_to_fillna)/2) # first half of the columns are years_min and the second half are years_max
cols_years_min = cols_to_fillna[:n]
cols_years_max = cols_to_fillna[n:]


In [18]:
house_pricing[cols_to_fillna] = house_pricing[cols_to_fillna].fillna(house_pricing[cols_to_fillna].mean())  
house_pricing.isna().sum()

FIPS          0
RegionName    0
State         0
FIPS.1        0
2000          0
2001          0
2002          0
2003          0
2004          0
2005          0
2006          0
2007          0
2008          0
2009          0
2010          0
2011          0
2012          0
2013          0
2014          0
2015          0
2016          0
2017          0
2018          0
2019          0
2020          0
2021          0
2022          0
2023          0
2024          0
dtype: int64

In [19]:
house_pricing['house_index'] = house_pricing[cols_years_max].mean(axis=1)  - house_pricing[cols_years_min].mean(axis=1) 
house_pricing['housing_avg_value'] = house_pricing['2024']

In [20]:
# house_pricing_al = house_pricing[house_pricing['State'] == 'AL']
# house_pricing_al['FIPS'].unique()

In [21]:
# defining a function to format the house_pricing data to be the same as the merged migration 
def county_code_reformat(value):
        # ensure the value is a string
        num_str = str(value)
        num_str = num_str[-3:]
            # check if the new first character is zero
        if num_str[0] == '0':
            # remove the zero
            num_str = num_str[1:]
            # repeat
            if num_str[0] == '0':
                num_str = num_str[1:]
                return int(num_str)
            else:
                 return int(num_str)
        else:
             return int(num_str)

In [22]:
house_pricing['short_county_code'] = house_pricing['FIPS'].apply(county_code_reformat) 
null_values = house_pricing['short_county_code'].isnull().sum()
print(f"Number of null values in short_county_code column: {null_values}")

Number of null values in short_county_code column: 0


In [23]:
# cast the typpe for the short_county_code as integer
house_pricing['short_county_code'] = house_pricing['short_county_code'].astype(int)
house_pricing.isna().sum()


FIPS                 0
RegionName           0
State                0
FIPS.1               0
2000                 0
2001                 0
2002                 0
2003                 0
2004                 0
2005                 0
2006                 0
2007                 0
2008                 0
2009                 0
2010                 0
2011                 0
2012                 0
2013                 0
2014                 0
2015                 0
2016                 0
2017                 0
2018                 0
2019                 0
2020                 0
2021                 0
2022                 0
2023                 0
2024                 0
house_index          0
housing_avg_value    0
short_county_code    0
dtype: int64

In [24]:
house_pricing_cleaned = house_pricing[['State','short_county_code', 'house_index', 'housing_avg_value']]
full_dataset = merged_migration.merge(house_pricing_cleaned, left_on=['State', 'FIPS'], right_on=['State', 'short_county_code'], how='inner')
print(len(full_dataset))
full_dataset.isna().sum()

3074


State                0
FIPS                 0
in_return            0
in_individuals       0
in_gross_income      0
out_return           0
out_individuals      0
out_gross_income     0
short_county_code    0
house_index          0
housing_avg_value    0
dtype: int64

## Health datda

``` markdown
- clean up the FIPS
- Check for null values
-  grroup by state and FIPS
- Create a function that maps the State to the right abbreviation for merge
- get rid of usefell columns
- merge



In [25]:
health = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/Network_graph_analysis/Health_Data.csv')

  health = pd.read_csv('/Users/judithyemeli/Documents/CSE_6242/Project/MVR/Network_graph_analysis/Health_Data.csv')


In [26]:
health.head(3)

Unnamed: 0,Year,FIPS,State,County,Premature Deaths,Premature Death Years of Potential Life Lost Rate,% Fair/Poor,Physically Unhealthy Days,Mentally Unhealthy Days,Unreliable Ind LowBirth,...,% Native Hawaiian/Other Pacific Islander,# Hispanic,% Hispanic,# Non-Hispanic White,% Non-Hispanic White,# Not Proficient in English,% Not Proficient in English,% Female,# Rural,% Rural
0,2017,1001,Alabama,Autauga,830.0,9158.0,18,4.2,4.2,,...,2.8,41459.0,74.9,338.0,0.7,51.5,22921.0,42.0,22921.0,42.0
1,2017,1003,Alabama,Baldwin,2573.0,7394.0,16,3.7,4.0,,...,4.5,168996.0,83.0,2159.0,1.2,51.3,77060.0,42.3,77060.0,42.3
2,2017,1005,Alabama,Barbour,413.0,8573.0,25,4.8,4.8,,...,4.4,12340.0,46.6,447.0,1.8,46.6,18613.0,67.8,18613.0,67.8


In [27]:
health['short_county_code'] = health['FIPS'].apply(county_code_reformat)    

In [28]:
state_to_abbreviation = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR", "California": "CA", 
    "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "Florida": "FL", "Georgia": "GA",
    "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA",
    "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT",
    "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM",
    "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT",
    "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY"
}

In [29]:
health['State_short'] = health['State'].map(state_to_abbreviation)
health['State_short'].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', nan, 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'], dtype=object)

In [30]:
health.drop(columns=['State', 'County', 'FIPS'], inplace=True)
health.rename(columns={'State_short': 'State', 'short_county_code':'FIPS'}, inplace=True)

In [31]:
threshold = len(health) * 0.1  # 10% threshold
health.dropna(thresh=threshold, axis=1, inplace=True)

# fill the remaining missing values with the mean of the column
attribute_cols = []
for col in health.columns:
    if col not in ['State', 'FIPS', 'Year'] and health[col].dtype != 'object':
        attribute_cols.append(col)

In [32]:
health_grouped = health.groupby(by=['State', 'FIPS'])[attribute_cols].mean().reset_index()
len(health_grouped)

3142

In [33]:
health_grouped.head(3)

Unnamed: 0,State,FIPS,Premature Deaths,Premature Death Years of Potential Life Lost Rate,% Fair/Poor,Physically Unhealthy Days,Mentally Unhealthy Days,Low birthweight rate,% Smokers,% Obese,...,% Native Hawaiian/Other Pacific Islander,# Hispanic,% Hispanic,# Non-Hispanic White,% Non-Hispanic White,# Not Proficient in English,% Not Proficient in English,% Female,# Rural,% Rural
0,AK,13,,,17.6,3.62,2.94,,18.2,42.2,...,4.425,455.5,13.45,345.75,10.4,299.2,794.175,49.175,21640.333333,82.8
1,AK,16,,,15.6,3.22,2.78,7.0,16.2,41.2,...,5.45,958.5,16.8,1096.25,19.275,320.675,1395.9,50.225,34955.666667,50.333333
2,AK,20,3113.666667,7558.6,13.6,3.48,3.32,6.0,15.4,29.2,...,4.225,64306.75,21.625,129787.0,44.05,5005.65,3005.575,37.8,37109.0,50.766667


In [34]:
health_grouped.isna().sum()

State                                                  0
FIPS                                                   0
Premature Deaths                                     171
Premature Death Years of Potential Life Lost Rate    162
% Fair/Poor                                            0
                                                    ... 
# Not Proficient in English                            0
% Not Proficient in English                            0
% Female                                               0
# Rural                                                6
% Rural                                                6
Length: 128, dtype: int64

In [35]:
full_dataset = full_dataset.merge(health_grouped, left_on=['State', 'FIPS'], right_on=['State', 'FIPS'], how='inner')
len(full_dataset)

3073

In [37]:
full_dataset.shape

(3073, 137)

In [39]:
full_dataset.to_csv('full_dataset.csv', index=False)