In [39]:
import numpy as np
import geopandas as gpd
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Load in Data

In [2]:
# Read the housing CSV from the relative data/ directory into a DataFrame.
# NOTE(review): going by the file name this is presumably 2011-2020 median
# housing change per MSA — confirm against the data source.
housing = pd.read_csv('data/msa_med_housing_2011-2020.csv')

In [8]:
# Summarise column names, dtypes and non-null counts to sanity-check the load.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907 entries, 0 to 906
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CBSAA        907 non-null    int64 
 1   NAME_E.y     907 non-null    object
 2   housing_chg  907 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 21.4+ KB


In [3]:
# Preview the first rows (head() defaults to five).
housing.head()

Unnamed: 0,CBSAA,NAME_E.y,housing_chg
0,10100,"Aberdeen, SD Micro Area",26700
1,10140,"Aberdeen, WA Micro Area",34300
2,10180,"Abilene, TX Metro Area",30900
3,10220,"Ada, OK Micro Area",26900
4,10300,"Adrian, MI Micro Area",28100


In [31]:
# Read the migration-flow CSV from the relative data/ directory into a DataFrame.
# NOTE(review): the file name suggests 2020 flows for the Seattle metro —
# confirm the source and vintage.
flows = pd.read_csv('data/sea_mig_2020.csv')

In [32]:
# Summarise column names, dtypes and non-null counts; use this to spot
# incomplete columns (GEOID2, estimate and moe in the saved output).
flows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   GEOID1      354 non-null    int64  
 1   GEOID2      344 non-null    float64
 2   FULL1_NAME  354 non-null    object 
 3   FULL2_NAME  354 non-null    object 
 4   variable    354 non-null    object 
 5   estimate    345 non-null    float64
 6   moe         308 non-null    float64
 7   centroid1   354 non-null    object 
 8   centroid2   354 non-null    object 
 9   j           354 non-null    object 
 10  k           354 non-null    object 
dtypes: float64(3), int64(1), object(7)
memory usage: 30.6+ KB


In [33]:
# Keep only rows with a non-missing 'estimate'. Missing values in other
# columns (e.g. GEOID2, moe) are deliberately left in place.
flows = flows.dropna(subset=['estimate'])

In [34]:
# Preview the filtered rows. The filter keeps the original index labels,
# so they are no longer consecutive.
flows.head()

Unnamed: 0,GEOID1,GEOID2,FULL1_NAME,FULL2_NAME,variable,estimate,moe,centroid1,centroid2,j,k
0,42660,,"Seattle-Tacoma-Bellevue, WA Metro Area",Outside Metro Area within U.S. or Puerto Rico,MOVEDOUT,25243.0,1528.0,c(-121.881387433741,47.5574033466574),c(NA,NA)
10,42660,10180.0,"Seattle-Tacoma-Bellevue, WA Metro Area","Abilene, TX Metro Area",MOVEDOUT,48.0,34.0,c(-121.881387433741,c(-99.717678358008,c(-99.717678358008,32.4496900313019)
11,42660,10380.0,"Seattle-Tacoma-Bellevue, WA Metro Area","Aguadilla-Isabela, PR Metro Area",MOVEDOUT,0.0,,c(-121.881387433741,c(-67.0703611537066,c(-67.0703611537066,18.3797715984677)
12,42660,10420.0,"Seattle-Tacoma-Bellevue, WA Metro Area","Akron, OH Metro Area",MOVEDOUT,49.0,45.0,c(-121.881387433741,c(-81.3495122130132,c(-81.3495122130132,41.1487250686295)
13,42660,10500.0,"Seattle-Tacoma-Bellevue, WA Metro Area","Albany, GA Metro Area",MOVEDOUT,0.0,,c(-121.881387433741,c(-84.1738860941887,c(-84.1738860941887,31.5895065709885)


In [9]:
# Read the race-change CSV from the relative data/ directory into a DataFrame.
# NOTE(review): the file name suggests 2010-2020 change by race group —
# confirm the source and how the change columns were derived.
race = pd.read_csv('data/race_2010-2020.csv')

In [30]:
# Summarise column names, dtypes and non-null counts to sanity-check the load.
race.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 942 entries, 0 to 941
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   GISJOIN                942 non-null    object 
 1   CBSA                   942 non-null    object 
 2   chg_white_2010_2020    942 non-null    float64
 3   chg_black_2010_2020    942 non-null    float64
 4   chg_aiak_2010_2020     942 non-null    float64
 5   chg_asian_2010_2020    942 non-null    float64
 6   chg_nhpi_2010_2020     942 non-null    float64
 7   chg_other_2010_2020    942 non-null    float64
 8   chg_twoplus_2010_2020  942 non-null    float64
dtypes: float64(7), object(2)
memory usage: 66.4+ KB


In [21]:
# Preview the first rows (head() defaults to five).
# NOTE(review): the saved output for this cell lists ten rows, which this call
# would not produce — the output looks stale; re-run the cell.
race.head()

Unnamed: 0,GISJOIN,CBSA,chg_white_2010_2020,chg_black_2010_2020,chg_aiak_2010_2020,chg_asian_2010_2020,chg_nhpi_2010_2020,chg_other_2010_2020,chg_twoplus_2010_2020
0,G10020,"Abbeville, LA Micro Area",-2444.97,-355.0,-14.0,103.0,8.0,366.0,1697.0
1,G10100,"Aberdeen, SD Micro Area",-1140.0,379.0,114.0,585.0,3.0,556.0,1188.0
2,G10140,"Aberdeen, WA Micro Area",-2784.95,223.0,553.03,17.96,-48.0,849.97,4017.1
3,G10180,"Abilene, TX Metro Area",-7227.0,1744.0,378.0,882.0,71.0,720.0,14759.0
4,G10220,"Ada, OK Micro Area",-3415.96,-18.0,659.0,129.0,-4.0,349.0,2874.02
5,G10300,"Adrian, MI Micro Area",-5625.0,75.0,29.0,-22.0,-9.0,257.0,4826.0
6,G10420,"Akron, OH Metro Area",-44830.06,2956.0,16.0,11856.0,33.0,3644.0,25339.0
7,G10460,"Alamogordo, NM Micro Area",-8187.0,411.0,710.0,307.0,30.0,1019.0,9750.0
8,G10500,"Albany, GA Metro Area",-6800.0,-2698.0,-50.0,214.0,-93.0,727.0,3190.0
9,G10540,"Albany-Lebanon, OR Micro Area",2030.67,147.0,239.04,366.0,133.0,828.01,8194.08


# Basic Stats

In [35]:
# Pull the 'estimate' column out as a standalone NumPy array (an explicit copy,
# so the array does not share memory with the DataFrame).
# NOTE(review): rows whose GEOID2 is missing (e.g. the "Outside Metro Area ..."
# total) are still included — confirm they belong in metro-level statistics.
metro_flows = flows['estimate'].to_numpy(copy=True)

In [37]:
# Summary statistics (count, min/max, mean, variance, skewness, kurtosis) for
# the flow estimates. Reuses the metro_flows array built in the previous cell
# instead of converting the same column a second time.
stats.describe(metro_flows)

DescribeResult(nobs=345, minmax=(0.0, 25243.0), mean=456.71014492753625, variance=2753845.8343444555, skewness=10.665841656628476, kurtosis=145.59969358746895)