### Initialise

In [1]:
import os
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import sklearn
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

### Load data

In [16]:
# Relative directory that the census CSV is loaded from.
data_path = os.path.join('datasets', 'dataopen')

def load_data(filename, path):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, including its extension.
    path : str
        Directory that contains ``filename``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``path/filename``.
    """
    return pd.read_csv(os.path.join(path, filename))

# Load the 2010 census extract into a DataFrame.
census_2010 = load_data("Census2010.csv", data_path)

### Explore data

In [17]:
# Preview the first five rows of the loaded frame.
census_2010.head(n=5)

Unnamed: 0.1,Unnamed: 0,geoid,year,NAME,B01001_001E,B19013_001E,B25077_001E,B03002_003E,B03002_004E,B02001_004E,B03002_006E,B03002_007E,B03002_008E,B03002_009E,B03002_012E,state,county,tract
0,0,34003001000,2010,"Census Tract 10, Bergen County, New Jersey",6489.0,131563.0,741400.0,5523.0,73.0,0.0,676.0,0.0,50.0,41.0,126.0,34,3,1000
1,1,34003002100,2010,"Census Tract 21, Bergen County, New Jersey",1926.0,172054.0,1000001.0,1217.0,31.0,7.0,509.0,0.0,0.0,9.0,153.0,34,3,2100
2,2,34003002200,2010,"Census Tract 22, Bergen County, New Jersey",5104.0,87270.0,530300.0,3518.0,13.0,0.0,1136.0,0.0,18.0,40.0,379.0,34,3,2200
3,3,34003002300,2010,"Census Tract 23, Bergen County, New Jersey",5682.0,102132.0,720800.0,3690.0,95.0,0.0,1782.0,0.0,0.0,35.0,80.0,34,3,2300
4,4,34003003100,2010,"Census Tract 31, Bergen County, New Jersey",5095.0,91106.0,367300.0,2152.0,346.0,0.0,1498.0,0.0,0.0,45.0,1054.0,34,3,3100


In [19]:
# Summarise column dtypes and non-null counts to check for missing values.
census_2010.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4701 entries, 0 to 4700
Data columns (total 18 columns):
index                     4701 non-null int64
geoid                     4701 non-null int64
year                      4701 non-null int64
name                      4701 non-null object
population                4701 non-null float64
household_income          4701 non-null float64
home_value                4701 non-null float64
no_nonh_caucasians        4701 non-null float64
no_nonh_blacks/aas        4701 non-null float64
no_amerinds_alskns        4701 non-null float64
no_nonh_asians            4701 non-null float64
no_nonh_hawaii_pacific    4701 non-null float64
no_nonh_others            4701 non-null float64
no_nonh_multi             4701 non-null float64
no_hisp_latin             4701 non-null float64
state                     4701 non-null int64
county                    4701 non-null int64
tract                     4701 non-null int64
dtypes: float64(11), int64(6), object(1)
m

No null values were found. However, exploring the data in Excel revealed negative values in some columns, so the rows containing them must be removed.

### Change heading names for clarity

In [21]:
# Readable replacements for the raw column headers. The names are applied
# positionally, so their order must match the column order of the loaded frame.
readable_columns = [
    'index', 'geoid', 'year', 'name',
    'population', 'household_income', 'home_value',
    'no_nonh_caucasians', 'no_nonh_blacks/aas', 'no_amerinds_alskns',
    'no_nonh_asians', 'no_nonh_hawaii_pacific', 'no_nonh_others',
    'no_nonh_multi', 'no_hisp_latin',
    'state', 'county', 'tract',
]
census_2010.columns = readable_columns
census_2010.head()

Unnamed: 0,index,geoid,year,name,population,household_income,home_value,no_nonh_caucasians,no_nonh_blacks/aas,no_amerinds_alskns,no_nonh_asians,no_nonh_hawaii_pacific,no_nonh_others,no_nonh_multi,no_hisp_latin,state,county,tract
0,0,34003001000,2010,"Census Tract 10, Bergen County, New Jersey",6489.0,131563.0,741400.0,5523.0,73.0,0.0,676.0,0.0,50.0,41.0,126.0,34,3,1000
1,1,34003002100,2010,"Census Tract 21, Bergen County, New Jersey",1926.0,172054.0,1000001.0,1217.0,31.0,7.0,509.0,0.0,0.0,9.0,153.0,34,3,2100
2,2,34003002200,2010,"Census Tract 22, Bergen County, New Jersey",5104.0,87270.0,530300.0,3518.0,13.0,0.0,1136.0,0.0,18.0,40.0,379.0,34,3,2200
3,3,34003002300,2010,"Census Tract 23, Bergen County, New Jersey",5682.0,102132.0,720800.0,3690.0,95.0,0.0,1782.0,0.0,0.0,35.0,80.0,34,3,2300
4,4,34003003100,2010,"Census Tract 31, Bergen County, New Jersey",5095.0,91106.0,367300.0,2152.0,346.0,0.0,1498.0,0.0,0.0,45.0,1054.0,34,3,3100


### Remove corrupted entries

In [31]:
# Keep only tracts whose population, median household income and median home
# value are all strictly positive; a row with a zero or negative entry in any
# of these three columns is dropped.
valid_rows = (
    (census_2010['population'] > 0)
    & (census_2010['household_income'] > 0)
    & (census_2010['home_value'] > 0)
)
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise pandas' SettingWithCopyWarning.
census_2010 = census_2010[valid_rows].copy()
census_2010.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4498 entries, 0 to 4700
Data columns (total 18 columns):
index                     4498 non-null int64
geoid                     4498 non-null int64
year                      4498 non-null int64
name                      4498 non-null object
population                4498 non-null float64
household_income          4498 non-null float64
home_value                4498 non-null float64
no_nonh_caucasians        4498 non-null float64
no_nonh_blacks/aas        4498 non-null float64
no_amerinds_alskns        4498 non-null float64
no_nonh_asians            4498 non-null float64
no_nonh_hawaii_pacific    4498 non-null float64
no_nonh_others            4498 non-null float64
no_nonh_multi             4498 non-null float64
no_hisp_latin             4498 non-null float64
state                     4498 non-null int64
county                    4498 non-null int64
tract                     4498 non-null int64
dtypes: float64(11), int64(6), object(1)
m

### Gentrification Eligibility

Implementation of Governing magazine's test for whether a tract is eligible to gentrify:
- Tract had a population of at least 500 residents and was located within a central city,
- Tract's median household income was in the bottom 40th percentile of all tracts within the metropolitan statistical area,
- Tract's median home value was in the bottom 40th percentile of all tracts within the metropolitan statistical area.

Create new column 'gentrify_elig', initialised with zeros.

In [33]:
# Add the eligibility flag, defaulting every tract to 0 (not eligible).
# assign() returns a new frame instead of writing a column into the frame
# produced by boolean filtering, which avoids SettingWithCopyWarning.
census_2010 = census_2010.assign(gentrify_elig=0)
census_2010.head()

Unnamed: 0,index,geoid,year,name,population,household_income,home_value,no_nonh_caucasians,no_nonh_blacks/aas,no_amerinds_alskns,no_nonh_asians,no_nonh_hawaii_pacific,no_nonh_others,no_nonh_multi,no_hisp_latin,state,county,tract,gentrify_elig
0,0,34003001000,2010,"Census Tract 10, Bergen County, New Jersey",6489.0,131563.0,741400.0,5523.0,73.0,0.0,676.0,0.0,50.0,41.0,126.0,34,3,1000,0
1,1,34003002100,2010,"Census Tract 21, Bergen County, New Jersey",1926.0,172054.0,1000001.0,1217.0,31.0,7.0,509.0,0.0,0.0,9.0,153.0,34,3,2100,0
2,2,34003002200,2010,"Census Tract 22, Bergen County, New Jersey",5104.0,87270.0,530300.0,3518.0,13.0,0.0,1136.0,0.0,18.0,40.0,379.0,34,3,2200,0
3,3,34003002300,2010,"Census Tract 23, Bergen County, New Jersey",5682.0,102132.0,720800.0,3690.0,95.0,0.0,1782.0,0.0,0.0,35.0,80.0,34,3,2300,0
4,4,34003003100,2010,"Census Tract 31, Bergen County, New Jersey",5095.0,91106.0,367300.0,2152.0,346.0,0.0,1498.0,0.0,0.0,45.0,1054.0,34,3,3100,0


Value of 40th percentile for median household income

In [37]:
# 40th-percentile value of household income across the tracts in the frame;
# used as the upper income bound in the eligibility test.
household_income = census_2010['household_income']
income_40_percentile = household_income.quantile(q=0.4)
income_40_percentile

57082.4

Value of 40th percentile for median home value

In [38]:
# 40th-percentile value of home value across the tracts in the frame;
# used as the upper home-value bound in the eligibility test.
home_value = census_2010['home_value']
value_40_percentile = home_value.quantile(q=0.4)
value_40_percentile

408180.00000000006

Change value of 'gentrify_elig' from 0 to 1 if the entry passes the test.

In [41]:
# Build one boolean mask per criterion, then flag the tracts meeting all three.
# NOTE(review): the "located within a central city" condition listed in the
# markdown above is not checked here — confirm whether that is intentional.
low_income = census_2010['household_income'] <= income_40_percentile
low_home_value = census_2010['home_value'] <= value_40_percentile
large_enough = census_2010['population'] >= 500
census_2010.loc[low_income & low_home_value & large_enough, 'gentrify_elig'] = 1

In [44]:
# Show the first ten tracts to spot-check the eligibility flag.
census_2010.head(n=10)

Unnamed: 0,index,geoid,year,name,population,household_income,home_value,no_nonh_caucasians,no_nonh_blacks/aas,no_amerinds_alskns,no_nonh_asians,no_nonh_hawaii_pacific,no_nonh_others,no_nonh_multi,no_hisp_latin,state,county,tract,gentrify_elig
0,0,34003001000,2010,"Census Tract 10, Bergen County, New Jersey",6489.0,131563.0,741400.0,5523.0,73.0,0.0,676.0,0.0,50.0,41.0,126.0,34,3,1000,0
1,1,34003002100,2010,"Census Tract 21, Bergen County, New Jersey",1926.0,172054.0,1000001.0,1217.0,31.0,7.0,509.0,0.0,0.0,9.0,153.0,34,3,2100,0
2,2,34003002200,2010,"Census Tract 22, Bergen County, New Jersey",5104.0,87270.0,530300.0,3518.0,13.0,0.0,1136.0,0.0,18.0,40.0,379.0,34,3,2200,0
3,3,34003002300,2010,"Census Tract 23, Bergen County, New Jersey",5682.0,102132.0,720800.0,3690.0,95.0,0.0,1782.0,0.0,0.0,35.0,80.0,34,3,2300,0
4,4,34003003100,2010,"Census Tract 31, Bergen County, New Jersey",5095.0,91106.0,367300.0,2152.0,346.0,0.0,1498.0,0.0,0.0,45.0,1054.0,34,3,3100,0
5,5,34003003200,2010,"Census Tract 32, Bergen County, New Jersey",3966.0,57621.0,352600.0,1570.0,132.0,0.0,1121.0,0.0,75.0,36.0,1032.0,34,3,3200,0
6,6,34003003300,2010,"Census Tract 33, Bergen County, New Jersey",6664.0,95948.0,384000.0,3131.0,272.0,0.0,1717.0,0.0,0.0,188.0,1356.0,34,3,3300,0
7,7,34003003401,2010,"Census Tract 34.01, Bergen County, New Jersey",3049.0,102841.0,397000.0,1779.0,59.0,0.0,544.0,0.0,0.0,97.0,570.0,34,3,3401,0
8,8,34003003402,2010,"Census Tract 34.02, Bergen County, New Jersey",3713.0,102135.0,404300.0,2413.0,163.0,0.0,791.0,0.0,8.0,62.0,276.0,34,3,3402,0
9,9,34003003500,2010,"Census Tract 35, Bergen County, New Jersey",4030.0,53622.0,399300.0,1381.0,411.0,0.0,934.0,0.0,0.0,114.0,1190.0,34,3,3500,1
