In [16]:
import os
import joblib
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import sklearn
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [17]:
# Directory holding the census CSV files, given relative to the current working directory.
data_path = os.path.join('datasets', 'nyc education + jobs')

def load_data(filename, path):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, including its extension.
    path : str
        Directory that contains ``filename``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``pd.read_csv`` defaults.
    """
    full_path = os.path.join(path, filename)
    frame = pd.read_csv(full_path)
    return frame

# Load the 2010 and 2018 census files from `data_path`, one DataFrame per year.
census_2010, census_2018 = (
    load_data(csv_name, data_path)
    for csv_name in ("nyc_census_2010.csv", "nyc_census_2018.csv")
)

In [18]:
# Remove the same six columns from each year's frame. The positional renaming done by
# column_names() afterwards depends on the order of the columns that remain.
census_2010, census_2018 = (
    frame.drop(columns=['year', 'NAME', 'state', 'county', 'tract', 'B08528_010E'])
    for frame in (census_2010, census_2018)
)

Inflation adjustment: 2018 dollar amounts are reduced by 13.5% to express them in 2010 dollars, i.e.

2018 price * 0.865 = 2010 price

In [19]:
def column_names(dataframe, year):
    """Rename the columns of ``dataframe`` in place, by position.

    The first column becomes ``'geoid'``; each of the remaining 22 columns becomes
    ``'<year>_<field>'``, with the field names taken in the fixed order listed below.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to relabel. It is mutated; nothing is returned. Names are assigned
        purely by position, so the frame must have exactly 23 columns in the
        expected order (pandas raises ``ValueError`` on a length mismatch).
    year : str
        Prefix placed in front of every field name, e.g. ``'2010'``.
    """
    field_names = (
        'population', 'household_income', 'home_value',
        'no_nonh_caucasians', 'no_nonh_blacks/aas', 'no_amerinds_alskns', 'no_nonh_asians',
        'no_nonh_hawaii_pacific', 'no_nonh_others', 'no_nonh_multi', 'no_hisp_latin',
        'construction', 'retail', 'information', 'finance', 'edu_health_social',
        'arts_recreation_accom', 'public_admin', 'armed_forces', 'bachelors', 'male<25_bachelors',
        'female<25_bachelors',
    )
    dataframe.columns = ['geoid'] + [year + '_' + field for field in field_names]

# Relabel both frames in place so every field carries its survey year as a prefix.
# 'geoid' stays unprefixed in both frames and is the column they are later merged on.
column_names(census_2010, '2010')
column_names(census_2018, '2018')

In [20]:
census_2010.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4701 entries, 0 to 4700
Data columns (total 23 columns):
geoid                          4701 non-null int64
2010_population                4701 non-null float64
2010_household_income          4701 non-null float64
2010_home_value                4701 non-null float64
2010_no_nonh_caucasians        4701 non-null float64
2010_no_nonh_blacks/aas        4701 non-null float64
2010_no_amerinds_alskns        4701 non-null float64
2010_no_nonh_asians            4701 non-null float64
2010_no_nonh_hawaii_pacific    4701 non-null float64
2010_no_nonh_others            4701 non-null float64
2010_no_nonh_multi             4701 non-null float64
2010_no_hisp_latin             4701 non-null float64
2010_construction              4701 non-null float64
2010_retail                    4701 non-null float64
2010_information               4701 non-null float64
2010_finance                   4701 non-null float64
2010_edu_health_social         4701 non-null fl

In [21]:
census_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4700 entries, 0 to 4699
Data columns (total 23 columns):
geoid                          4700 non-null int64
2018_population                4700 non-null float64
2018_household_income          4700 non-null float64
2018_home_value                4700 non-null float64
2018_no_nonh_caucasians        4700 non-null float64
2018_no_nonh_blacks/aas        4700 non-null float64
2018_no_amerinds_alskns        4700 non-null float64
2018_no_nonh_asians            4700 non-null float64
2018_no_nonh_hawaii_pacific    4700 non-null float64
2018_no_nonh_others            4700 non-null float64
2018_no_nonh_multi             4700 non-null float64
2018_no_hisp_latin             4700 non-null float64
2018_construction              4700 non-null float64
2018_retail                    4700 non-null float64
2018_information               4700 non-null float64
2018_finance                   4700 non-null float64
2018_edu_health_social         4700 non-null fl

In [22]:
# Combine the two survey years into one row per 'geoid'.
# how='inner' (the pandas default, spelled out here) keeps only geoids present in both
# years. validate='one_to_one' makes pandas raise MergeError if a geoid is duplicated on
# either side, instead of silently multiplying rows in the merged frame.
census_data = pd.merge(
    census_2010,
    census_2018,
    on='geoid',
    how='inner',
    validate='one_to_one',
)

In [23]:
census_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4700 entries, 0 to 4699
Data columns (total 45 columns):
geoid                          4700 non-null int64
2010_population                4700 non-null float64
2010_household_income          4700 non-null float64
2010_home_value                4700 non-null float64
2010_no_nonh_caucasians        4700 non-null float64
2010_no_nonh_blacks/aas        4700 non-null float64
2010_no_amerinds_alskns        4700 non-null float64
2010_no_nonh_asians            4700 non-null float64
2010_no_nonh_hawaii_pacific    4700 non-null float64
2010_no_nonh_others            4700 non-null float64
2010_no_nonh_multi             4700 non-null float64
2010_no_hisp_latin             4700 non-null float64
2010_construction              4700 non-null float64
2010_retail                    4700 non-null float64
2010_information               4700 non-null float64
2010_finance                   4700 non-null float64
2010_edu_health_social         4700 non-null fl

In [24]:
census_data['2018_home_value']

0        710600.0
1       2000001.0
2        466600.0
3        631000.0
4        332800.0
          ...    
4695     161100.0
4696     158100.0
4697     125600.0
4698     141600.0
4699     161200.0
Name: 2018_home_value, Length: 4700, dtype: float64

In [25]:
# Express 2018 home values in 2010 dollars (2018 price * 0.865 = 2010 price).
#
# The adjusted column is rebuilt from the untouched `census_2018` frame (matched on
# 'geoid') instead of scaling `census_data['2018_home_value']` in place, so re-running
# this cell cannot apply the factor a second time.
#
# NOTE(review): the raw column contains values such as 2000001.0, which look like a
# top-code sentinel rather than a real price - confirm whether such rows should be scaled.
HOME_VALUE_2018_TO_2010 = 0.865
census_data['2018_home_value'] = (
    census_data['geoid'].map(census_2018.set_index('geoid')['2018_home_value'])
    * HOME_VALUE_2018_TO_2010
)

In [26]:
census_data['2018_home_value']

0        614669.000
1       1730000.865
2        403609.000
3        545815.000
4        287872.000
           ...     
4695     139351.500
4696     136756.500
4697     108644.000
4698     122484.000
4699     139438.000
Name: 2018_home_value, Length: 4700, dtype: float64

In [27]:
# Derive each year's '>25bachelors' count: the year's 'bachelors' total minus the sum of
# its 'male<25_bachelors' and 'female<25_bachelors' columns.
census_data['2018_>25bachelors'] = census_data['2018_bachelors'].sub(
    census_data['2018_male<25_bachelors'].add(census_data['2018_female<25_bachelors'])
)
census_data['2010_>25bachelors'] = census_data['2010_bachelors'].sub(
    census_data['2010_male<25_bachelors'].add(census_data['2010_female<25_bachelors'])
)

In [28]:
# Keep only rows that are usable in both survey years: strictly positive population,
# household income and home value, and a non-negative derived '>25bachelors' count.
# Rows with NaN in any of these columns are dropped too, because every comparison
# against NaN evaluates to False.
#
# All conditions are combined into one mask so the frame is filtered once rather than
# through eight intermediate frames. The explicit .copy() makes the result an independent
# frame, so the column assignments in later cells do not operate on a slice of the
# pre-filter data (which can trigger pandas' SettingWithCopyWarning).
valid_rows = (
    (census_data['2010_population'] > 0)
    & (census_data['2010_household_income'] > 0)
    & (census_data['2010_home_value'] > 0)
    & (census_data['2010_>25bachelors'] >= 0)
    & (census_data['2018_population'] > 0)
    & (census_data['2018_household_income'] > 0)
    & (census_data['2018_home_value'] > 0)
    & (census_data['2018_>25bachelors'] >= 0)
)
census_data = census_data[valid_rows].copy()

In [29]:
# Percent change in home value between 2010 and 2018, relative to the 2010 value.
# The difference is multiplied by 100 before dividing, matching the order of operations
# of 100 * (v2018 - v2010) / v2010.
census_data['percent_change_home_value'] = (
    census_data['2018_home_value']
    .sub(census_data['2010_home_value'])
    .mul(100)
    .div(census_data['2010_home_value'])
)

In [30]:
census_data['percent_change_home_value']

0      -17.093472
1       72.999914
2      -23.890439
3      -24.276498
4      -21.624830
          ...    
4695   -19.774611
4696   -36.273765
4697   -36.981439
4698   -32.329282
4699   -29.219289
Name: percent_change_home_value, Length: 4370, dtype: float64

In [31]:
# Per-year share of the derived '>25bachelors' count in the tract population.
# Despite the 'percent' in the column names, the stored values are plain fractions
# (count / population), not values on a 0-100 scale.
# NOTE(review): the denominator is the year's full 'population' column, not a
# 25-and-over population - confirm this is the intended share.
census_data['2018_percent_>25bachelors'] = census_data['2018_>25bachelors'].div(
    census_data['2018_population']
)
census_data['2010_percent_>25bachelors'] = census_data['2010_>25bachelors'].div(
    census_data['2010_population']
)

In [32]:
census_data['2018_percent_>25bachelors']

0       0.252292
1       0.313285
2       0.251109
3       0.224265
4       0.199845
          ...   
4695    0.070352
4696    0.086196
4697    0.095834
4698    0.108200
4699    0.096890
Name: 2018_percent_>25bachelors, Length: 4370, dtype: float64

In [33]:
# Change in the '>25bachelors' share from 2010 to 2018 (2018 value minus 2010 value),
# in the same fractional units as the two share columns.
census_data['change_percent_>25bachelors'] = census_data['2018_percent_>25bachelors'].sub(
    census_data['2010_percent_>25bachelors']
)

In [34]:
census_data['change_percent_>25bachelors']

0       0.030841
1       0.023565
2       0.135513
3      -0.018079
4       0.061081
          ...   
4695   -0.042749
4696    0.045309
4697    0.017939
4698    0.002024
4699    0.046221
Name: change_percent_>25bachelors, Length: 4370, dtype: float64

In [35]:
# 97th-percentile cutoffs, computed over every row of census_data, for the change in the
# '>25bachelors' share and for the percent change in home value.
education_97_percentile, value_97_percentile = (
    census_data[column].quantile(0.97)
    for column in ('change_percent_>25bachelors', 'percent_change_home_value')
)

In [36]:
education_97_percentile

0.09374129682024057

In [37]:
value_97_percentile

53.14415970464042

In [39]:
# 2010 baseline cutoffs: the 40th percentile of household income and of home value.
income_40_percentile = census_data['2010_household_income'].quantile(0.4)
value_40_percentile = census_data['2010_home_value'].quantile(0.4)

# 'gentrify_elig' is 1 when, in 2010, a row was at or below both cutoffs and had a
# population of at least 500; otherwise 0.
census_data['gentrify_elig'] = (
    census_data['2010_household_income'].le(income_40_percentile)
    & census_data['2010_home_value'].le(value_40_percentile)
    & census_data['2010_population'].ge(500)
).astype('int64')

In [41]:
# 'test_two' is 1 when both the change in the '>25bachelors' share and the percent change
# in home value are at or above their 97th-percentile cutoffs and the home-value change is
# strictly positive; otherwise 0.
census_data['test_two'] = (
    census_data['change_percent_>25bachelors'].ge(education_97_percentile)
    & census_data['percent_change_home_value'].ge(value_97_percentile)
    & census_data['percent_change_home_value'].gt(0)
).astype('int64')

In [42]:
# 'gentrified' is 1 only for rows that have both flags set ('gentrify_elig' and
# 'test_two'); every other row gets 0.
census_data['gentrified'] = (
    census_data['gentrify_elig'].eq(1) & census_data['test_two'].eq(1)
).astype('int64')

In [43]:
census_data['gentrified']

0       0
1       0
2       0
3       0
4       0
       ..
4695    0
4696    0
4697    0
4698    0
4699    0
Name: gentrified, Length: 4370, dtype: int64

In [44]:
census_data['gentrified'].value_counts()

0    4369
1       1
Name: gentrified, dtype: int64

In [45]:
census_data.loc[census_data['gentrified'] == 1]

Unnamed: 0,geoid,2010_population,2010_household_income,2010_home_value,2010_no_nonh_caucasians,2010_no_nonh_blacks/aas,2010_no_amerinds_alskns,2010_no_nonh_asians,2010_no_nonh_hawaii_pacific,2010_no_nonh_others,...,2018_female<25_bachelors,2018_>25bachelors,2010_>25bachelors,percent_change_home_value,2018_percent_>25bachelors,2010_percent_>25bachelors,change_percent_>25bachelors,test_two,gentrify_elig,gentrified
3760,36081024500,4941.0,32100.0,34100.0,1909.0,101.0,0.0,1099.0,0.0,0.0,...,0.0,834.0,313.0,585.151026,0.161066,0.063348,0.097719,1,1,1


In [46]:
census_data['gentrify_elig'].value_counts()

0    3528
1     842
Name: gentrify_elig, dtype: int64

In [47]:
census_data['test_two'].value_counts()

0    4359
1      11
Name: test_two, dtype: int64