In [16]:
import os
import joblib
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import sklearn
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [17]:
# Directory holding the census CSV extracts, given relative to the notebook's working directory.
data_path = os.path.join('datasets', 'nyc education + jobs')

def load_data(filename, path):
    """Read the CSV file ``filename`` located in directory ``path``.

    Args:
        filename: Name of the CSV file.
        path: Directory that contains the file.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(os.path.join(path, filename))

# Load the 2010 and 2018 census extracts from the shared data directory.
census_2010, census_2018 = (
    load_data(csv_name, data_path)
    for csv_name in ("nyc_census_2010.csv", "nyc_census_2018.csv")
)

In [18]:
# The same set of columns is removed from both years so the two frames keep
# an identical layout (required by the positional renaming done afterwards).
dropped_columns = ['year', 'NAME', 'state', 'county', 'tract', 'B08528_010E']
census_2010 = census_2010.drop(columns=dropped_columns)
census_2018 = census_2018.drop(columns=dropped_columns)

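Price-level adjustment between 2010 and 2018: 13.5%.
To compare home values across years, 2018 prices are scaled down by that amount: 2018 price × 0.865 = 2010-equivalent price (0.865 = 1 − 0.135).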

In [19]:
def column_names(dataframe, year):
    """Relabel all columns of ``dataframe`` in place with year-prefixed names.

    The first column is named ``'geoid'``; the following 22 columns are named
    ``<year>_<field>`` in the fixed order listed below. Labels are assigned
    positionally, so the frame is expected to hold exactly 23 columns in that
    order.

    Args:
        dataframe: Frame whose ``columns`` attribute is overwritten.
        year: String prefix (e.g. ``'2010'``) joined to each field with ``'_'``.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    fields = (
        'population', 'household_income', 'home_value',
        'no_nonh_caucasians', 'no_nonh_blacks/aas', 'no_amerinds_alskns',
        'no_nonh_asians', 'no_nonh_hawaii_pacific', 'no_nonh_others',
        'no_nonh_multi', 'no_hisp_latin',
        'construction', 'retail', 'information', 'finance',
        'edu_health_social', 'arts_recreation_accom', 'public_admin',
        'armed_forces', 'bachelors', 'male<25_bachelors',
        'female<25_bachelors',
    )
    dataframe.columns = ['geoid'] + [year + '_' + field for field in fields]

# Relabel both frames in place; each gets its own year prefix so the columns
# stay distinguishable once the two years are joined.
column_names(dataframe=census_2010, year='2010')
column_names(dataframe=census_2018, year='2018')

In [20]:
census_2010.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4701 entries, 0 to 4700
Data columns (total 23 columns):
geoid                          4701 non-null int64
2010_population                4701 non-null float64
2010_household_income          4701 non-null float64
2010_home_value                4701 non-null float64
2010_no_nonh_caucasians        4701 non-null float64
2010_no_nonh_blacks/aas        4701 non-null float64
2010_no_amerinds_alskns        4701 non-null float64
2010_no_nonh_asians            4701 non-null float64
2010_no_nonh_hawaii_pacific    4701 non-null float64
2010_no_nonh_others            4701 non-null float64
2010_no_nonh_multi             4701 non-null float64
2010_no_hisp_latin             4701 non-null float64
2010_construction              4701 non-null float64
2010_retail                    4701 non-null float64
2010_information               4701 non-null float64
2010_finance                   4701 non-null float64
2010_edu_health_social         4701 non-null fl

In [21]:
census_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4700 entries, 0 to 4699
Data columns (total 23 columns):
geoid                          4700 non-null int64
2018_population                4700 non-null float64
2018_household_income          4700 non-null float64
2018_home_value                4700 non-null float64
2018_no_nonh_caucasians        4700 non-null float64
2018_no_nonh_blacks/aas        4700 non-null float64
2018_no_amerinds_alskns        4700 non-null float64
2018_no_nonh_asians            4700 non-null float64
2018_no_nonh_hawaii_pacific    4700 non-null float64
2018_no_nonh_others            4700 non-null float64
2018_no_nonh_multi             4700 non-null float64
2018_no_hisp_latin             4700 non-null float64
2018_construction              4700 non-null float64
2018_retail                    4700 non-null float64
2018_information               4700 non-null float64
2018_finance                   4700 non-null float64
2018_edu_health_social         4700 non-null fl

In [22]:
# Join the two years on the tract identifier. The default join is an inner
# join, so only geoids present in both frames are kept.
census_data = census_2010.merge(census_2018, on='geoid')

In [23]:
census_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4700 entries, 0 to 4699
Data columns (total 45 columns):
geoid                          4700 non-null int64
2010_population                4700 non-null float64
2010_household_income          4700 non-null float64
2010_home_value                4700 non-null float64
2010_no_nonh_caucasians        4700 non-null float64
2010_no_nonh_blacks/aas        4700 non-null float64
2010_no_amerinds_alskns        4700 non-null float64
2010_no_nonh_asians            4700 non-null float64
2010_no_nonh_hawaii_pacific    4700 non-null float64
2010_no_nonh_others            4700 non-null float64
2010_no_nonh_multi             4700 non-null float64
2010_no_hisp_latin             4700 non-null float64
2010_construction              4700 non-null float64
2010_retail                    4700 non-null float64
2010_information               4700 non-null float64
2010_finance                   4700 non-null float64
2010_edu_health_social         4700 non-null fl

In [24]:
census_data['2018_home_value']

0        710600.0
1       2000001.0
2        466600.0
3        631000.0
4        332800.0
          ...    
4695     161100.0
4696     158100.0
4697     125600.0
4698     141600.0
4699     161200.0
Name: 2018_home_value, Length: 4700, dtype: float64

In [25]:
# Scale 2018 home values by 0.865 to express them at 2010 price levels.
# NOTE: this overwrites the column, so re-running the cell applies the factor again.
census_data['2018_home_value'] = census_data['2018_home_value'].mul(0.865)

In [26]:
census_data['2018_home_value']

0        614669.000
1       1730000.865
2        403609.000
3        545815.000
4        287872.000
           ...     
4695     139351.500
4696     136756.500
4697     108644.000
4698     122484.000
4699     139438.000
Name: 2018_home_value, Length: 4700, dtype: float64

In [27]:
# Total bachelor's count minus the male and female under-25 counts.
# Presumably this leaves degree holders aged 25 and over (the column label
# says ">25") -- confirm against the census variable definitions.
census_data['2018_>25bachelors'] = (
    census_data['2018_bachelors']
    .sub(census_data['2018_male<25_bachelors'])
    .sub(census_data['2018_female<25_bachelors'])
)
census_data['2010_>25bachelors'] = (
    census_data['2010_bachelors']
    .sub(census_data['2010_male<25_bachelors'])
    .sub(census_data['2010_female<25_bachelors'])
)

In [28]:
# Keep only tracts that have usable values in BOTH years: strictly positive
# population, household income and home value, and a non-negative derived
# ">25 bachelors" count. Rows failing any one condition are dropped; this is
# the same selection as applying the eight filters one after another.
valid_rows = (
    (census_data['2010_population'] > 0)
    & (census_data['2010_household_income'] > 0)
    & (census_data['2010_home_value'] > 0)
    & (census_data['2010_>25bachelors'] >= 0)
    & (census_data['2018_population'] > 0)
    & (census_data['2018_household_income'] > 0)
    & (census_data['2018_home_value'] > 0)
    & (census_data['2018_>25bachelors'] >= 0)
)
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
census_data = census_data[valid_rows].copy()

In [29]:
# Percent change in home value from 2010 to 2018, relative to the 2010 value.
home_value_2010 = census_data['2010_home_value']
home_value_delta = census_data['2018_home_value'] - home_value_2010
census_data['percent_change_home_value'] = 100 * home_value_delta / home_value_2010

In [30]:
census_data['percent_change_home_value']

0      -17.093472
1       72.999914
2      -23.890439
3      -24.276498
4      -21.624830
          ...    
4695   -19.774611
4696   -36.273765
4697   -36.981439
4698   -32.329282
4699   -29.219289
Name: percent_change_home_value, Length: 4370, dtype: float64

In [31]:
# ">25 bachelors" count divided by tract population, per year.
# Despite "percent" in the column names, the values are fractions (not multiplied by 100).
census_data['2018_percent_>25bachelors'] = census_data['2018_>25bachelors'].div(
    census_data['2018_population']
)
census_data['2010_percent_>25bachelors'] = census_data['2010_>25bachelors'].div(
    census_data['2010_population']
)

In [32]:
census_data['2018_percent_>25bachelors']

0       0.252292
1       0.313285
2       0.251109
3       0.224265
4       0.199845
          ...   
4695    0.070352
4696    0.086196
4697    0.095834
4698    0.108200
4699    0.096890
Name: 2018_percent_>25bachelors, Length: 4370, dtype: float64

In [33]:
# Absolute change (2018 minus 2010) in the ">25 bachelors" population share.
census_data['change_percent_>25bachelors'] = census_data['2018_percent_>25bachelors'].sub(
    census_data['2010_percent_>25bachelors']
)

In [34]:
census_data['change_percent_>25bachelors']

0       0.030841
1       0.023565
2       0.135513
3      -0.018079
4       0.061081
          ...   
4695   -0.042749
4696    0.045309
4697    0.017939
4698    0.002024
4699    0.046221
Name: change_percent_>25bachelors, Length: 4370, dtype: float64

In [51]:
# Thresholds used by the "test two" flag.
# NOTE: the variable names say "97_percentile", but both values are the 0.66
# quantile (66th percentile). The names are kept because other cells refer to them.
growth_quantile = 0.66
education_97_percentile = census_data['change_percent_>25bachelors'].quantile(growth_quantile)
value_97_percentile = census_data['percent_change_home_value'].quantile(growth_quantile)

In [52]:
education_97_percentile

0.02772383331732397

In [53]:
value_97_percentile

-8.284639050441829

In [54]:
# Eligibility flag: a tract is eligible (1) when, in 2010, its household
# income and its home value were both at or below the 40th percentile across
# tracts and its population was at least 500; otherwise 0.
income_40_percentile = census_data['2010_household_income'].quantile(0.4)
value_40_percentile = census_data['2010_home_value'].quantile(0.4)

is_eligible = (
    census_data['2010_household_income'].le(income_40_percentile)
    & census_data['2010_home_value'].le(value_40_percentile)
    & census_data['2010_population'].ge(500)
)

census_data['gentrify_elig'] = 0
census_data.loc[is_eligible, 'gentrify_elig'] = 1

In [55]:
# "Test two" flag: 1 when the change in ">25 bachelors" share and the percent
# change in home value both reach their precomputed thresholds AND the home
# value change is strictly positive; otherwise 0.
passes_test_two = (
    census_data['change_percent_>25bachelors'].ge(education_97_percentile)
    & census_data['percent_change_home_value'].ge(value_97_percentile)
    & census_data['percent_change_home_value'].gt(0)
)

census_data['test_two'] = 0
census_data.loc[passes_test_two, 'test_two'] = 1

In [56]:
# Final label: a tract counts as gentrified (1) only when it was eligible
# AND it passed test two; every other tract is 0.
both_flags_set = census_data['gentrify_elig'].eq(1) & census_data['test_two'].eq(1)

census_data['gentrified'] = 0
census_data.loc[both_flags_set, 'gentrified'] = 1

In [57]:
census_data['gentrified']

0       0
1       0
2       0
3       0
4       0
       ..
4695    0
4696    0
4697    0
4698    0
4699    0
Name: gentrified, Length: 4370, dtype: int64

In [58]:
census_data['gentrified'].value_counts()

0    4305
1      65
Name: gentrified, dtype: int64

In [59]:
census_data.loc[census_data['gentrified'] == 1]

Unnamed: 0,geoid,2010_population,2010_household_income,2010_home_value,2010_no_nonh_caucasians,2010_no_nonh_blacks/aas,2010_no_amerinds_alskns,2010_no_nonh_asians,2010_no_nonh_hawaii_pacific,2010_no_nonh_others,...,2018_female<25_bachelors,2018_>25bachelors,2010_>25bachelors,percent_change_home_value,2018_percent_>25bachelors,2010_percent_>25bachelors,change_percent_>25bachelors,test_two,gentrify_elig,gentrified
240,34013007600,2988.0,41447.0,356300.0,1662.0,11.0,41.0,68.0,0.0,188.0,...,15.0,163.0,39.0,16.336795,0.048817,0.013052,0.035765,1,1,1
247,34013008700,4142.0,39648.0,239800.0,170.0,423.0,50.0,11.0,0.0,0.0,...,0.0,375.0,143.0,3.309425,0.110327,0.034524,0.075802,1,1,1
406,34017001900,1776.0,31543.0,247900.0,294.0,26.0,0.0,1069.0,0.0,0.0,...,71.0,300.0,160.0,13.681727,0.202566,0.090090,0.112476,1,1,1
412,34017002800,5551.0,29310.0,291300.0,1491.0,1224.0,39.0,1426.0,0.0,12.0,...,0.0,1013.0,611.0,1.050292,0.179387,0.110070,0.069317,1,1,1
447,34017007100,2747.0,34438.0,303700.0,797.0,294.0,31.0,752.0,0.0,118.0,...,23.0,702.0,388.0,60.211557,0.211893,0.141245,0.070648,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3852,36081099701,2562.0,57382.0,374300.0,1674.0,24.0,0.0,630.0,0.0,0.0,...,70.0,692.0,258.0,20.841170,0.229595,0.100703,0.128893,1,1,1
3961,36085000300,1732.0,46731.0,345800.0,788.0,201.0,13.0,318.0,0.0,0.0,...,12.0,396.0,238.0,10.839069,0.201322,0.137413,0.063908,1,1,1
3971,36085002100,3491.0,36131.0,308300.0,889.0,849.0,106.0,299.0,0.0,0.0,...,28.0,398.0,211.0,3.698994,0.098005,0.060441,0.037564,1,1,1
4543,36119002203,1665.0,35417.0,141400.0,1230.0,178.0,0.0,73.0,0.0,0.0,...,0.0,396.0,147.0,9.134371,0.227064,0.088288,0.138776,1,1,1


In [60]:
census_data['gentrify_elig'].value_counts()

0    3528
1     842
Name: gentrify_elig, dtype: int64

In [61]:
census_data['test_two'].value_counts()

0    4001
1     369
Name: test_two, dtype: int64

In [62]:
# Pairwise correlations between all columns of census_data; the matrix is
# reused by the following cells. The last line displays every column's
# correlation with the eligibility flag, strongest positive first.
corr_matrix = census_data.corr()
corr_matrix["gentrify_elig"].sort_values(ascending=False)

gentrify_elig                  1.000000
2018_no_hisp_latin             0.338117
2010_no_hisp_latin             0.321773
2010_no_nonh_blacks/aas        0.268569
2018_no_nonh_blacks/aas        0.253479
gentrified                     0.251523
2018_arts_recreation_accom     0.088252
2018_no_amerinds_alskns        0.087745
2010_no_amerinds_alskns        0.054162
2018_no_nonh_others            0.049124
2018_retail                    0.047614
percent_change_home_value      0.046813
2010_arts_recreation_accom     0.043675
2010_no_nonh_others            0.038822
2018_population                0.037102
2018_construction              0.028580
2010_armed_forces              0.027868
2010_population                0.023721
2010_construction              0.010709
2018_no_nonh_hawaii_pacific    0.000167
2010_retail                   -0.000540
2018_armed_forces             -0.003987
2010_no_nonh_hawaii_pacific   -0.005709
test_two                      -0.012725
2010_no_nonh_multi            -0.021541


In [63]:
corr_matrix["gentrified"].sort_values(ascending=False)

gentrified                     1.000000
test_two                       0.404614
gentrify_elig                  0.251523
2010_no_hisp_latin             0.128952
change_percent_>25bachelors    0.123246
2018_no_hisp_latin             0.113287
2018_arts_recreation_accom     0.106724
percent_change_home_value      0.097033
2010_no_nonh_blacks/aas        0.059090
2010_arts_recreation_accom     0.055381
2018_no_nonh_blacks/aas        0.054278
2010_no_amerinds_alskns        0.048064
geoid                          0.043401
2010_armed_forces              0.026268
2018_male<25_bachelors         0.023399
2010_no_nonh_hawaii_pacific    0.022025
2010_no_nonh_multi             0.020334
2018_population                0.016686
2010_female<25_bachelors       0.016320
2010_population                0.013971
2018_no_nonh_others            0.013744
2018_no_amerinds_alskns        0.009153
2018_edu_health_social         0.008342
2010_edu_health_social         0.005764
2010_retail                    0.004073


In [64]:
corr_matrix['test_two'].sort_values(ascending=False)

test_two                       1.000000
gentrified                     0.404614
change_percent_>25bachelors    0.376332
2018_home_value                0.358023
2018_male<25_bachelors         0.226632
2018_information               0.195081
2018_percent_>25bachelors      0.179045
2010_home_value                0.162763
2010_male<25_bachelors         0.152639
2010_female<25_bachelors       0.144463
percent_change_home_value      0.135492
geoid                          0.125240
2018_bachelors                 0.107685
2010_information               0.090843
2018_finance                   0.084092
2018_no_nonh_multi             0.080118
2018_arts_recreation_accom     0.077433
2018_>25bachelors              0.067863
2018_no_nonh_asians            0.060827
2010_arts_recreation_accom     0.060225
2018_female<25_bachelors       0.049593
2010_no_nonh_asians            0.035481
2010_finance                   0.034136
2010_no_nonh_multi             0.031734
2010_no_nonh_blacks/aas        0.028445


In [65]:
# Write the labelled frame to the current working directory; index=False
# leaves the row index out of the file.
census_data.to_csv(path_or_buf='test_one__test_two.csv', index=False)