# Retrieve data from PostGres to rank neighborhoods


# Import Dependencies

In [1]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect,join
from key import url
import numpy as np
import pandas as pd

# Connect to PostgreSQL

In [2]:
#Create the engine
engine = create_engine(url)

In [3]:
# reflect an existing database into a new model
Base = automap_base()

In [4]:
# reflect the tables
Base.prepare(engine, reflect=True)

In [5]:
session=Session(engine)

In [6]:
budget=100000

# Using Pandas for Data Analysis

In [7]:
#  Read the appraisal Table
appraisal=pd.read_sql_table('appraisal',engine)
appraisal.to_csv('appraisal.csv')
appraisal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38496 entries, 0 to 38495
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     38496 non-null  int64  
 1   account                38496 non-null  int64  
 2   land_value             38496 non-null  float64
 3   total_appraised_value  38496 non-null  float64
 4   total_market_value     38496 non-null  float64
 5   tax_year               38496 non-null  int64  
dtypes: float64(3), int64(3)
memory usage: 1.8 MB


In [8]:
# Calculate % of change of value between 2018 and 2019
appraisal_2018=appraisal.loc[appraisal.tax_year==2018,:]
appraisal_2019=appraisal.loc[appraisal.tax_year==2019,:]


In [9]:
appraisal_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19369 entries, 0 to 19368
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     19369 non-null  int64  
 1   account                19369 non-null  int64  
 2   land_value             19369 non-null  float64
 3   total_appraised_value  19369 non-null  float64
 4   total_market_value     19369 non-null  float64
 5   tax_year               19369 non-null  int64  
dtypes: float64(3), int64(3)
memory usage: 1.0 MB


In [10]:
appraisal_df=pd.merge(appraisal_2019,appraisal_2018,on='account', suffixes=('_2019','_2018'))
appraisal_df.to_csv('appraisal_df.csv')
appraisal_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19127 entries, 0 to 19126
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id_2019                     19127 non-null  int64  
 1   account                     19127 non-null  int64  
 2   land_value_2019             19127 non-null  float64
 3   total_appraised_value_2019  19127 non-null  float64
 4   total_market_value_2019     19127 non-null  float64
 5   tax_year_2019               19127 non-null  int64  
 6   id_2018                     19127 non-null  int64  
 7   land_value_2018             19127 non-null  float64
 8   total_appraised_value_2018  19127 non-null  float64
 9   total_market_value_2018     19127 non-null  float64
 10  tax_year_2018               19127 non-null  int64  
dtypes: float64(6), int64(5)
memory usage: 1.8 MB


In [11]:
appraisal_df['pct_value_change']=(appraisal_df['total_appraised_value_2019']-appraisal_df['total_appraised_value_2018'])\
                                  /appraisal_df['total_appraised_value_2018']*100
appraisal_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19127 entries, 0 to 19126
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id_2019                     19127 non-null  int64  
 1   account                     19127 non-null  int64  
 2   land_value_2019             19127 non-null  float64
 3   total_appraised_value_2019  19127 non-null  float64
 4   total_market_value_2019     19127 non-null  float64
 5   tax_year_2019               19127 non-null  int64  
 6   id_2018                     19127 non-null  int64  
 7   land_value_2018             19127 non-null  float64
 8   total_appraised_value_2018  19127 non-null  float64
 9   total_market_value_2018     19127 non-null  float64
 10  tax_year_2018               19127 non-null  int64  
 11  pct_value_change            19127 non-null  float64
dtypes: float64(7), int64(5)
memory usage: 1.9 MB


In [12]:
results_df=appraisal_df[['id_2019','account','total_appraised_value_2019', 'pct_value_change']]
results_df=results_df.rename(columns={'id_2019':'id'})


In [13]:
results_df.head()

Unnamed: 0,id,account,total_appraised_value_2019,pct_value_change
0,1,21750000003,299000.0,0.0
1,2,21750000013,815000.0,0.0
2,3,21750000018,296400.0,0.0
3,4,21750000019,299803.0,0.0
4,5,41410000034,330500.0,-7.162921


In [14]:
# Filter by budget on year 2019
results_df=results_df.loc[results_df.total_appraised_value_2019<=budget,:]
results_df.to_csv('results_df.csv')
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17 entries, 20 to 14133
Data columns (total 4 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          17 non-null     int64  
 1   account                     17 non-null     int64  
 2   total_appraised_value_2019  17 non-null     float64
 3   pct_value_change            17 non-null     float64
dtypes: float64(2), int64(2)
memory usage: 680.0 bytes


In [15]:
#Read properties table
properties_df=pd.read_sql_table('properties',engine)
properties_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19369 entries, 0 to 19368
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   account            19369 non-null  int64         
 1   latitude           19369 non-null  float64       
 2   longitude          19369 non-null  float64       
 3   address            19369 non-null  object        
 4   Zip_code           19369 non-null  int64         
 5   neighborhood_code  19369 non-null  float64       
 6   acreage            19369 non-null  float64       
 7   new_owner_date     19368 non-null  datetime64[ns]
 8   sq_ft              19366 non-null  float64       
 9   flood_description  15293 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 1.5+ MB


In [16]:
properties_df.to_csv('properties_df.csv')

In [17]:
results_df=pd.merge(results_df,properties_df,on="account")
results_df.head()

Unnamed: 0,id,account,total_appraised_value_2019,pct_value_change,latitude,longitude,address,Zip_code,neighborhood_code,acreage,new_owner_date,sq_ft,flood_description
0,22,50240000010,83954.0,9.999738,29.75373,-95.381,"1315 VICTOR ST HOUSTON, TEXAS 77019",77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD
1,30,50440000001,35918.0,9.999081,29.7555,-95.38322,"1308 GILLETTE ST HOUSTON, TEXAS 77019",77019,8317.0,0.0367,1997-03-22,1668.0,AREA OF MINIMAL FLOOD HAZARD
2,67,90870000017,53214.0,-7.121165,29.75707,-95.38324,"1020 GILLETTE ST HOUSTON, TEXAS 77019",77019,8317.0,0.0234,1988-01-02,756.0,AREA OF MINIMAL FLOOD HAZARD
3,68,90870000020,57554.0,-2.285229,29.75713,-95.38336,"1606 SAULNIER ST HOUSTON, TEXAS 77019",77019,8317.0,0.0252,2013-09-10,654.0,AREA OF MINIMAL FLOOD HAZARD
4,2416,552590390001,19343.0,0.0,29.68139,-95.3984,"2126 ENGELMOHR ST HOUSTON, TEXAS 77054",77054,7601.0,0.1217,2006-06-16,624.0,AREA OF MINIMAL FLOOD HAZARD


In [18]:
del results_df['latitude']
del results_df['longitude']
del results_df['address']

In [19]:
results_df.head()

Unnamed: 0,id,account,total_appraised_value_2019,pct_value_change,Zip_code,neighborhood_code,acreage,new_owner_date,sq_ft,flood_description
0,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD
1,30,50440000001,35918.0,9.999081,77019,8317.0,0.0367,1997-03-22,1668.0,AREA OF MINIMAL FLOOD HAZARD
2,67,90870000017,53214.0,-7.121165,77019,8317.0,0.0234,1988-01-02,756.0,AREA OF MINIMAL FLOOD HAZARD
3,68,90870000020,57554.0,-2.285229,77019,8317.0,0.0252,2013-09-10,654.0,AREA OF MINIMAL FLOOD HAZARD
4,2416,552590390001,19343.0,0.0,77054,7601.0,0.1217,2006-06-16,624.0,AREA OF MINIMAL FLOOD HAZARD


In [20]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17 entries, 0 to 16
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          17 non-null     int64         
 1   account                     17 non-null     int64         
 2   total_appraised_value_2019  17 non-null     float64       
 3   pct_value_change            17 non-null     float64       
 4   Zip_code                    17 non-null     int64         
 5   neighborhood_code           17 non-null     float64       
 6   acreage                     17 non-null     float64       
 7   new_owner_date              17 non-null     datetime64[ns]
 8   sq_ft                       17 non-null     float64       
 9   flood_description           17 non-null     object        
dtypes: datetime64[ns](1), float64(5), int64(3), object(1)
memory usage: 1.5+ KB


In [21]:
#Read crime table and merge to results
crime_df=pd.read_sql_table('crime',engine)
crime_df=crime_df.rename(columns={'Zip_Code':'Zip_code'})
crime_df.head()

Unnamed: 0,id,Incident,Date,Hour,NIBRS_Class,NIBRS_Description,Offense_Count,Premise,Block_Range,Street_Name,Street_Type,City,Zip_code
0,1,20519,2019-01-01,0,13A,Aggravated Assault,1,"Residence, Home (Includes Apartment)",4034,OSBY,DR,HOUSTON,77025
1,2,20519,2019-01-01,0,23H,All other larceny,1,"Residence, Home (Includes Apartment)",4034,OSBY,DR,HOUSTON,77025
2,3,20519,2019-01-01,0,290,"Destruction, damage, vandalism",1,"Residence, Home (Includes Apartment)",4034,OSBY,DR,HOUSTON,77025
3,4,20519,2019-01-01,0,35A,"Drug, narcotic violations",1,"Residence, Home (Includes Apartment)",4034,OSBY,DR,HOUSTON,77025
4,5,34819,2019-01-01,0,290,"Destruction, damage, vandalism",1,"Residence, Home (Includes Apartment)",4065,SILVERWOOD,DR,HOUSTON,77025


In [22]:
crime_aggr=crime_df.groupby(['Zip_code']).count()['Offense_Count']
crime_aggr_df=pd.DataFrame(crime_aggr)
crime_aggr_df.head()

Unnamed: 0_level_0,Offense_Count
Zip_code,Unnamed: 1_level_1
77002,4743
77005,1021
77006,3669
77019,2310
77025,2216


In [23]:
# crime_filtered_df=crime_df[['Zip_code','Offense_Count']]
# crime_filtered_df.info()

In [24]:
results_df=pd.merge(results_df,crime_aggr_df,on="Zip_code")
results_df.head()

Unnamed: 0,id,account,total_appraised_value_2019,pct_value_change,Zip_code,neighborhood_code,acreage,new_owner_date,sq_ft,flood_description,Offense_Count
0,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD,2310
1,30,50440000001,35918.0,9.999081,77019,8317.0,0.0367,1997-03-22,1668.0,AREA OF MINIMAL FLOOD HAZARD,2310
2,67,90870000017,53214.0,-7.121165,77019,8317.0,0.0234,1988-01-02,756.0,AREA OF MINIMAL FLOOD HAZARD,2310
3,68,90870000020,57554.0,-2.285229,77019,8317.0,0.0252,2013-09-10,654.0,AREA OF MINIMAL FLOOD HAZARD,2310
4,3037,571630020001,73268.0,9.998799,77019,8334.0,0.0,1990-08-31,720.0,AREA OF MINIMAL FLOOD HAZARD,2310


In [25]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17 entries, 0 to 16
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          17 non-null     int64         
 1   account                     17 non-null     int64         
 2   total_appraised_value_2019  17 non-null     float64       
 3   pct_value_change            17 non-null     float64       
 4   Zip_code                    17 non-null     int64         
 5   neighborhood_code           17 non-null     float64       
 6   acreage                     17 non-null     float64       
 7   new_owner_date              17 non-null     datetime64[ns]
 8   sq_ft                       17 non-null     float64       
 9   flood_description           17 non-null     object        
 10  Offense_Count               17 non-null     int64         
dtypes: datetime64[ns](1), float64(5), int64(4), object(1)
memory

In [26]:
#Read property_school table and merge to results
property_school_df=pd.read_sql_table('property_school',engine)
property_school_df.head()

Unnamed: 0,account,school_id,school_type
0,21750000003,101912110,Elementary
1,21750000003,101912467,Middle
2,21750000003,101912025,High
3,21750000013,101912110,Elementary
4,21750000013,101912463,Middle


In [27]:
property_school_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58107 entries, 0 to 58106
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   account      58107 non-null  int64 
 1   school_id    58107 non-null  int64 
 2   school_type  58107 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.3+ MB


In [28]:
results_df=pd.merge(results_df,property_school_df,on="account")
results_df.head()

Unnamed: 0,id,account,total_appraised_value_2019,pct_value_change,Zip_code,neighborhood_code,acreage,new_owner_date,sq_ft,flood_description,Offense_Count,school_id,school_type
0,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD,2310,101912058,Elementary
1,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD,2310,101912058,Middle
2,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD,2310,101912322,High
3,30,50440000001,35918.0,9.999081,77019,8317.0,0.0367,1997-03-22,1668.0,AREA OF MINIMAL FLOOD HAZARD,2310,101912058,Elementary
4,30,50440000001,35918.0,9.999081,77019,8317.0,0.0367,1997-03-22,1668.0,AREA OF MINIMAL FLOOD HAZARD,2310,101912058,Middle


In [29]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          51 non-null     int64         
 1   account                     51 non-null     int64         
 2   total_appraised_value_2019  51 non-null     float64       
 3   pct_value_change            51 non-null     float64       
 4   Zip_code                    51 non-null     int64         
 5   neighborhood_code           51 non-null     float64       
 6   acreage                     51 non-null     float64       
 7   new_owner_date              51 non-null     datetime64[ns]
 8   sq_ft                       51 non-null     float64       
 9   flood_description           51 non-null     object        
 10  Offense_Count               51 non-null     int64         
 11  school_id                   51 non-null     int64         
 

In [30]:
#Read school table and merge to results
school_df=pd.read_sql_table('school',engine)
school_df.head()

Unnamed: 0,school_id,school_type,name,address,city,zip_code,district_id,latitude,longitude,school_rating
0,101907107,Elementary,ADAM ELEMENTARY,11303 HONEYGROVE LN,HOUSTON,77065,1,29.926556,-95.603242,85
1,101905043,Middle,AGUIRRE JUNIOR HIGH,15726 WALLISVILLE RD,HOUSTON,77049,2,29.809586,-95.156563,85
2,101911101,Elementary,ALAMO ELEMENTARY,6100 N MAIN,BAYTOWN,77521,3,29.79278,-94.963885,95
3,101903045,Middle,ALBRIGHT MIDDLE,6315 WINKLEMAN,HOUSTON,77083,4,29.709561,-95.654675,95
4,101912102,Elementary,ALCOTT ELEMENTARY,5859 BELLFORT,HOUSTON,77033,5,29.667765,-95.329295,85


In [31]:
school_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822 entries, 0 to 821
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   school_id      822 non-null    int64  
 1   school_type    822 non-null    object 
 2   name           822 non-null    object 
 3   address        822 non-null    object 
 4   city           822 non-null    object 
 5   zip_code       822 non-null    object 
 6   district_id    822 non-null    int64  
 7   latitude       822 non-null    float64
 8   longitude      822 non-null    float64
 9   school_rating  822 non-null    int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 64.3+ KB


In [32]:
results_df=pd.merge(results_df,school_df,on=['school_id','school_type'])
results_df.head()

Unnamed: 0,id,account,total_appraised_value_2019,pct_value_change,Zip_code,neighborhood_code,acreage,new_owner_date,sq_ft,flood_description,...,school_id,school_type,name,address,city,zip_code,district_id,latitude,longitude,school_rating
0,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD,...,101912058,Elementary,GREGORY-LINCOLN EDUCATIONAL CENTER ELEMENTARY,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65
1,30,50440000001,35918.0,9.999081,77019,8317.0,0.0367,1997-03-22,1668.0,AREA OF MINIMAL FLOOD HAZARD,...,101912058,Elementary,GREGORY-LINCOLN EDUCATIONAL CENTER ELEMENTARY,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65
2,67,90870000017,53214.0,-7.121165,77019,8317.0,0.0234,1988-01-02,756.0,AREA OF MINIMAL FLOOD HAZARD,...,101912058,Elementary,GREGORY-LINCOLN EDUCATIONAL CENTER ELEMENTARY,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65
3,68,90870000020,57554.0,-2.285229,77019,8317.0,0.0252,2013-09-10,654.0,AREA OF MINIMAL FLOOD HAZARD,...,101912058,Elementary,GREGORY-LINCOLN EDUCATIONAL CENTER ELEMENTARY,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65
4,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD,...,101912058,Middle,GREGORY-LINCOLN EDUCATIONAL CENTER MIDDLE,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65


In [33]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          51 non-null     int64         
 1   account                     51 non-null     int64         
 2   total_appraised_value_2019  51 non-null     float64       
 3   pct_value_change            51 non-null     float64       
 4   Zip_code                    51 non-null     int64         
 5   neighborhood_code           51 non-null     float64       
 6   acreage                     51 non-null     float64       
 7   new_owner_date              51 non-null     datetime64[ns]
 8   sq_ft                       51 non-null     float64       
 9   flood_description           51 non-null     object        
 10  Offense_Count               51 non-null     int64         
 11  school_id                   51 non-null     int64         
 

In [34]:
#Read flood_zone table and merge to results
# flood_zone_df=pd.read_sql_table('flood_zone',engine)
# flood_zone_df.head()

In [35]:
#Add flood ranking
#3- High Risk
#2 - Medium Risk
#1- Low Risk

results_df['flood_risk']=np.where(results_df['flood_description']=='AREA OF MINIMAL FLOOD HAZARD',1," ")
results_df.loc[(results_df['flood_description']=='0.2 PCT ANNUAL CHANCE FLOOD HAZARD'),'flood_risk']=2
results_df.loc[(results_df['flood_description']=='FLOODWAY'),'flood_risk']=3
results_df.loc[(results_df['flood_description']=='High-Risk Flood Zone'),'flood_risk']=3
results_df.head(50)

Unnamed: 0,id,account,total_appraised_value_2019,pct_value_change,Zip_code,neighborhood_code,acreage,new_owner_date,sq_ft,flood_description,...,school_type,name,address,city,zip_code,district_id,latitude,longitude,school_rating,flood_risk
0,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD,...,Elementary,GREGORY-LINCOLN EDUCATIONAL CENTER ELEMENTARY,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65,1
1,30,50440000001,35918.0,9.999081,77019,8317.0,0.0367,1997-03-22,1668.0,AREA OF MINIMAL FLOOD HAZARD,...,Elementary,GREGORY-LINCOLN EDUCATIONAL CENTER ELEMENTARY,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65,1
2,67,90870000017,53214.0,-7.121165,77019,8317.0,0.0234,1988-01-02,756.0,AREA OF MINIMAL FLOOD HAZARD,...,Elementary,GREGORY-LINCOLN EDUCATIONAL CENTER ELEMENTARY,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65,1
3,68,90870000020,57554.0,-2.285229,77019,8317.0,0.0252,2013-09-10,654.0,AREA OF MINIMAL FLOOD HAZARD,...,Elementary,GREGORY-LINCOLN EDUCATIONAL CENTER ELEMENTARY,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65,1
4,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD,...,Middle,GREGORY-LINCOLN EDUCATIONAL CENTER MIDDLE,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65,1
5,30,50440000001,35918.0,9.999081,77019,8317.0,0.0367,1997-03-22,1668.0,AREA OF MINIMAL FLOOD HAZARD,...,Middle,GREGORY-LINCOLN EDUCATIONAL CENTER MIDDLE,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65,1
6,67,90870000017,53214.0,-7.121165,77019,8317.0,0.0234,1988-01-02,756.0,AREA OF MINIMAL FLOOD HAZARD,...,Middle,GREGORY-LINCOLN EDUCATIONAL CENTER MIDDLE,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65,1
7,68,90870000020,57554.0,-2.285229,77019,8317.0,0.0252,2013-09-10,654.0,AREA OF MINIMAL FLOOD HAZARD,...,Middle,GREGORY-LINCOLN EDUCATIONAL CENTER MIDDLE,1101 TAFT,HOUSTON,77019,5,29.756187,-95.384952,65,1
8,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,AREA OF MINIMAL FLOOD HAZARD,...,High,CARNEGIE VANGUARD HIGH,1501 TAFT,HOUSTON,77019,5,29.75432,-95.38568,95,1
9,30,50440000001,35918.0,9.999081,77019,8317.0,0.0367,1997-03-22,1668.0,AREA OF MINIMAL FLOOD HAZARD,...,High,CARNEGIE VANGUARD HIGH,1501 TAFT,HOUSTON,77019,5,29.75432,-95.38568,95,1


In [36]:
del results_df['flood_description']
del results_df['name']
del results_df['address']
del results_df['city']
del results_df['zip_code']
del results_df['district_id']
del results_df['latitude']
del results_df['longitude']

In [37]:
results_df.head()

Unnamed: 0,id,account,total_appraised_value_2019,pct_value_change,Zip_code,neighborhood_code,acreage,new_owner_date,sq_ft,Offense_Count,school_id,school_type,school_rating,flood_risk
0,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,2310,101912058,Elementary,65,1
1,30,50440000001,35918.0,9.999081,77019,8317.0,0.0367,1997-03-22,1668.0,2310,101912058,Elementary,65,1
2,67,90870000017,53214.0,-7.121165,77019,8317.0,0.0234,1988-01-02,756.0,2310,101912058,Elementary,65,1
3,68,90870000020,57554.0,-2.285229,77019,8317.0,0.0252,2013-09-10,654.0,2310,101912058,Elementary,65,1
4,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,2310,101912058,Middle,65,1


In [38]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          51 non-null     int64         
 1   account                     51 non-null     int64         
 2   total_appraised_value_2019  51 non-null     float64       
 3   pct_value_change            51 non-null     float64       
 4   Zip_code                    51 non-null     int64         
 5   neighborhood_code           51 non-null     float64       
 6   acreage                     51 non-null     float64       
 7   new_owner_date              51 non-null     datetime64[ns]
 8   sq_ft                       51 non-null     float64       
 9   Offense_Count               51 non-null     int64         
 10  school_id                   51 non-null     int64         
 11  school_type                 51 non-null     object        
 

In [39]:
#Count the house sale per neighborhood in 2019
sales2019=results_df.loc[results_df.new_owner_date>'2018-12-31']
sales2019.head()

Unnamed: 0,id,account,total_appraised_value_2019,pct_value_change,Zip_code,neighborhood_code,acreage,new_owner_date,sq_ft,Offense_Count,school_id,school_type,school_rating,flood_risk


In [40]:
results_df['sales2019']=np.where(results_df['new_owner_date']>'2018-12-31',1,0)

In [41]:
results_df.head()

Unnamed: 0,id,account,total_appraised_value_2019,pct_value_change,Zip_code,neighborhood_code,acreage,new_owner_date,sq_ft,Offense_Count,school_id,school_type,school_rating,flood_risk,sales2019
0,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,2310,101912058,Elementary,65,1,0
1,30,50440000001,35918.0,9.999081,77019,8317.0,0.0367,1997-03-22,1668.0,2310,101912058,Elementary,65,1,0
2,67,90870000017,53214.0,-7.121165,77019,8317.0,0.0234,1988-01-02,756.0,2310,101912058,Elementary,65,1,0
3,68,90870000020,57554.0,-2.285229,77019,8317.0,0.0252,2013-09-10,654.0,2310,101912058,Elementary,65,1,0
4,22,50240000010,83954.0,9.999738,77019,8317.0,0.0878,2003-05-04,1754.0,2310,101912058,Middle,65,1,0


In [42]:
sales=results_df.groupby('neighborhood_code')['sales2019'].count()

In [43]:
sales=pd.DataFrame(sales)
sales=sales.rename(columns={'sales2019':'sales_neighborhood_2019'})
sales.head()

Unnamed: 0_level_0,sales_neighborhood_2019
neighborhood_code,Unnamed: 1_level_1
7601.0,6
7609.0,27
7617.01,3
8317.0,12
8334.0,3


In [44]:
results_df=pd.merge(results_df,sales, on="neighborhood_code")

In [45]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          51 non-null     int64         
 1   account                     51 non-null     int64         
 2   total_appraised_value_2019  51 non-null     float64       
 3   pct_value_change            51 non-null     float64       
 4   Zip_code                    51 non-null     int64         
 5   neighborhood_code           51 non-null     float64       
 6   acreage                     51 non-null     float64       
 7   new_owner_date              51 non-null     datetime64[ns]
 8   sq_ft                       51 non-null     float64       
 9   Offense_Count               51 non-null     int64         
 10  school_id                   51 non-null     int64         
 11  school_type                 51 non-null     object        
 

In [46]:
del results_df['sales2019']

In [47]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          51 non-null     int64         
 1   account                     51 non-null     int64         
 2   total_appraised_value_2019  51 non-null     float64       
 3   pct_value_change            51 non-null     float64       
 4   Zip_code                    51 non-null     int64         
 5   neighborhood_code           51 non-null     float64       
 6   acreage                     51 non-null     float64       
 7   new_owner_date              51 non-null     datetime64[ns]
 8   sq_ft                       51 non-null     float64       
 9   Offense_Count               51 non-null     int64         
 10  school_id                   51 non-null     int64         
 11  school_type                 51 non-null     object        
 