# SECTION 1: HOUSING DATA CLEANING AND ANALYSIS


###### Importing Libraries, Intitial Look at Housing Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import seaborn as sns
import numpy as np
import math

In [2]:
df = pd.read_csv('/Users/admin/Documents/Flatiron/Housing_Prices_and_Flood_Risk/data/kc_house_data.csv')

In [3]:
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'greenbelt', 'nuisance', 'view',
       'condition', 'grade', 'heat_source', 'sewer_system', 'sqft_above',
       'sqft_basement', 'sqft_garage', 'sqft_patio', 'yr_built',
       'yr_renovated', 'address', 'lat', 'long'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30155 entries, 0 to 30154
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             30155 non-null  int64  
 1   date           30155 non-null  object 
 2   price          30155 non-null  float64
 3   bedrooms       30155 non-null  int64  
 4   bathrooms      30155 non-null  float64
 5   sqft_living    30155 non-null  int64  
 6   sqft_lot       30155 non-null  int64  
 7   floors         30155 non-null  float64
 8   waterfront     30155 non-null  object 
 9   greenbelt      30155 non-null  object 
 10  nuisance       30155 non-null  object 
 11  view           30155 non-null  object 
 12  condition      30155 non-null  object 
 13  grade          30155 non-null  object 
 14  heat_source    30123 non-null  object 
 15  sewer_system   30141 non-null  object 
 16  sqft_above     30155 non-null  int64  
 17  sqft_basement  30155 non-null  int64  
 18  sqft_g

In [5]:
df['address'].sample

<bound method NDFrame.sample of 0        2102 Southeast 21st Court, Renton, Washington ...
1        11231 Greenwood Avenue North, Seattle, Washing...
2        8504 South 113th Street, Seattle, Washington 9...
3        4079 Letitia Avenue South, Seattle, Washington...
4        2193 Northwest Talus Drive, Issaquah, Washingt...
                               ...                        
30150    4673 Eastern Avenue North, Seattle, Washington...
30151    4131 44th Avenue Southwest, Seattle, Washingto...
30152    910 Martin Luther King Jr Way, Seattle, Washin...
30153    17127 114th Avenue Southeast, Renton, Washingt...
30154    18615 7th Avenue South, Burien, Washington 981...
Name: address, Length: 30155, dtype: object>

In [6]:
df.describe(include="all")

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,...,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,address,lat,long
count,30155.0,30155,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155,30155,...,30141,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155,30155.0,30155.0
unique,,365,,,,,,,2,2,...,4,,,,,,,29560,,
top,,7/1/2021,,,,,,,NO,NO,...,PUBLIC,,,,,,,"Avenue, 108 Foothill Blvd, Rancho Cucamonga, C...",,
freq,,196,,,,,,,29636,29382,...,25777,,,,,,,38,,
mean,4538104000.0,,1108536.0,3.41353,2.334737,2112.424739,16723.6,1.543492,,,...,,1809.826098,476.039396,330.211142,217.412038,1975.163953,90.922301,,47.328076,-121.317397
std,2882587000.0,,896385.7,0.981612,0.889556,974.044318,60382.6,0.567717,,,...,,878.306131,579.631302,285.770536,245.302792,32.067362,416.473038,,1.434005,5.725475
min,1000055.0,,27360.0,0.0,0.0,3.0,402.0,1.0,,,...,,2.0,0.0,0.0,0.0,1900.0,0.0,,21.27424,-157.79148
25%,2064175000.0,,648000.0,3.0,2.0,1420.0,4850.0,1.0,,,...,,1180.0,0.0,0.0,40.0,1953.0,0.0,,47.40532,-122.326045
50%,3874011000.0,,860000.0,3.0,2.5,1920.0,7480.0,1.5,,,...,,1560.0,0.0,400.0,150.0,1977.0,0.0,,47.55138,-122.225585
75%,7287100000.0,,1300000.0,4.0,3.0,2619.5,10579.0,2.0,,,...,,2270.0,940.0,510.0,320.0,2003.0,0.0,,47.669913,-122.116205


Should be able to drop nulls, since they are only a small ratio and only in 2 columns 
(heat_source and sewer_system). need to check and make sure there aren't other null values (encoded, for instance,
as zeroes)

Location is of course going to be fundamental, so need to parse the address data as separate strings to. May also be able to use lat/long, but must be mindful of this creating collinearity

Given interest in climate change effects (pre-determined), flooding might be easiest to look at. do these prices relate to flood-prone areas? how might these values shift given the effects of climate change? look at "greenbelt", "grade", "lat", "long", "waterfront" etc. We can't do a lot of EDA here since we are really looking for correspondences with data that doesn't exist yet.

Random sampling indicates the coordinates do match up exactly with street addresses

Got relevant flood data as a csv file. now need to clean this data so we can merge on a common column, which
in this case in zip code

# Initial cleaning for merge

In [7]:

df[['street address', 'city', 'state','zipcode']] = df['address'].str.split(',', expand=True)
df['street address'] = df['street address'].str.strip()
df['city'] = df['city'].str.strip()
df['state'] = df['state'].str.strip()
df['zipcode'] = df['zipcode'].str.strip()

df.to_csv('housing_cleaned_addresses.csv', index=False)


ValueError: Columns must be same length as key