Cluster Analysis

In [1]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides deprecation and dtype warnings
# from pandas/numpy; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from acquire_zillow import get_connection
from acquire_zillow import get_zillow_data
from acquire_mall import get_mall_data
from prepare_zillow import get_numeric_columns
from prepare_zillow import change_numeric_2str
from prepare_zillow import matching_strings_inrows
from prepare_zillow import delete_missing_byrow
from prepare_zillow import delete_missing_bycolumn
from prepare_zillow import replace_nulls

import env


Either use `get_zillow_data` to connect to SQL and load the data directly into a DataFrame (and then write it to a CSV in the working directory), or read the existing CSV into a DataFrame.

In [2]:
# One-time acquisition step, left commented out so routine runs do not hit the
# database: uncomment both lines to pull the data via get_zillow_data() and
# cache it as 'zillow.csv' in the working directory.
# df = get_zillow_data()
# df.to_csv('zillow.csv', index=False)

In [3]:
# Load the cached Zillow extract. If the cache file is missing (e.g. a fresh
# checkout), build it from SQL with get_zillow_data() first so the notebook
# still runs top to bottom without manually uncommenting the acquisition code.
try:
    df = pd.read_csv('zillow.csv')
except FileNotFoundError:
    get_zillow_data().to_csv('zillow.csv', index=False)
    # Re-read from the CSV so column dtypes are the same on both code paths.
    df = pd.read_csv('zillow.csv')

In [4]:
# Collect the numeric column names of df via the project helper.
# NOTE(review): the second argument presumably lists columns to leave out
# (here the 'parcelid' identifier) — confirm in prepare_zillow.
# numeric_cols is not referenced again in the cells shown here.
numeric_cols = get_numeric_columns(df,['parcelid'])
# len(numeric_cols)

In [5]:
# Optional exploration (left disabled): count the number of unique values in
# every numeric column, e.g. to spot flag-like or ID-like columns.
# df.select_dtypes('number').nunique()

In [6]:
# Hand the flag-like / identifier-like numeric columns to change_numeric_2str
# so they are converted to string type rather than kept as quantities.
# NOTE(review): the earlier note on this cell said these columns have only
# 1 or 2 unique values; that seems unlikely for 'yearbuilt', 'fips' and
# 'regionidcounty' — confirm with nunique() and adjust the rationale.
col_list = [
    'decktypeid',
    'fips',
    'regionidcounty',
    'yearbuilt',
    'hashottuborspa',
    'fireplaceflag',
    'poolcnt',
]
df = change_numeric_2str(df, col_list)

In [7]:
# Drop sparse columns: a column is removed when 30% of its values are null.
# NOTE(review): whether the 0.3 threshold is inclusive (>=) or exclusive (>)
# is decided inside delete_missing_bycolumn — confirm there.
df = delete_missing_bycolumn(df, 0.3)

In [8]:
# Sanity check: rows and columns remaining after the sparse-column drop.
df.shape

(167854, 27)

In [9]:
# Search terms used in the next step to select property types by their
# 'propertylandusedesc' text.
text_list = ['single','condominium']

In [10]:
# Keep only the rows whose 'propertylandusedesc' matches one of the terms in
# text_list.
# NOTE(review): matching looks like a case-insensitive substring test (the
# lowercase term 'single' retains 'Single Family Residential') — confirm in
# prepare_zillow.
df = matching_strings_inrows(df, 'propertylandusedesc', text_list )

In [11]:
# Sanity check: row count after restricting to the selected property types.
df.shape

(155235, 27)

In [12]:
# Delete rows that have 10% of their values as zero, blank, or missing.
# NOTE(review): the three 0.10 arguments presumably are the per-row thresholds
# for zeros, blanks and nulls respectively — confirm the parameter order in
# prepare_zillow.delete_missing_byrow.
df = delete_missing_byrow(df, .10, .10, .10)

In [13]:
# Inspect dtypes and per-column non-null counts to see which columns still
# contain nulls after the row-level cleanup.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154466 entries, 0 to 167849
Data columns (total 27 columns):
parcelid                        154466 non-null int64
logerror                        154466 non-null float64
transactiondate                 154466 non-null object
bathroomcnt                     154466 non-null float64
bedroomcnt                      154466 non-null float64
calculatedbathnbr               154430 non-null float64
calculatedfinishedsquarefeet    154452 non-null float64
finishedsquarefeet12            154150 non-null float64
fips                            154466 non-null object
fullbathcnt                     154430 non-null float64
latitude                        154466 non-null float64
longitude                       154466 non-null float64
lotsizesquarefeet               136774 non-null float64
propertycountylandusecode       154466 non-null object
rawcensustractandblock          154466 non-null float64
regionidcity                    151652 non-null float6

In [14]:
# Fill the nulls that survived the column/row drops with blanks or zeros.
# NOTE(review): presumably blanks go to string columns and zeros to numeric
# ones — confirm in prepare_zillow.replace_nulls. Zero-filling a column such
# as lotsizesquarefeet may distort later distance-based clustering.
df = replace_nulls(df)

In [15]:
# Verify that only the intended property types remain after filtering.
df.propertylandusedesc.unique()

array(['Single Family Residential', 'Condominium'], dtype=object)

In [16]:
# Final dimensions of the cleaned frame; filling nulls should not change them.
df.shape

(154466, 27)