### Imports

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import acquire
import prepare

from acquire import acquire_data
from summerize import summarize_data
from prepare import data_prep
from set_counties import create_county_cols

## Acquire

Import the Zillow data using the `acquire_data` function from our `acquire` module.

#### On first use, `acquire_data()` queries the SQL database and creates `zillow_data.csv`; later runs read the existing csv

In [2]:
# Load the Zillow data through acquire.acquire_data().
# Per its docstring: if a local csv file already exists it is read directly;
# otherwise the data is pulled from the SQL database, written to csv, and
# then read back into a DataFrame.
df = acquire_data()

Acquiring data ...

- zillow_data.csv successfully created

Data has been acquired


In [3]:
# Print an overview of the freshly acquired frame; the printed output
# begins with an "Info" section showing df.info().
summarize_data(df)

******** Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51586 entries, 0 to 51585
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      51586 non-null  int64  
 1   logerror                      51586 non-null  float64
 2   transactiondate               51586 non-null  object 
 3   bathroomcnt                   51586 non-null  float64
 4   bedroomcnt                    51586 non-null  float64
 5   calculatedfinishedsquarefeet  51536 non-null  float64
 6   fips                          51586 non-null  float64
 7   latitude                      51586 non-null  float64
 8   longitude                     51586 non-null  float64
 9   lotsizesquarefeet             51253 non-null  float64
 10  regionidcity                  50574 non-null  float64
 11  regionidcounty                51586 non-null  float64
 12  regionidneighborhood          18655 non-null  

In [4]:
# IPython introspection: display acquire_data's signature and docstring.
# NOTE(review): development aid only; consider removing from the final notebook.
acquire_data?

[0;31mSignature:[0m [0macquire_data[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Returns a dataframe:

- acquire_data()
    - call generate_csv function to generate a csv file of the SQL data
    - reads the csv to a dataframe and returns it

- generate_csv()
    - calls csv_exit to determines if a local csv file exist
        - if a csv file exist, prints a message
        - if a csv file does not exist
            - calls get_data_from_zillow()
            - writes dataframe to csv
            
- csv_exist()
    - uses os.path.isfile
    - returns a boolean 

- get_data_from_zillow()
    - acquires data from SQL database
    - return to generate_csv as a dataframe
[0;31mFile:[0m      ~/codeup-data-science/ds-methodologies-exercises/clustering/Zillow-Clustering/acquire.py
[0;31mType:[0m      function


In [5]:
# Apply the project's preparation step (data_prep from prepare.py).
# NOTE(review): this rebinds `df`, so the raw frame is no longer reachable and
# re-running the cell feeds already-prepared data back into data_prep —
# confirm data_prep is safe to apply twice.
df = data_prep(df)

In [6]:
# Preview the prepared frame (pandas shows only the first and last rows).
df

Unnamed: 0,parcelid,logerror,transactiondate,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,regionidcity,regionidcounty,regionidzip,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount
0,14297519,0.025595,2017-01-01,3.5,4.0,3100.0,6059.0,33634931.0,-117869207.0,4506.0,53571.0,1286.0,96978.0,1998.0,485713.0,1023282.0,537569.0,11013.72
1,17052889,0.055619,2017-01-01,1.0,2.0,1465.0,6111.0,34449266.0,-119281531.0,12647.0,13091.0,2061.0,97099.0,1967.0,88000.0,464000.0,376000.0,5672.48
2,14186244,0.005383,2017-01-01,2.0,3.0,1243.0,6059.0,33886168.0,-117823170.0,8432.0,21412.0,1286.0,97078.0,1962.0,85289.0,564778.0,479489.0,6488.30
3,12177905,-0.103410,2017-01-01,3.0,4.0,2376.0,6037.0,34245180.0,-118240722.0,13038.0,396551.0,3101.0,96330.0,1970.0,108918.0,145143.0,36225.0,1777.51
4,12095076,-0.001011,2017-01-01,3.0,4.0,2962.0,6037.0,34145202.0,-118179824.0,63000.0,47019.0,3101.0,96293.0,1950.0,276684.0,773303.0,496619.0,9516.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51581,11000655,0.020615,2017-09-20,2.0,2.0,1286.0,6037.0,34245368.0,-118282383.0,47405.0,12447.0,3101.0,96284.0,1940.0,70917.0,354621.0,283704.0,4478.43
51582,17239384,0.013209,2017-09-21,2.0,4.0,1612.0,6111.0,34300140.0,-118706327.0,12105.0,27110.0,2061.0,97116.0,1964.0,50683.0,67205.0,16522.0,1107.48
51583,12773139,0.037129,2017-09-21,1.0,3.0,1032.0,6037.0,34040895.0,-118038169.0,5074.0,36502.0,3101.0,96480.0,1954.0,32797.0,49546.0,16749.0,876.43
51584,12826780,0.007204,2017-09-25,2.0,3.0,1762.0,6037.0,33937685.0,-117996709.0,6347.0,14634.0,3101.0,96171.0,1955.0,140000.0,522000.0,382000.0,6317.15


In [7]:
# Cross-tabulate the FIPS code against the county region id to check how the
# two identifiers line up with each other.
pd.crosstab(df["fips"], df["regionidcounty"])

regionidcounty,1286.0,2061.0,3101.0
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6037.0,0,0,33315
6059.0,13912,0,0
6111.0,0,4359,0


In [8]:
# Swap the county identifiers for per-county indicator columns.
# In the displayed result, `fips` and `regionidcounty` no longer appear and
# 0/1 columns `LA`, `Orange` and `Ventura` have been added.
# NOTE(review): `df` is rebound here, so re-running this cell passes a frame
# without `fips`/`regionidcounty` back in — confirm create_county_cols copes.
df = create_county_cols(df)
df

Unnamed: 0,parcelid,logerror,transactiondate,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,latitude,longitude,lotsizesquarefeet,regionidcity,regionidzip,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,LA,Orange,Ventura
0,14297519,0.025595,2017-01-01,3.5,4.0,3100.0,33634931.0,-117869207.0,4506.0,53571.0,96978.0,1998.0,485713.0,1023282.0,537569.0,11013.72,0,1,0
1,17052889,0.055619,2017-01-01,1.0,2.0,1465.0,34449266.0,-119281531.0,12647.0,13091.0,97099.0,1967.0,88000.0,464000.0,376000.0,5672.48,0,0,1
2,14186244,0.005383,2017-01-01,2.0,3.0,1243.0,33886168.0,-117823170.0,8432.0,21412.0,97078.0,1962.0,85289.0,564778.0,479489.0,6488.30,0,1,0
3,12177905,-0.103410,2017-01-01,3.0,4.0,2376.0,34245180.0,-118240722.0,13038.0,396551.0,96330.0,1970.0,108918.0,145143.0,36225.0,1777.51,1,0,0
4,12095076,-0.001011,2017-01-01,3.0,4.0,2962.0,34145202.0,-118179824.0,63000.0,47019.0,96293.0,1950.0,276684.0,773303.0,496619.0,9516.26,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51581,11000655,0.020615,2017-09-20,2.0,2.0,1286.0,34245368.0,-118282383.0,47405.0,12447.0,96284.0,1940.0,70917.0,354621.0,283704.0,4478.43,1,0,0
51582,17239384,0.013209,2017-09-21,2.0,4.0,1612.0,34300140.0,-118706327.0,12105.0,27110.0,97116.0,1964.0,50683.0,67205.0,16522.0,1107.48,0,0,1
51583,12773139,0.037129,2017-09-21,1.0,3.0,1032.0,34040895.0,-118038169.0,5074.0,36502.0,96480.0,1954.0,32797.0,49546.0,16749.0,876.43,1,0,0
51584,12826780,0.007204,2017-09-25,2.0,3.0,1762.0,33937685.0,-117996709.0,6347.0,14634.0,96171.0,1955.0,140000.0,522000.0,382000.0,6317.15,1,0,0


In [9]:
# Count the missing values in each column to see what still needs handling.
df.isna().sum()

parcelid                           0
logerror                           0
transactiondate                    0
bathroomcnt                        0
bedroomcnt                         0
calculatedfinishedsquarefeet      50
latitude                           0
longitude                          0
lotsizesquarefeet                333
regionidcity                    1012
regionidzip                       21
yearbuilt                         84
structuretaxvaluedollarcnt         0
taxvaluedollarcnt                  0
landtaxvaluedollarcnt              0
taxamount                          3
LA                                 0
Orange                             0
Ventura                            0
dtype: int64