In [2]:
import os
import pandas as pd
import numpy as np

import wrangle

### Config variables

In order to make changing the behavior of the script as quick and reusable as possible, all of the variables specific to this dataset are placed up top for easy access.  
These variables are declared with ALL CAPS as a subtle identifier of a config or global variable.

In [3]:
# Local files: the env file holding database credentials, and the CSV used to cache the query result.
ENVFILE = './env.py'
CSV = './data.csv'

# Default random seed for the train/validate/test split.
SEED = 8

# Database to connect to and the query that selects the columns used in this notebook.
# NOTE(review): propertylandusetypeid = 261 is presumably the single-family-residential code -- confirm
# against the propertylandusetype table.
DB = 'zillow'
QUERY = """
SELECT 
    id,
    bedroomcnt,
    bathroomcnt,
    calculatedfinishedsquarefeet,
    taxvaluedollarcnt,
    yearbuilt,
    taxamount,
    fips
FROM
    zillow.properties_2017
        JOIN
    propertylandusetype USING (propertylandusetypeid)
WHERE
    propertylandusetypeid = 261
"""

### SQL Acquisition
With the config variables in place, I can use some pre-written functions to handle acquiring the data and caching it.

In [4]:
def get_db_url(database, hostname='', username='', password='', env=''):
    '''Creates a URL for a specific database and credential set to be used with pymysql.

    Can be used either with a set of credentials passed directly to the function or with an environment file containing the credentials.
    If both are provided, the environment file takes precedence.

    The environment file is read as plain text, one `key=value` pair per line.
    Blank lines, lines starting with `#`, and lines without an `=` are ignored.
    Whitespace around keys and values is stripped, and single/double quote
    characters are removed from values.

    Parameters:
    database (str): name of the database to connect to
    hostname (str): server host, used when no env file is given
    username (str): login user, used when no env file is given
    password (str): login password, used when no env file is given
    env (str): path to an environment file defining `username`, `hostname` and `password`

    Returns:
    str: Full URL for use with a pymysql connection

    Raises:
    KeyError: if the env file does not define `username`, `hostname` or `password`
    OSError: if the env file cannot be opened
    '''
    if env != '':
        creds = {}
        # The context manager closes the file even if parsing raises.
        with open(env) as file:
            for line in file:
                line = line.strip()
                # Tolerate blank lines, comments and anything that is not an assignment.
                if not line or line.startswith('#') or '=' not in line:
                    continue
                # Split on the first '=' only, so values that contain '=' stay intact.
                key, _, value = line.partition('=')
                creds[key.strip()] = value.strip().replace("'", '').replace('"', '')
        username = creds['username']
        hostname = creds['hostname']
        password = creds['password']
    return f'mysql+pymysql://{username}:{password}@{hostname}/{database}'

def new_data():
    """Fetch a fresh copy of the data from the SQL server.

    Builds the connection URL from the DB and ENVFILE config values,
    runs QUERY against it, and returns the result.

    Returns:
    DataFrame: the rows returned by QUERY
    """
    return pd.read_sql(QUERY, get_db_url(DB, env=ENVFILE))

def get_data():
    """Return the raw, uncleaned dataset, using the file named by CSV as a local cache.

    If the CSV file exists it is read and returned.
    Otherwise the data is fetched with new_data(), written to CSV, and returned.

    Returns:
    DataFrame: the uncleaned data
    """
    cache_path = CSV

    # Cache miss: pull from the SQL database and save a copy to disk for next time.
    if not os.path.isfile(cache_path):
        fresh = new_data()
        fresh.to_csv(cache_path)
        return fresh

    # Cache hit: the first CSV column is the index that to_csv wrote out.
    return pd.read_csv(cache_path, index_col=0)

In [5]:
# Load the raw data with the get_data() defined in this notebook and preview the first rows.
data = get_data()
data.head()

Unnamed: 0,id,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,1,0.0,0.0,,27516.0,,,6037.0
1,15,0.0,0.0,,10.0,,,6037.0
2,16,0.0,0.0,,10.0,,,6037.0
3,17,0.0,0.0,,2108.0,,174.21,6037.0
4,20,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


In [6]:
# Load through the wrangle module's get_data() and preview it; the cells below work on `df`.
df = wrangle.get_data()
df.head()

Unnamed: 0,id,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,1,0.0,0.0,,27516.0,,,6037.0
1,15,0.0,0.0,,10.0,,,6037.0
2,16,0.0,0.0,,10.0,,,6037.0
3,17,0.0,0.0,,2108.0,,174.21,6037.0
4,20,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


### Data investigation

In [7]:
# Full per-column summary, including the non-null count of every column.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2152863 entries, 0 to 2152862
Data columns (total 8 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   id                            2152863 non-null  int64  
 1   bedroomcnt                    2152852 non-null  float64
 2   bathroomcnt                   2152852 non-null  float64
 3   calculatedfinishedsquarefeet  2144379 non-null  float64
 4   taxvaluedollarcnt             2152370 non-null  float64
 5   yearbuilt                     2143526 non-null  float64
 6   taxamount                     2148421 non-null  float64
 7   fips                          2152863 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 147.8 MB


In [8]:
# Per column: does it contain at least one missing value?

df.isna().any()


id                              False
bedroomcnt                       True
bathroomcnt                      True
calculatedfinishedsquarefeet     True
taxvaluedollarcnt                True
yearbuilt                        True
taxamount                        True
fips                            False
dtype: bool

In [9]:
# Names of the columns that contain at least one missing value.

nul_cols = df.columns[df.isna().any()]


In [10]:
# Count of missing values in the fourth column that has nulls (only the NaN bucket is shown).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df[nul_cols[3]].value_counts(dropna=False).loc[[np.nan]]


NaN    493
Name: taxvaluedollarcnt, dtype: int64

In [11]:
# Number of missing values in each column.
df.isna().sum()

id                                 0
bedroomcnt                        11
bathroomcnt                       11
calculatedfinishedsquarefeet    8484
taxvaluedollarcnt                493
yearbuilt                       9337
taxamount                       4442
fips                               0
dtype: int64

In [12]:
# Shape of the rows that have at least one missing value, next to the shape of the whole frame.
df.loc[df.isna().any(axis=1)].shape, df.shape

((12628, 8), (2152863, 8))

In [13]:
# Fraction of rows that have at least one missing value.
len(df.loc[df.isna().any(axis=1)]) / len(df)

0.005865677472277613

The number of rows with missing values is very small compared to the total dataset (about 0.6%).  
I believe it's safe to just drop these rows.

In [14]:
# Drop every row that has at least one missing value, then confirm via the non-null counts.
# Note: this rebinds `df`, so the unfiltered frame is no longer available under that name.
df = df.dropna()
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2140235 entries, 4 to 2152862
Data columns (total 8 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   id                            2140235 non-null  int64  
 1   bedroomcnt                    2140235 non-null  float64
 2   bathroomcnt                   2140235 non-null  float64
 3   calculatedfinishedsquarefeet  2140235 non-null  float64
 4   taxvaluedollarcnt             2140235 non-null  float64
 5   yearbuilt                     2140235 non-null  float64
 6   taxamount                     2140235 non-null  float64
 7   fips                          2140235 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 147.0 MB


A lot of these are floats.  I'm sure some can be converted to int.

In [15]:
# Let pandas pick the best-fitting dtype for each column; with the nulls gone,
# float columns holding only whole numbers can become integer columns.
df = df.convert_dtypes()
df.head()

Unnamed: 0,id,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
4,20,4,2.0,3633,296425,2005,6941.39,6037
6,31,3,4.0,1620,847770,2011,10244.94,6037
7,33,3,2.0,2077,646760,1926,7924.68,6037
11,62,0,0.0,1200,5328,1972,91.6,6037
14,97,0,0.0,171,6920,1973,255.17,6037


In [16]:
def get_data_dropna():
    """Load the data, drop rows with missing values, and convert the column dtypes.

    Returns:
    DataFrame: result of get_data() after dropna() and convert_dtypes()
    """
    return get_data().dropna().convert_dtypes()

In [17]:
# Rebuild `df` through the helper and check the resulting dtypes.
df = get_data_dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2140235 entries, 4 to 2152862
Data columns (total 8 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   id                            Int64  
 1   bedroomcnt                    Int64  
 2   bathroomcnt                   Float64
 3   calculatedfinishedsquarefeet  Int64  
 4   taxvaluedollarcnt             Int64  
 5   yearbuilt                     Int64  
 6   taxamount                     Float64
 7   fips                          Int64  
dtypes: Float64(2), Int64(6)
memory usage: 163.3 MB


### Train Test Validate split

In [18]:
from sklearn.model_selection import train_test_split
## Generic split data function
def train_validate_test_split(df, seed=SEED, stratify=None):
    """Splits data 60%/20%/20% into train, test and validate sets.

    Parameters:
    df (DataFrame): data to split
    seed (int): random_state passed to both splits, defaults to SEED
    stratify (str): optional name of a column in df to stratify both splits on

    Returns:
    tuple: (train, test, validate), in that order
    """
    # Hold out 2/5 of the rows so that train keeps the remaining 60%.
    # (A test_size of 3/5 here would leave train with only 40%.)
    train, test_validate = train_test_split(
        df,
        test_size=2/5,
        random_state=seed,
        stratify=(df[stratify] if stratify else None)
    )
    # Halve the held-out 40% into test and validate: 20% of the original each.
    test, validate = train_test_split(
        test_validate,
        test_size=1/2,
        random_state=seed,
        stratify=(test_validate[stratify] if stratify else None)
    )
    return train, test, validate

Test the `wrangle_zillow` function from the `wrangle` module.

In [19]:
# Run the wrangle module's full pipeline and inspect the training split.
# NOTE(review): in the saved output train has 856,094 of the 2,140,235 cleaned rows (~40%),
# not the 60% a 60/20/20 split should give -- check the test_size used inside wrangle.
train, test, validate = wrangle.wrangle_zillow()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 856094 entries, 1219155 to 1652606
Data columns (total 8 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   id                            856094 non-null  Int64  
 1   bedroomcnt                    856094 non-null  Int64  
 2   bathroomcnt                   856094 non-null  Float64
 3   calculatedfinishedsquarefeet  856094 non-null  Int64  
 4   taxvaluedollarcnt             856094 non-null  Int64  
 5   yearbuilt                     856094 non-null  Int64  
 6   taxamount                     856094 non-null  Float64
 7   fips                          856094 non-null  Int64  
dtypes: Float64(2), Int64(6)
memory usage: 65.3 MB


In [22]:
# Reload the cleaned data and store fips as a generic object column instead of an integer.
# NOTE(review): presumably because fips is an identifying code rather than a quantity -- confirm.
df = get_data_dropna()
df['fips'] = df['fips'].astype('object')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2140235 entries, 4 to 2152862
Data columns (total 8 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   id                            Int64  
 1   bedroomcnt                    Int64  
 2   bathroomcnt                   Float64
 3   calculatedfinishedsquarefeet  Int64  
 4   taxvaluedollarcnt             Int64  
 5   yearbuilt                     Int64  
 6   taxamount                     Float64
 7   fips                          object 
dtypes: Float64(2), Int64(5), object(1)
memory usage: 161.2+ MB
