In [1]:
from env import get_db_url
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
import os



In [2]:
    # Ad-hoc pull of the modelling columns from properties_2017, limited to rows whose
    # propertylandusetypeid is 261. The same query is wrapped in new_zillow_data() below.
    # NOTE(review): get_db_url comes from the local env module; presumably it returns a
    # connection URL for the named database — confirm.
    conn = get_db_url('zillow')

    query = '''
            SELECT bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt, taxamount, fips
            FROM properties_2017
            WHERE propertylandusetypeid = 261;  
            '''

    
    # Run the query and load the full result set into a DataFrame.
    df = pd.read_sql(query, conn)

KeyboardInterrupt: 

In [None]:
# Check which dtype pandas assigned to each selected column.
df.dtypes

In [None]:
# Treat empty / whitespace-only string cells as missing values.
df = df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# keep=False removes *every* row that has an identical twin, so a duplicated record
# disappears from the data entirely. keep='first' retains one copy of each duplicated row.
df = df.drop_duplicates(keep='first')


In [None]:
# Edges of the tax-value bands used to bucket taxvaluedollarcnt.
bin_cat = [
    0,
    500_000,
    1_000_000,
    1_500_000,
    2_000_000,
    5_000_000,
    10_000_000,
    25_000_000,
    50_000_000,
    75_000_000,
    100_000_000,
]

# labels=False stores the integer position of each row's band rather than an interval label.
df['value_cat'] = pd.cut(df['taxvaluedollarcnt'], bins=bin_cat, labels=False)

In [None]:
# Inspect the frame after the cleaning steps and the new value_cat column.
df

## Acquire

In [None]:
def new_zillow_data():
    """
    Query the zillow database and return the result as a DataFrame.

    The query selects bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet,
    taxvaluedollarcnt, yearbuilt, taxamount and fips from properties_2017,
    keeping only rows whose propertylandusetypeid is 261.

    Every call goes to the database; nothing is cached here.

    Returns:
        pandas.DataFrame: one row per record returned by the query.
    """
    sql = '''
            SELECT bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt, taxamount, fips
            FROM properties_2017
            WHERE propertylandusetypeid = 261;  
            '''

    # get_db_url is provided by the local env module.
    db_url = get_db_url('zillow')

    return pd.read_sql(sql, db_url)

In [None]:
def get_zillow_data():
    """
    Return the Zillow data, using a local CSV cache when one exists.

    If 'zillow_df.csv' is present in the working directory it is read back
    (its first column is the saved index). Otherwise the data is fetched with
    new_zillow_data() and written to 'zillow_df.csv' for the next call.

    Returns:
        pandas.DataFrame: the cached or freshly queried data.
    """
    if os.path.isfile('zillow_df.csv'):
        df = pd.read_csv('zillow_df.csv', index_col = 0)
    else:
        df = new_zillow_data()
        df.to_csv('zillow_df.csv')

    # Return from both branches: previously the return sat inside the else,
    # so a cache hit loaded the CSV and then returned None.
    return df

## Prep

##### Split

In [None]:
def split_zillow_data(df):
    """
    Split a DataFrame into train, validate and test subsets.

    20% of the rows are held out as test; the remaining 80% is split 70/30
    into train and validate (about 56% / 24% / 20% of the original rows).
    Both splits use random_state=123, so the result is repeatable.

    Parameters:
        df: the DataFrame to split.

    Returns:
        tuple: (train, validate, test)
    """
    seed = 123

    # First carve off the test set, then divide what is left.
    remainder, test = train_test_split(df, test_size=.2, random_state=seed)
    train, validate = train_test_split(remainder, test_size=.3, random_state=seed)

    return train, validate, test


#### clean

In [None]:
def prep_zillow_data(df):
    """
    Clean the raw Zillow frame, add a binned value column, and split it.

    Steps:
        1. Replace empty / whitespace-only strings with NaN.
        2. Drop rows that contain any NaN.
        3. Rename the raw columns to friendlier names.
        4. Add 'value_cat', the integer bin index of 'tax_value'.
        5. Split into train / validate / test with split_zillow_data.

    Parameters:
        df: DataFrame with the raw column names (bedroomcnt, bathroomcnt,
            calculatedfinishedsquarefeet, taxvaluedollarcnt, fips, ...).
            The caller's frame is not modified.

    Returns:
        tuple: (train, validate, test)
    """
    # replace() and dropna() return new frames rather than modifying df, so the
    # results must be assigned; previously both calls were silently discarded.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = df.dropna()

    new_columns = {
        'bedroomcnt': 'bedroom_count',
        'bathroomcnt': 'bathroom_count',
        'calculatedfinishedsquarefeet': 'calc_sqr_feet',
        'taxvaluedollarcnt': 'tax_value',
        'fips': 'county_code'
    }
    df = df.rename(columns=new_columns)

    # labels=False stores the integer position of the bin; values that fall
    # outside the listed edges get NaN in value_cat.
    bin_cat = [0, 500000, 1000000, 1500000, 2000000, 5000000, 10000000, 25000000, 50000000, 75000000, 100000000]
    df['value_cat'] = pd.cut(df['tax_value'], bins=bin_cat, labels=False)

    train, validate, test = split_zillow_data(df)

    return train, validate, test


In [None]:
# Draw ONE seeded 25% sample and reuse it for every panel. Previously the histogram
# and the box plot each drew their own unseeded sample, so the two panels for a
# column described different rows and changed on every run.
plot_sample = df.sample(frac=0.25, random_state=123)

for column in df.columns:
    plt.figure(figsize=(10, 3))

    # Histogram (left of two panels; the old 1x3 grid left a third slot empty)
    # NOTE(review): binwidth=1 suits count-like columns; for wide-ranging columns
    # such as taxvaluedollarcnt it implies a very large number of bins — confirm.
    plt.subplot(1, 2, 1)
    sns.histplot(data=plot_sample, x=column, binwidth=1)
    plt.title(f'Histogram of {column}')

    # Box plot (right panel)
    plt.subplot(1, 2, 2)
    sns.boxplot(data=plot_sample, x=column)
    plt.title(f'Box Plot of {column}')

    plt.show()


Outliers

- keep:
  - bedroom count: 3-5
  - bathroom count: 2-4
  - square feet: less than 9,000
  - year built: after 1900


In [None]:
# Keep only rows inside the ranges listed in the outlier note above.
# - `x < 5 & > 3` is not valid Python; Series.between expresses the range check and
#   is inclusive on both ends, matching the "3-5" / "2-4" wording of the note.
# - The notebook-level df still has the raw column names (the renaming happens only
#   inside prep_zillow_data), so the raw names are used here.
df = df[df['bedroomcnt'].between(3, 5) &
        df['bathroomcnt'].between(2, 4) &
        (df['calculatedfinishedsquarefeet'] < 9000) &
        (df['yearbuilt'] > 1900)
       ]


### all in one

In [None]:
def wrangle_zillow():
    """
    Acquire and prepare the Zillow data in one call.

    Fetches the data with new_zillow_data() (a fresh database query each time)
    and passes it through prep_zillow_data().

    Returns:
        tuple: (train, validate, test) as produced by prep_zillow_data.
    """
    return prep_zillow_data(new_zillow_data())

### trying it out

In [None]:
# Pull a fresh copy from the database (new_zillow_data does not use the CSV cache).
df = new_zillow_data()

In [None]:
# Inspect the freshly queried frame.
df


In [None]:
# Run the prep step on the frame loaded above and unpack the three splits it returns.
train, validate, test = prep_zillow_data(df)

In [None]:
# Inspect the training split returned by prep_zillow_data.
train

In [None]:
# Same pipeline in a single call; note that wrangle_zillow queries the database again.
train, validate, test = wrangle_zillow()

In [None]:
# Inspect the training split returned by wrangle_zillow.
train