In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import env

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [43]:
# from our acquire.py:
def get_connection(db, user=env.user, host=env.host, password=env.password):
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'
    
def get_zillow_data():
    df = pd.read_sql("""
SELECT prop.*, 
       pred.logerror, 
       pred.transactiondate, 
       air.airconditioningdesc, 
       arch.architecturalstyledesc, 
       build.buildingclassdesc, 
       heat.heatingorsystemdesc, 
       landuse.propertylandusedesc, 
       story.storydesc, 
       construct.typeconstructiondesc 
FROM   properties_2017 prop  
       INNER JOIN (SELECT parcelid,
                   Max(transactiondate) transactiondate 
                   FROM   predictions_2017 
  
                   GROUP  BY parcelid) pred
               USING (parcelid)
               			JOIN predictions_2017 as pred USING (parcelid, transactiondate)
       LEFT JOIN airconditioningtype air USING (airconditioningtypeid) 
       LEFT JOIN architecturalstyletype arch USING (architecturalstyletypeid) 
       LEFT JOIN buildingclasstype build USING (buildingclasstypeid) 
       LEFT JOIN heatingorsystemtype heat USING (heatingorsystemtypeid) 
       LEFT JOIN propertylandusetype landuse USING (propertylandusetypeid) 
       LEFT JOIN storytype story USING (storytypeid) 
       LEFT JOIN typeconstructiontype construct USING (typeconstructiontypeid) 
WHERE  prop.latitude IS NOT NULL 
       AND prop.longitude IS NOT NULL
""", get_connection('zillow'))
    return df

In [44]:
df = get_zillow_data()

In [45]:
df.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,1727539,14297519,,,,3.5,4.0,,,3.5,...,60590630000000.0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,1387261,17052889,,,,1.0,2.0,,,1.0,...,61110010000000.0,0.055619,2017-01-01,,,,,Single Family Residential,,
2,11677,14186244,,,,2.0,3.0,,,2.0,...,60590220000000.0,0.005383,2017-01-01,,,,,Single Family Residential,,
3,2288172,12177905,,,,3.0,4.0,,8.0,3.0,...,60373000000000.0,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,1970746,10887214,1.0,,,3.0,3.0,,8.0,3.0,...,60371240000000.0,0.00694,2017-01-01,Central,,,Central,Condominium,,


In [46]:
def handle_missing_values(df, prop_required_column = .5, prop_required_row = .70):
	#function that will drop rows or columns based on the percent of values that are missing:\
	#handle_missing_values(df, prop_required_column, prop_required_row
    threshold = int(round(prop_required_column*len(df.index),0))
    df = df.dropna(axis=1, thresh=threshold)
    threshold = int(round(prop_required_row*len(df.columns),0))
    df.dropna(axis=0, thresh=threshold, inplace=True)
    return df

In [47]:
def remove_columns(df, cols_to_remove):  
	#remove columns not needed
    df = df.drop(columns=cols_to_remove)
    return df

In [48]:
def wrangle_zillow():
    if os.path.isfile('zillow_cached.csv') == False:
        df = get_zillow(sql)
        df.to_csv('zillow_cached.csv',index = False)
    else:
        df = pd.read_csv('zillow_cached.csv')

In [23]:
def nulls_by_col(df):
    num_missing = df.isnull().sum()
    rows = df.shape[0]
    prcnt_miss = num_missing / rows * 100
    cols_missing = pd.DataFrame({'num_rows_missing': num_missing, 'percent_rows_missing': prcnt_miss})
    return cols_missing

In [None]:
nulls_by_col(df)

In [25]:
def nulls_by_row(df):
    num_missing = df.isnull().sum(axis=1)
    prcnt_miss = num_missing / df.shape[1] * 100
    rows_missing = pd.DataFrame({'num_cols_missing': num_missing, 'percent_cols_missing': prcnt_miss})\
    .reset_index()\
    .groupby(['num_cols_missing', 'percent_cols_missing']).count()\
    .rename(index=str, columns={'customer_id': 'num_rows'}).reset_index()
    return rows_missing

In [26]:
nulls_by_row(df)

Unnamed: 0,num_cols_missing,percent_cols_missing,index
0,24,34.78260869565217,12
1,25,36.231884057971016,22
2,26,37.68115942028986,46
3,27,39.130434782608695,161
4,28,40.57971014492754,348
5,29,42.028985507246375,4311
6,30,43.47826086956522,2852
7,31,44.927536231884055,8979
8,32,46.3768115942029,11547
9,33,47.82608695652174,15183


In [None]:
def summarize(df):
    '''
    summarize will take in a single argument (a pandas dataframe) 
    and output to console various statistics on said dataframe, including:
    # .head()
    # .info()
    # .describe()
    # value_counts()
    # observation of nulls in the dataframe
    '''
    print('=====================================================\n\n')
    print('Dataframe head: ')
    print(df.head(3).to_markdown())
    print('=====================================================\n\n')
    print('Dataframe info: ')
    print(df.info())
    print('=====================================================\n\n')
    print('Dataframe Description: ')
    print(df.describe().to_markdown())
    num_cols = [col for col in df.columns if df[col].dtype != 'O']
    cat_cols = [col for col in df.columns if col not in num_cols]
    print('=====================================================')
    print('DataFrame value counts: ')
    for col in df.columns:
        if col in cat_cols:
            print(df[col].value_counts())
        else:
            print(df[col].value_counts(bins=10, sort=False))
    print('=====================================================')
    print('nulls in dataframe by column: ')
    print(nulls_by_col(df))
    print('=====================================================')
    print('nulls in dataframe by row: ')
    print(nulls_by_row(df))
    print('=====================================================')

In [28]:
summarize(df)



Dataframe head: 
|    |   id |   parcelid |   airconditioningtypeid |   architecturalstyletypeid |   basementsqft |   bathroomcnt |   bedroomcnt |   buildingclasstypeid |   buildingqualitytypeid |   calculatedbathnbr |   decktypeid |   finishedfloor1squarefeet |   calculatedfinishedsquarefeet |   finishedsquarefeet12 |   finishedsquarefeet13 |   finishedsquarefeet15 |   finishedsquarefeet50 |   finishedsquarefeet6 |   fips |   fireplacecnt |   fullbathcnt |   garagecarcnt |   garagetotalsqft |   hashottuborspa |   heatingorsystemtypeid |    latitude |    longitude |   lotsizesquarefeet |   poolcnt |   poolsizesum |   pooltypeid10 |   pooltypeid2 |   pooltypeid7 | propertycountylandusecode   |   propertylandusetypeid | propertyzoningdesc   |   rawcensustractandblock |   regionidcity |   regionidcounty |   regionidneighborhood |   regionidzip |   roomcnt |   storytypeid |   threequarterbathnbr |   typeconstructiontypeid |   unitcnt |   yardbuildingsqft17 |   yardbuildingsqft26 |   year

|       |      id |        parcelid |   airconditioningtypeid |   architecturalstyletypeid |   basementsqft |   bathroomcnt |   bedroomcnt |   buildingclasstypeid |   buildingqualitytypeid |   calculatedbathnbr |   decktypeid |   finishedfloor1squarefeet |   calculatedfinishedsquarefeet |   finishedsquarefeet12 |   finishedsquarefeet13 |   finishedsquarefeet15 |   finishedsquarefeet50 |   finishedsquarefeet6 |      fips |   fireplacecnt |   fullbathcnt |   garagecarcnt |   garagetotalsqft |   hashottuborspa |   heatingorsystemtypeid |         latitude |        longitude |   lotsizesquarefeet |   poolcnt |   poolsizesum |   pooltypeid10 |   pooltypeid2 |   pooltypeid7 |   propertylandusetypeid |   rawcensustractandblock |   regionidcity |   regionidcounty |   regionidneighborhood |   regionidzip |     roomcnt |   storytypeid |   threequarterbathnbr |   typeconstructiontypeid |     unitcnt |   yardbuildingsqft17 |   yardbuildingsqft26 |   yearbuilt |   numberofstories |   fireplaceflag |

LAR1          7457
LAR3          1746
LARS          1580
LBR1N         1315
LAR2          1204
              ... 
LCR2-1           1
LCRA7500E*       1
DOR1500*         1
BFM1*            1
DUR1A*           1
Name: propertyzoningdesc, Length: 2039, dtype: int64
(60370272.019999996, 60444919.091]    52234
(60444919.091, 60518827.081]              0
(60518827.081, 60592735.072]          18808
(60592735.072, 60666643.062]              0
(60666643.062, 60740551.052]              0
(60740551.052, 60814459.042]              0
(60814459.042, 60888367.032]              0
(60888367.032, 60962275.023]              0
(60962275.023, 61036183.013]              0
(61036183.013, 61110091.003]           6571
Name: rawcensustractandblock, dtype: int64
(3097.9339999999997, 42797.5]    55435
(42797.5, 82104.0]               17940
(82104.0, 121410.5]               1330
(121410.5, 160717.0]                 0
(160717.0, 200023.5]                 0
(200023.5, 239330.0]                 0
(239330.0, 278636.5] 

                          num_rows_missing  percent_rows_missing
id                                       0              0.000000
parcelid                                 0              0.000000
airconditioningtypeid                56157             72.355147
architecturalstyletypeid             77462             99.805445
basementsqft                         77568             99.942020
...                                    ...                   ...
transactiondate                          0              0.000000
last_trans_date                          0              0.000000
propertylandusedesc                      0              0.000000
storydesc                            77568             99.942020
typeconstructiondesc                 77447             99.786118

[69 rows x 2 columns]
nulls in dataframe by row: 
   num_cols_missing percent_cols_missing  index
0                24    34.78260869565217     12
1                25   36.231884057971016     22
2                26    37