In [5]:
import pandas as pd
from env import host, user, password
import seaborn as sns
import acquire
import prepare
import numpy as np
import matplotlib.pyplot as plt
import scipy
from sklearn.metrics import accuracy_score, precision_score, recall_score,  classification_report
import warnings
# NOTE(review): with no category/module filter this silences every warning for
# the rest of the session, including deprecation notices from the libraries above.
warnings.filterwarnings("ignore")

In [6]:
def get_db_url(dbname) -> str:
    '''
    Build the connection URL for a database on the configured MySQL server.

    Parameters
    ----------
    dbname : str
        Name of the database to connect to.

    Returns
    -------
    str
        A URL of the form 'mysql+pymysql://user:password@host/dbname', filled in
        with the `user`, `password` and `host` values imported from env.
        The user name and password are percent-encoded.
    '''
    # Function-scope import so this cell does not depend on the import cell.
    from urllib.parse import quote

    # Credentials are percent-encoded because a raw '@', ':', '/' or '%' in the
    # user name or password would be read as a URL delimiter and the URL would
    # be parsed incorrectly. safe='' makes quote() escape '/' as well.
    # Plain alphanumeric credentials come out unchanged.
    safe_user = quote(str(user), safe='')
    safe_password = quote(str(password), safe='')

    url = 'mysql+pymysql://{}:{}@{}/{}'
    return url.format(safe_user, safe_password, host, dbname)

In [17]:
def get_df():
    '''
    Run the property/transaction query against the 'zillow' database and
    return the result as a pandas DataFrame.

    The query selects parcel id, land-use type, finished square feet, tax
    amount, tax value, bedroom and bathroom counts, transaction date and fips
    from properties_2017 left-joined to predictions_2017 on parcelid. Rows are
    limited to propertylandusetypeid 261 (presumably single-family residential;
    confirm against the land-use lookup table) with a transaction date from
    2017-05-01 through 2017-06-30 inclusive.

    No cleaning is done here: missing values come back as they are stored.
    '''
    # Because the WHERE clause filters on transactiondate, properties with no
    # matching prediction row (NULL date) are dropped despite the LEFT JOIN.
    zillow_sql = '''
    SELECT properties_2017.parcelid, propertylandusetypeid, calculatedfinishedsquarefeet, taxamount, taxvaluedollarcnt, bedroomcnt, bathroomcnt, predictions_2017.transactiondate, fips
    FROM properties_2017
    LEFT JOIN predictions_2017 ON properties_2017.parcelid=predictions_2017.parcelid
    WHERE propertylandusetypeid = 261 
	AND (transactiondate >= '2017-05-01' AND transactiondate <= '2017-06-30')
    '''
    connection_url = get_db_url('zillow')
    return pd.read_sql(zillow_sql, connection_url)


In [18]:
# Run the query once and keep the resulting frame for the cells below.
df = get_df()

In [19]:
# Show the frame as the cell output.
# NOTE(review): `df.head()` or `df.sample(5)` would keep the rendered output small.
df

Unnamed: 0,parcelid,propertylandusetypeid,calculatedfinishedsquarefeet,taxamount,taxvaluedollarcnt,bedroomcnt,bathroomcnt,transactiondate,fips
0,11289917,261.0,1458.0,2319.90,136104.0,3.0,2.0,2017-06-23,6037.0
1,11705026,261.0,1421.0,543.69,35606.0,2.0,1.0,2017-06-30,6037.0
2,14269464,261.0,2541.0,9819.72,880456.0,4.0,3.0,2017-06-01,6059.0
3,11389003,261.0,1650.0,7673.19,614000.0,3.0,2.0,2017-06-01,6037.0
4,11967869,261.0,693.0,3267.47,274237.0,2.0,1.0,2017-06-29,6037.0
...,...,...,...,...,...,...,...,...,...
15031,14339902,261.0,2526.0,5718.72,458903.0,5.0,3.0,2017-06-30,6059.0
15032,13940564,261.0,2735.0,1465.88,115387.0,5.0,3.0,2017-06-30,6059.0
15033,12945108,261.0,1536.0,3519.78,297097.0,3.0,2.0,2017-06-30,6037.0
15034,14214719,261.0,2655.0,8065.50,746963.0,5.0,3.0,2017-06-30,6059.0


In [20]:
# Boolean mask with the same shape as df: True wherever a value is missing.
is_NaN = df.isna()

In [24]:
# One boolean per row: True when any column in that row is missing.
is_NaN.any(axis="columns")

0        False
1        False
2        False
3        False
4        False
         ...  
15031    False
15032    False
15033    False
15034    False
15035    False
Length: 15036, dtype: bool

In [26]:
# Collapse the cell-level mask to one flag per row (True if any column is missing).
row_has_NaN = is_NaN.any(axis="columns")

# Keep only the flagged rows so the incomplete records can be inspected.
rows_with_NaN = df.loc[row_has_NaN]
rows_with_NaN

Unnamed: 0,parcelid,propertylandusetypeid,calculatedfinishedsquarefeet,taxamount,taxvaluedollarcnt,bedroomcnt,bathroomcnt,transactiondate,fips
280,14466991,261.0,,25740.28,2493444.0,0.0,0.0,2017-05-01,6059.0
920,13972530,261.0,,3230.88,270009.0,0.0,0.0,2017-05-04,6059.0
921,14325627,261.0,,81640.56,7074075.0,0.0,0.0,2017-05-04,6059.0
940,12686981,261.0,,60905.26,5461875.0,0.0,0.0,2017-05-04,6037.0
2629,12131538,261.0,,2736.14,225126.0,0.0,0.0,2017-05-11,6037.0
4096,17292027,261.0,,34159.22,3210155.0,0.0,0.0,2017-05-18,6111.0
5180,12920381,261.0,944.0,,352000.0,3.0,2.0,2017-05-24,6037.0
5553,14430787,261.0,,80251.5,5126781.0,0.0,0.0,2017-05-25,6059.0
6585,17267536,261.0,,302.94,26237.0,0.0,0.0,2017-05-30,6111.0
6942,167687839,261.0,,22045.81,1842678.0,0.0,0.0,2017-05-31,6037.0
