In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

import acquire
import prepare
import env

from wrangle_zillow import wrangle_zillow_data

SyntaxError: unexpected EOF while parsing (wrangle_zillow.py, line 18)

## acquire

In [None]:
# SQL for the acquisition step: inner-joins properties_2017 to predictions_2017
# on parcelid, selects the columns listed below, and keeps only rows with
# propertylandusetypeid = 261.
# NOTE(review): 261 is presumably the single-family-residential land-use code —
# confirm against the data dictionary.
query ='''
select 
    prop.parcelid
    , pred.logerror
    , pred.transactiondate
    , bathroomcnt
    , bedroomcnt
    , calculatedfinishedsquarefeet
    , fips
    , latitude
    , longitude
    , lotsizesquarefeet
    , regionidcity
    , regionidcounty
    , regionidneighborhood
    , regionidzip
    , yearbuilt
    , structuretaxvaluedollarcnt
    , taxvaluedollarcnt
    , landtaxvaluedollarcnt
    , taxamount
from properties_2017 prop
inner join predictions_2017 pred on prop.parcelid = pred.parcelid
where propertylandusetypeid = 261;
'''

# Run the query against the connection URL returned by env.get_url('zillow')
# and load the result into `df`, the raw frame used by the cells that follow.
df = pd.read_sql(query, env.get_url('zillow'))

In [None]:
#df = pd.read_csv('zillow_data.csv')

In [None]:
#df = df.drop(columns=['Unnamed: 0'])

### Goal: Improve our original estimate of the log error by using clustering methodologies.

## Acquisition, Prep, and Initial Exploration
Using the notebook and files you created during the exercises, make any changes, additions, etc. you want at this point. NOTE: You will NOT be splitting into train and test at this point.

Ideas:

   1. Data types:

        - Write a function that takes in a dataframe and a list of column names and returns the dataframe with the datatypes of those columns changed to a non-numeric type.

In [None]:
# Preview the first rows of the acquired frame.
df.head()

In [None]:
# it works!
cols = ['bathroomcnt', 'latitude']


def change_data(df, cols):
    """
    Return a new dataframe in which the columns named in ``cols`` have been
    cast to ``object`` dtype.

    The converted columns are placed at the right-hand end of the returned
    frame; the remaining columns keep their relative order. The dataframe
    passed in is not modified.
    """
    as_objects = df.reindex(columns=cols).astype(object)
    untouched = df.drop(columns=cols)
    return pd.concat([untouched, as_objects], axis=1)
    
        



#zillow = df.copy()
#zillow = zillow.astype(object)

   - Use this function to appropriately transform any numeric columns that should not be treated as numbers.

In [None]:
# Column dtypes and non-null counts, used to spot numeric columns that are really identifiers.
df.info()

>Upon closer inspection, it looks like we could use this function to convert zip codes, parcelid, id, fips, rawcensustractandblock, and the regionid columns.

In [None]:
# Look at the raw values again before choosing which columns to cast.
df.head()

In [None]:
# Identifier-like columns: cast to object so they are not treated as quantities.
cols = [
    'parcelid',
    'fips',
    'regionidcity',
    'regionidcounty',
    'regionidzip',
    'regionidneighborhood',
]
zillow = change_data(df, cols)

In [None]:
# Check the dtypes and non-null counts of the converted frame.
zillow.info()

   2. Missing Values: Impute the values in land square feet.

In [None]:
# Show one randomly chosen row (the row differs on every run because no random_state is set).
zillow.sample()

In [None]:
# Percentage of missing values per column, rounded to two decimals.
round(zillow.isna().sum()/len(zillow)*100,2)

   3. Missing Values: Of the remaining missing values, can they be imputed or otherwise estimated?

        - Impute those that can be imputed with the method you feel best fits the attribute.
        - Decide whether to remove the rows or columns of any that cannot be reasonably imputed.
        - Document your reasons for the decisions on how to handle each of those.

In [None]:
# Percentage of missing values per column, rounded to two decimals (same check as before, repeated for this step).
round(zillow.isna().sum()/len(zillow)*100,2)

In [None]:
# We're going to drop regionidneighborhood because it's missing over 50% of its data.

In [None]:
# Drop regionidneighborhood; the notebook's stated reason is that over 50% of its values are missing.
zillow = zillow.drop(columns=['regionidneighborhood'])

In [None]:
# we're going to check out regionidcity and see if that column is vital to our analysis

In [None]:
# Frequency of each regionidcity code.
zillow.regionidcity.value_counts()

In [None]:
# The meaning of these regionidcity codes is unclear, so the column is dropped
# for now; if it turns out to be needed later, it can be imputed then.
zillow = zillow.drop(columns=['regionidcity'])

In [None]:
# Impute missing taxamount values with the column median.
taxamount_median = zillow['taxamount'].median()
zillow['taxamount'] = zillow['taxamount'].fillna(taxamount_median)

In [None]:
# Impute missing yearbuilt values with the column median.
yearbuilt_median = zillow['yearbuilt'].median()
zillow['yearbuilt'] = zillow['yearbuilt'].fillna(yearbuilt_median)


In [None]:
# Impute missing structuretaxvaluedollarcnt values with the column median.
structure_value_median = zillow['structuretaxvaluedollarcnt'].median()
zillow['structuretaxvaluedollarcnt'] = zillow['structuretaxvaluedollarcnt'].fillna(structure_value_median)

In [None]:
# Impute missing calculatedfinishedsquarefeet values with the column median.
finished_sqft_median = zillow['calculatedfinishedsquarefeet'].median()
zillow['calculatedfinishedsquarefeet'] = zillow['calculatedfinishedsquarefeet'].fillna(finished_sqft_median)

In [None]:
# regionidzip is an identifier (earlier in this notebook it is cast to object
# so that it is not treated as a number). A median can produce a zip code that
# does not exist in the data, so fill the gaps with the most frequent zip.
regionidzip_mode = zillow['regionidzip'].mode()
if not regionidzip_mode.empty:  # mode() is empty when every value is missing
    zillow['regionidzip'] = zillow['regionidzip'].fillna(regionidzip_mode.iloc[0])

In [None]:
# Count of missing values per column after the fills so far.
zillow.isna().sum()

In [None]:
# Fill the remaining missing values in the tax-value, land-value and lot-size
# columns with each column's own median.
for column in ('taxvaluedollarcnt', 'landtaxvaluedollarcnt', 'lotsizesquarefeet'):
    zillow[column] = zillow[column].fillna(zillow[column].median())

In [None]:
def the_master_imputer(df):
    """
    Return a copy of ``df`` in which every numeric column that contains
    missing values has them replaced by that column's median.

    Non-numeric columns are left untouched because a median is not defined
    for them. The dataframe passed in is not modified.
    """
    imputed = df.copy()
    # Iterating a DataFrame yields column *names*, so each column has to be
    # looked up with imputed[col] (attribute access such as df.col would look
    # for a column literally called "col").
    for col in imputed.select_dtypes('number'):
        if imputed[col].isna().any():
            imputed[col] = imputed[col].fillna(imputed[col].median())
    return imputed
    
    

In [None]:
def change_data_to_int(df, cols):
    """
    Return a new dataframe in which the columns named in ``cols`` have been
    cast to ``int``.

    The converted columns are placed at the right-hand end of the returned
    frame; the remaining columns keep their relative order. The dataframe
    passed in is not modified. Missing values cannot be cast to int, so the
    listed columns should be imputed before calling this.
    """
    as_ints = df.reindex(columns=cols).astype(int)
    untouched = df.drop(columns=cols)
    return pd.concat([untouched, as_ints], axis=1)

In [None]:
# Confirm how many missing values remain in each column.
zillow.isna().sum()

In [None]:
# Write the cleaned frame to zillow.csv.
# NOTE(review): the index is written as well (to_csv default), which is
# presumably why an 'Unnamed: 0' column has to be dropped after reloading.
zillow.to_csv('zillow.csv')

In [None]:
# Reload the frame from zillow.csv.
# NOTE(review): dtypes are re-inferred on read, so earlier object casts are
# presumably lost here — confirm with zillow.info().
zillow = pd.read_csv('zillow.csv')

In [None]:
# Check that the reloaded frame has no unexpected missing values.
zillow.isna().sum()

In [None]:
# Remove the 'Unnamed: 0' column picked up by the CSV round-trip.
zillow = zillow.drop(columns=['Unnamed: 0'])

In [None]:
# Inspect dtypes and non-null counts of the reloaded frame.
zillow.info()

In [None]:
# Cast the id and zip-code columns to object.
cols = [
    'parcelid',
    'fips',
    'regionidcounty',
    'regionidzip',
]
zillow = change_data(zillow, cols)

In [None]:
# Verify that the listed columns now have object dtype.
zillow.info()

   4. Outliers: Original from exercises. Adapt as you see fit.

        - Write a function that accepts a series (i.e. one column from a data frame) and summarizes how many outliers are in the series. This function should accept a second parameter that determines how outliers are detected, with the ability to detect outliers in 3 ways: IQR, standard deviations (z-score), and percentiles.

In [None]:
# converts df to series
def convert_to_series(df):
    '''
    Helper for the summarize function: stack the value counts of every column
    of ``df`` into a single Series.

    Columns with ``object`` dtype are counted per distinct value; every other
    column is counted in 10 bins (``value_counts(bins=10)``). The per-column
    results are concatenated in column order. An empty object-dtype Series is
    returned when ``df`` has no columns.
    '''
    counts = [
        df[col].value_counts() if df[col].dtype == 'object'
        else df[col].value_counts(bins=10)
        for col in df.columns
    ]
    if not counts:
        # pd.concat raises ValueError when given an empty list.
        return pd.Series([], dtype=object)
    # Series.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated result on every iteration.
    return pd.concat(counts)

In [None]:
# outlier detection with IQR as a filter
def get_upper_outliers(s, k):
    '''
    Measure how far each value of the series ``s`` lies above the upper IQR
    fence, where the fence is ``Q3 + k * IQR``.

    Values that do not exceed the fence map to 0; values above it map to
    their distance from the fence.
    '''
    q1 = s.quantile(.25)
    q3 = s.quantile(.75)
    iqr = q3 - q1
    upper_bound = q3 + k * iqr

    def distance_above(x):
        excess = x - upper_bound
        return 0 if excess < 0 else excess

    return s.apply(distance_above)

In [None]:
def add_upper_outlier_columns(df, k):
    '''
    Add a column with the suffix _outliers for every numeric column in the
    given dataframe.

    Each new column holds the output of get_upper_outliers for its source
    column with cutoff ``k``. The dataframe is modified in place and also
    returned, so callers that ignore the return value still see the new
    columns.

    Columns whose name already ends in _outliers are skipped, so calling the
    function again (for example by re-running the cell) refreshes the existing
    outlier columns instead of adding *_outliers_outliers columns.
    '''
    # Build the list up front; select_dtypes returns a new frame, so adding
    # columns to df inside the loop does not affect the iteration.
    source_cols = [col for col in df.select_dtypes('number')
                   if not col.endswith('_outliers')]
    for col in source_cols:
        df[col + '_outliers'] = get_upper_outliers(df[col], k)

    return df

In [None]:
# Adds the *_outliers columns to zillow in place, using k=1.5 as the IQR
# multiplier. The return value is only displayed here, not assigned.
add_upper_outlier_columns(zillow, 1.5)

In [None]:
# For each *_outliers column, describe only the rows flagged as outliers
# (values greater than 0).
outlier_cols = [name for name in zillow.columns if name.endswith('_outliers')]
for col in outlier_cols:
    is_outlier = zillow[col] > 0
    data = zillow.loc[is_outlier, col]
    print('~~~\n' + col)
    print(data.describe())

In [None]:
# Inspect the rows whose structuretaxvaluedollarcnt is over 1,000,000; these
# are the rows that will be dropped.
# NOTE(review): this preview uses a strict `>`, while the drop that follows
# uses `>=` — confirm which cutoff is intended.
zillow[zillow['structuretaxvaluedollarcnt'] > 1000000]

In [None]:
# Remove, in place, every row whose structuretaxvaluedollarcnt is at least 1,000,000.
over_cap_index = zillow.index[zillow['structuretaxvaluedollarcnt'] >= 1_000_000]
zillow.drop(index=over_cap_index, inplace=True)

In [None]:
# Check the row count and dtypes after removing the high-value rows.
zillow.info()

   5. Use your function defined above to identify columns where you should handle the outliers.

In [None]:
# NOTE(review): this cell does not do what it appears to.
# - `col` is not defined in this cell; it is whatever value an earlier loop
#   left behind, so the result depends on execution order.
# - `col in df.corr() >= 0.5` is a chained comparison, i.e.
#   `(col in df.corr()) and (df.corr() >= 0.5)`. The second operand is a
#   boolean DataFrame, whose truth value is ambiguous, so this raises
#   ValueError whenever `col` is one of the correlation-matrix columns and
#   otherwise always takes the else branch.
# - It reads the raw `df` rather than the cleaned `zillow`, and `newdf` is not
#   used anywhere in this cell.
# TODO: replace with an explicit per-column rule once the intended check is decided.
newdf = pd.DataFrame()
if col in df.corr() >= 0.5:
    print("got it")
else:
    print("not it")

   6. Write a function that accepts the zillow data frame and removes the outliers. You should make a decision and document how you will remove outliers.

   7. Is there erroneous data you have found that you need to remove or repair? If so, take action.

   8. Are there outliers you want to "squeeze in" to a max value? (e.g. all bathrooms > 6 => bathrooms = 6). If so, make those changes.

# Exploration with Clustering
## Cluster the Target Variable
    Why? By reducing the noise of the continuous variable, we can possibly see trends easier by turning this continuous variable into clusters and then comparing those clusters with respect to other variables through visualizations or tests.

    Perform clustering with logerror as the only feature used in the clustering algorithm. Decide on a number of clusters to use, and store the cluster predictions back onto your data frame as cluster_target. Look at the centroids that were produced in this process. What do they tell you?

    Use the produced clusters to help you explore through visualization how logerror relates to other variables. (A common way to do this is to use color to indicate the cluster id, and the other variables can be your x-axis and y-axis. (hint: look at your swarmplot function)).

## Cluster Independent Variables
   You should also perform some clustering based on a number of independent variables. Create and evaluate several clustering models based on subsets of the independent variables. Here are some ideas:

   - Location, that is, latitude and longitude
   - Size (finished square feet)
   - Location and size
   - Be sure to use these new clusters in exploring your data, and interpret what these clusters tell you.

## Test the Significance of Clusters
    Use statistical testing methods to determine whether the clusters you have created are significant in terms of their relationship to logerror.

# Modeling
## Feature Engineering
   1. Remove variables that are not needed, wanted, useful, or are redundant.
   2. Add any features you think may be useful.
   3. Split your data into training and test sets.
   4. Create subsets of data if you would like to create multiple models and then merge (such as, a different model for each cluster or for each county).

# Model Selection
   1. Train at least 3 different models (a model is different if there are changes in one or more of the following: features, hyper-parameters, algorithm). Create object, fit, predict & evaluate. Use mean absolute error or mean squared error to evaluate. Also, try regression algorithms you have not used before.
   2. Evaluate your best model on your test data set to get an idea of your model's out of sample error.