In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

import acquire
import summarize
import prepare

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# ACQUIRE

In [None]:
# Load the dataset from data.csv (path is relative to the notebook's working directory).
df = pd.read_csv('data.csv')

In [None]:
# Mean log error across every row of the dataset.
df['logerror'].mean()

In [None]:
# Mean log error for rows with more bathrooms than bedrooms.
more_baths_than_beds = df['bathroomcnt'] > df['bedroomcnt']
df.loc[more_baths_than_beds, 'logerror'].mean()

In [None]:
# Mean log error for rows with fewer bathrooms than bedrooms.
fewer_baths_than_beds = df['bathroomcnt'] < df['bedroomcnt']
df.loc[fewer_baths_than_beds, 'logerror'].mean()

In [None]:
# Mean log error for rows with the same number of bathrooms and bedrooms.
equal_baths_and_beds = df['bathroomcnt'] == df['bedroomcnt']
df.loc[equal_baths_and_beds, 'logerror'].mean()

#### Keep a master latitude/longitude DataFrame

In [None]:
# Keep the coordinate columns of every row in their own frame.
lat_long = df.loc[:, ['latitude', 'longitude']]

In [None]:
# Number of rows in the coordinate frame.
lat_long.shape[0]

# Prepare

### Handle nulls: missing heating-system type ids are imputed with 2; any rows that still contain nulls are dropped.

In [None]:
# Columns describing the building itself, plus the target (logerror).
# The only location data kept is latitude/longitude.
house_vars = [
    'bathroomcnt',
    'bedroomcnt',
    'calculatedfinishedsquarefeet',
    'heatingorsystemtypeid',
    'lotsizesquarefeet',
    'yearbuilt',
    'structuretaxvaluedollarcnt',
    'taxvaluedollarcnt',
    'latitude',
    'longitude',
    'logerror',
]

In [None]:
# Display the selected column names.
house_vars

In [None]:
# Take an explicit, independent copy of the selected columns. The cleaning
# steps that follow modify this frame, and doing so on a frame derived from a
# selection of `df` is what triggers pandas' SettingWithCopyWarning.
structures_df = df[house_vars].copy()

In [None]:
# Impute missing heating-system type ids with 2.
# NOTE(review): 2 is presumably the most common heating type id -- confirm.
#
# The frame is reassigned instead of calling fillna(inplace=True) on the column
# accessor: that chained in-place form acts on an intermediate Series and does
# not update the frame when pandas copy-on-write is active.
structures_df = structures_df.fillna({'heatingorsystemtypeid': 2})

In [None]:
# Drop every row that still contains a null in any column.
# Rebinding the name (rather than inplace=True) avoids mutating the existing
# object in place.
structures_df = structures_df.dropna()

In [None]:
# Run the project's null summary helper on the cleaned frame.
# NOTE(review): presumably reports null counts per column and should show none
# remaining at this point -- confirm against summarize.py.
summarize.nulls_by_col(structures_df)

### Split into the train and test sets

In [None]:
# 80/20 train/test split with a fixed seed so the split is reproducible.
train, test = train_test_split(
    structures_df,
    train_size=0.8,
    random_state=123,
)

In [None]:
# Remove the coordinates from both modeling frames.
# The names are rebound rather than using drop(inplace=True), so the objects
# returned by train_test_split are never mutated in place.
location_cols = ['latitude', 'longitude']
train = train.drop(columns=location_cols)
test = test.drop(columns=location_cols)

In [None]:
# Preview the first rows of the training frame after the coordinates were dropped.
train.head()

### Standardize the features (standard scaling)

In [None]:
# Scale train and test with the project's prepare helper; it returns three
# values, unpacked here.
# NOTE(review): presumably the scaler is fit on train only and `standard_object`
# is the fitted scaler -- confirm against prepare.py.
standard_train, standard_test, standard_object = prepare.standardize_train_test(train, test)

### Drop the upper outliers

In [None]:
# Remove upper outliers using the scaled square-footage and lot-size columns.
# NOTE(review): the second call is given `train` again and rebinds
# `no_outliers`, so the result of the first call is discarded -- only the
# lot-size filter survives. Also, the clustering below is fit on
# `standard_train`, not on `no_outliers`. Confirm the intended behaviour
# against prepare.remove_upper_outliers before changing.
no_outliers = prepare.remove_upper_outliers(standard_train.calculatedfinishedsquarefeet, train)
no_outliers = prepare.remove_upper_outliers(standard_train.lotsizesquarefeet, train)

In [None]:
# Shape of the frame returned by the outlier removal step.
no_outliers.shape

In [None]:
# Three-cluster KMeans. The seed is fixed (same value as the train/test split)
# so that cluster label numbers are stable between runs; without it, which
# cluster ends up as 0, 1 or 2 can change on every run.
kmean = KMeans(n_clusters=3, random_state=123)

In [None]:
# Fit on the scaled feature columns only. Later cells append `cluster_labels3`,
# `latitude` and `longitude` to `standard_train`; excluding them here keeps the
# fit unchanged if this cell is re-run after those cells. On a fresh top-to-bottom
# run none of these columns exist yet, so errors='ignore' makes this a no-op.
derived_cols = ['cluster_labels3', 'latitude', 'longitude']
kmean.fit(standard_train.drop(columns=derived_cols, errors='ignore'))

In [None]:
# Cluster label assigned to each training row by the fitted model.
predictions3 = kmean.labels_

# Store the labels as a new column on the scaled training frame.
standard_train['cluster_labels3'] = predictions3

In [None]:
# Distinct cluster labels and the number of rows assigned to each.
np.unique(predictions3, return_counts=True)

In [None]:
# Per-cluster mean of every column in the scaled training frame.
standard_train.groupby(by='cluster_labels3').agg('mean')

In [None]:
# Observation from the per-cluster means: the 2nd cluster does the best, with a
# very low log error.
# NOTE(review): cluster label numbers depend on the KMeans initialisation, so
# "2nd" is only meaningful for the run that produced this note unless the
# model is seeded.

In [None]:
# Re-attach the coordinates one column at a time; values are matched to rows
# by index label.
# NOTE(review): assumes standard_train still carries the original row index of
# `df` -- confirm against prepare.standardize_train_test.
standard_train['latitude'] = lat_long['latitude']
standard_train['longitude'] = lat_long['longitude']

In [None]:
# Preview the scaled training frame with the cluster label and coordinates attached.
standard_train.head()

In [None]:
# Plot the training rows by coordinate, coloured by cluster label.
sns.scatterplot(
    x='longitude',
    y='latitude',
    hue='cluster_labels3',
    data=standard_train,
)

In [None]:
# Plot every row of the raw dataset by coordinate, for comparison.
sns.scatterplot(
    x='longitude',
    y='latitude',
    data=df,
)

In [None]:
# Project-local helper module used for the map plot below.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
import cluster

In [None]:
# Call the project's map helper. It takes no arguments, so whatever data it
# plots is obtained inside cluster.py.
# NOTE(review): presumably it rebuilds or loads the clusters itself -- confirm
# it reflects the same model fitted above.
cluster.show_clusters_on_map()