# Analysis of Data

In [30]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt

housing = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')
#housing["price_per_sqft"] = housing["price"] / housing["sqft_living"]

In [2]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

import seaborn as sns
import matplotlib.pyplot as plt

def plot_heat(df):
  plt.figure(figsize=(30, 10)) #change figure size
  sns.heatmap(df.drop(["id"], axis = 1).corr(), #data parameter/arguement
            annot = True, # Adds the Correlation value to each square
            fmt = '.1g', #reduce decimal places
            cmap = "coolwarm", #change the color scheme
            square = True,
  )# Model

In [34]:
import xgboost as xgb
#from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
#from datetime import datetime

def create_model(df, show_heat=False):
    if show_heat:
        plot_heat(df)
    df["lot_minus_living"] = df["sqft_lot"] - df["sqft_living"]
    df["year_sold"] = df.date.str[:4]
    df["month_sold"] = df.date.str[4:6]
    df["ratio_to_neighbors"] = df["sqft_living"] / df["sqft_living15"]
    bins = [0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0]
    labels = [1,2,3,4,5,6,7]
    df["bin"] = pd.cut(df['bathrooms'], bins, labels=labels)
    df = df[df['price'] < 2000000]
    df["yr_renovated"] = np.where(df["yr_renovated"] == 0, df["yr_built"], df["yr_renovated"])
    d = df.drop(["id", "date"], axis=1)
    d['zipcode'] = d.zipcode.astype("str")
    d = pd.get_dummies(d)
    scaler = MinMaxScaler()
    d = pd.DataFrame(scaler.fit_transform(d))
    mapper = {0: "price"}
    d.rename(mapper=mapper, axis=1, inplace=True)
    X = d.drop(["price"], axis=1)
    y = df.price
    X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.3, random_state=42)
    model = XGBRegressor(objective="reg:squarederror", subsample=0.3, random_state=42)
    model.fit(X_train, y_train)
    score = model.score(X_train, y_train)
    # print("Training score:", score)
    cv_score = cross_val_score(model, X_train, y_train, cv=10)
    # print("CV mean Score:", cv_score.mean())
    predictions = model.predict(X_test)
    result = mean_squared_error(y_test, predictions, squared=True) ** 0.5
    # print("Result: ", result)
    return result, model

In [10]:
predictions = model.predict(X_test)
result = mean_squared_error(y_test, predictions, squared=False)
result

115204.86766305543

In [47]:
def create_models(df, accuracy, decrease_accuracy=1, bins=10, start=200000, jump=50000):
    end = start #+ jump
    ranges = {}
    for i in range(bins):
        score = 1.0
        while score < accuracy:
            end += jump
            temp = df[df['price'] < end]
            temp = temp[temp['price'] >= start]
            score, model = create_model(temp)
            #end += jump
        print(end)
        print(temp.price.max())
    
    
        accuracy *= decrease_accuracy
        ranges[i + 1] = {"Range": (start, end), 
                         "Percent Price Difference": f"{round((score / end) * 100, 2)}%",
                         "Uncertainty": f"${round(score,2)}",
                         "model": model}
        start = end
        end += jump
    return ranges

# 8 Accurate Models for the Homes Under \$1,600,000
- NOTE: There are 8 models here, there are not more due to the limited number of homes in the database that are above \$1,600,000

In [48]:
data = create_models(housing, 5000, decrease_accuracy=1.5, bins=8)

250000
249950.0
350000
349990.0
450000
449999.0
550000
549995.0
650000
649990.0
800000
799990.0
1000000
999999.0
1250000
1249000.0


In [50]:
for key,value in data.items():
  print("Model:", key)
  print("Price Range:", value['Range'])
  print("Percent Off:", value['Percent Price Difference'])
  print("Uncertainty:", value['Uncertainty'],'\n')

Model: 1
Price Range: (200000, 250000)
Percent Off: 7.04%
Uncertainty: $17602.91 

Model: 2
Price Range: (250000, 350000)
Percent Off: 8.52%
Uncertainty: $29837.05 

Model: 3
Price Range: (350000, 450000)
Percent Off: 7.18%
Uncertainty: $32298.77 

Model: 4
Price Range: (450000, 550000)
Percent Off: 6.55%
Uncertainty: $36010.4 

Model: 5
Price Range: (550000, 650000)
Percent Off: 5.48%
Uncertainty: $35615.91 

Model: 6
Price Range: (650000, 800000)
Percent Off: 6.3%
Uncertainty: $50366.66 

Model: 7
Price Range: (800000, 1000000)
Percent Off: 6.84%
Uncertainty: $68357.07 

Model: 8
Price Range: (1000000, 1250000)
Percent Off: 7.44%
Uncertainty: $93037.87 



In [None]:
print("There are only", len(housing[housing['price'] > 1600000]), "homes that are over $1,600,000. This is hardly enough to really do any good accurate model to predict the price of these homes.")

There are only 426 homes that are over $1,600,000. This is hardly enough to really do any good accurate model to predict the price of these homes.
