# Analysis of Data

In [1]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt

housing = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')
#housing["price_per_sqft"] = housing["price"] / housing["sqft_living"]

In [2]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

import seaborn as sns
import matplotlib.pyplot as plt

def plot_heat(df):
  plt.figure(figsize=(30, 10)) #change figure size
  sns.heatmap(df.drop(["id"], axis = 1).corr(), #data parameter/arguement
            annot = True, # Adds the Correlation value to each square
            fmt = '.1g', #reduce decimal places
            cmap = "coolwarm", #change the color scheme
            square = True,
  )# Model

In [3]:
import xgboost as xgb
#from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
#from datetime import datetime

def create_model(df, show_heat=False):
    if show_heat:
        plot_heat(df)
    df["lot_minus_living"] = df["sqft_lot"] - df["sqft_living"]
    df["year_sold"] = df.date.str[:4]
    df["month_sold"] = df.date.str[4:6]
    df["ratio_to_neighbors"] = df["sqft_living"] / df["sqft_living15"]
    bins = [0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0]
    labels = [1,2,3,4,5,6,7]
    df["bin"] = pd.cut(df['bathrooms'], bins, labels=labels)
    df = df[df['price'] < 2000000]
    df["yr_renovated"] = np.where(df["yr_renovated"] == 0, df["yr_built"], df["yr_renovated"])
    d = df.drop(["id", "date"], axis=1)
    d['zipcode'] = d.zipcode.astype("str")
    d = pd.get_dummies(d)
    scaler = MinMaxScaler()
    d = pd.DataFrame(scaler.fit_transform(d))
    mapper = {0: "price"}
    d.rename(mapper=mapper, axis=1, inplace=True)
    X = d.drop(["price"], axis=1)
    y = df.price
    X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.3, random_state=42)
    model = XGBRegressor(objective="reg:squarederror", subsample=0.3, random_state=42)
    model.fit(X_train, y_train)
    score = model.score(X_train, y_train)
    # print("Training score:", score)
    cv_score = cross_val_score(model, X_train, y_train, cv=10)
    # print("CV mean Score:", cv_score.mean())
    predictions = model.predict(X_test)
    result = mean_squared_error(y_test, predictions, squared=True) ** 0.5
    # print("Result: ", result)
    return result, model

In [20]:
def create_models(df, bins=10, start=200000, jump=100000):
    i = 0
    end = start + jump
    ranges = {}
    while end < #real_end:
        temp = df[df['price'] < end and df['price'] >= start]
        temp = temp[temp['price'] >= start]
        score, model = create_model(temp)
        ++i

        ranges[i] = {"Range": (start, end),
                         "Percent Price Difference": f"{round((score / end) * 100, 2)}%",
                         "Uncertainty": f"${round(score,2)}",
                         "model": model}
        start = end
    return ranges

SyntaxError: invalid syntax (<ipython-input-20-89ab30abb3b8>, line 4)

In [24]:
start=200000
end=3000000
df = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')
temp = df[df['price'] < end][df['price'] >= start]
temp.head()

  temp = df[df['price'] < end][df['price'] >= start]


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
5,7237550310,20140512T000000,1225000.0,4,4.5,5420,101930,1.0,0,0,...,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930


# 8 Accurate Models for the Homes Under \$1,600,000
- NOTE: There are 8 models here, there are not more due to the limited number of homes in the database that are above \$1,600,000

In [16]:
data = create_models(housing, 5000, decrease_accuracy=1.5, bins=8)

17602.913257767246
18301.376175598078
18215.503606214792
18820.245850897252
18801.809661418854
20142.54378222041
19526.144383838062
19522.598764881328


In [17]:
for key,value in data.items():
  print("Model:", key)
  print("Price Range:", value['Range'])
  print("Percent Off:", value['Percent Price Difference'])
  print("Uncertainty:", value['Uncertainty'],'\n')

Model: 1
Price Range: (200000, 250000)
Percent Off: 7.04%
Uncertainty: $17602.91 

Model: 2
Price Range: (250000, 300000)
Percent Off: 6.1%
Uncertainty: $18301.38 

Model: 3
Price Range: (300000, 350000)
Percent Off: 5.2%
Uncertainty: $18215.5 

Model: 4
Price Range: (350000, 400000)
Percent Off: 4.71%
Uncertainty: $18820.25 

Model: 5
Price Range: (400000, 450000)
Percent Off: 4.18%
Uncertainty: $18801.81 

Model: 6
Price Range: (450000, 500000)
Percent Off: 4.03%
Uncertainty: $20142.54 

Model: 7
Price Range: (500000, 550000)
Percent Off: 3.55%
Uncertainty: $19526.14 

Model: 8
Price Range: (550000, 600000)
Percent Off: 3.25%
Uncertainty: $19522.6 



In [7]:
print("There are only", len(housing[housing['price'] > 1600000]), "homes that are over $1,600,000. This is hardly enough to really do any good accurate model to predict the price of these homes.")

There are only 426 homes that are over $1,600,000. This is hardly enough to really do any good accurate model to predict the price of these homes.
