In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wrangle import wrangle_zillow
from wrangle import scale_zillow
from splitter import splitter

import warnings
# NOTE(review): with no category or module filter this silences every warning for the
# rest of the session, including deprecation and numerical warnings from the libraries above.
warnings.filterwarnings("ignore")

## 1. Load your zillow dataset.

In [2]:
# Load the Zillow data through the project's wrangle helper.
df = wrangle_zillow()
# NOTE(review): the printed summary lists Train | Validate | Test, while the result is
# unpacked as (train, test, validate) — confirm this matches splitter's return order.
train, test, validate = splitter(df)

Train = 56.0% | Validate = 24.0% | Test = 20.0%
You did not stratify.  If looking to stratify, ensure to add argument: "target = variable to stratify on".


Will use train as well as train scaled:

In [3]:
# Scaled counterparts of the three splits, produced by the project's scale_zillow helper.
# NOTE(review): presumably the scaler is fit on train only — confirm in wrangle.scale_zillow.
train_scaled, test_scaled, validate_scaled = scale_zillow(train, test, validate)

## 2. Fit a linear regression model (ordinary least squares) and compute yhat, predictions of *taxvaluedollarcnt* using only *calculatedfinishedsqft*.

For this question I will use the non-scaled dataset:

In [4]:
from sklearn.linear_model import LinearRegression

# Single-feature design matrix; the double brackets keep it 2-D, which sklearn expects.
X_sqft = train[['sqft']]
y_value = train.value

# Ordinary least squares fit of value on sqft.
model = LinearRegression()
model.fit(X_sqft, y_value)

# yhat: in-sample predictions for the training rows.
predictions = model.predict(X_sqft)
predictions

array([ 789060.83003177,  502744.05797135,  379306.47001673, ...,
       1079394.80170018,  405600.86745085,  567019.2516992 ])

Plot of the ordinary least squares (OLS) fit, based on yhat:

In [None]:
# Observed training values with the fitted OLS line (yhat) drawn on top.
# Axis labels, a title and a legend are included so the figure can be read on its own,
# matching the labelled residual plot later in the notebook.
plt.scatter(train.sqft, train.value, label='Observed')
plt.plot(train.sqft, predictions, c='red', label='OLS fit (yhat)')
plt.xlabel('Square Footage')
plt.ylabel('Tax Value')
plt.title('Tax Value vs. Square Footage')
plt.legend()
plt.show()

## 3. Plot the residuals for the linear regression model that you made

In [None]:
# Build a separate frame (leaving `train` untouched) that carries the predictions
# and the residual, i.e. actual value minus predicted value, for every training row.
train_copy = train.assign(
    yhat=predictions,
    residual=lambda frame: frame.value - frame.yhat,
)
train_copy.head(2)

In [None]:
# Residuals plotted against the predictor on a square 8x8 canvas.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(train_copy.sqft, train_copy.residual)
ax.set_xlabel('Square Footage')
ax.set_ylabel('Residual')
plt.show()

## 4. Calculate the sum of squared errors, explained sum of squares, total sum of squares, mean squared error, and root mean squared error for your model.

In [None]:
# Baseline model: predict the mean observed value for every row, and record
# how far each actual value sits from that constant prediction.
train_copy['baseline'] = train_copy['value'].mean()
train_copy['baseline_residual'] = train_copy['value'].sub(train_copy['baseline'])

In [None]:
# Square both residual columns (baseline and OLS) so they can be summed into error metrics.
train_copy['baseline_residual^2'] = train_copy['baseline_residual'].pow(2)
train_copy['residual^2'] = train_copy['residual'].pow(2)

#### SSE:

In [None]:
# Sum of squared errors: the total of the model's squared residuals.
SSE = train_copy.loc[:, 'residual^2'].sum()
print(f'SSE = {SSE}')

#### SSE Baseline (TSS):

In [None]:
# The SSE of the mean-only baseline is the total sum of squares,
# so both names are bound to the same number.
TSS = train_copy.loc[:, 'baseline_residual^2'].sum()
SSE_baseline = TSS
print(f'SSE Baseline = TSS = {TSS}')

#### ESS:

In [None]:
# Explained sum of squares, obtained by difference rather than computed directly.
# The decomposition TSS = ESS + SSE holds for a least-squares fit with an intercept
# evaluated on its own training data.
# NOTE(review): relies on LinearRegression fitting an intercept by default — confirm.
ESS = TSS - SSE
print(f'ESS = {ESS}')

#### MSE:

In [None]:
# Mean squared error: each sum of squares divided by the number of training rows.
n_rows = len(train_copy)
MSE = SSE / n_rows
MSE_baseline = TSS / n_rows
print(f'MSE = {MSE:.0f}; Baseline MSE = {MSE_baseline:.0f}.')

# The model only counts as better when its MSE is strictly below the baseline's.
if MSE < MSE_baseline:
    print(f'Model is superior to baseline by {(MSE_baseline-MSE):.0f}')
else:
    print('Model fails to perform better than baseline.')

#### RMSE

In [None]:
from math import sqrt

# Root mean squared error: square roots of the model and baseline MSE.
RMSE, RMSE_baseline = sqrt(MSE), sqrt(MSE_baseline)
print(f'RMSE = {RMSE:.0f}; Baseline RMSE = {RMSE_baseline:.0f}.')

# The model only counts as better when its RMSE is strictly below the baseline's.
if RMSE < RMSE_baseline:
    print(f'Model is superior to baseline by {(RMSE_baseline-RMSE):.0f}')
else:
    print('Model fails to perform better than baseline.')

## 5. Calculate the sum of squared errors, mean squared error, and root mean squared error for the baseline model (i.e. a model that always predicts the average taxvaluedollarcnt amount).

##### See #4 above

## 6. Write python code that compares the sum of squared errors for your model against the sum of squared errors for the baseline model and outputs whether or not your model performs better than the baseline model.

##### See #4 above

## 7. What is the amount of variance explained in your model?

In [None]:
# R^2: the explained sum of squares as a fraction of the total sum of squares.
R2 = ESS / TSS
percent_explained = 100 * R2
print(f'R2 = {R2:.2f}.  Percent of variance in y explained by x = {percent_explained:.1f}%')

## 8. Is your model better than the baseline model?

Yes: the model's SSE, MSE and RMSE are all lower than the baseline's.

## 9. Create a file named evaluate.py that contains the following functions.

In [None]:
import evaluate

First, the plotting function:

In [None]:
# Project helper from evaluate.py, given the predictor, the actual values and the predictions (yhat).
# NOTE(review): presumably reproduces the residual scatter drawn earlier — confirm in evaluate.py.
evaluate.plot_residuals(train_copy.sqft, train_copy.value, train_copy.yhat)

Next, the regression and baseline error functions:

In [None]:
# Project helper from evaluate.py, given the actual values and the model predictions (yhat).
# NOTE(review): presumably returns the model's SSE/ESS/TSS/MSE/RMSE — confirm in evaluate.py.
evaluate.regression_errors(train_copy.value, train_copy.yhat)

In [None]:
# Project helper from evaluate.py, given only the actual values.
# NOTE(review): presumably returns SSE/MSE/RMSE for a mean-only baseline — confirm in evaluate.py.
evaluate.baseline_mean_errors(train_copy.value)

Finally, the function that compares the model to the baseline (using the previous two functions):

In [None]:
# Project helper from evaluate.py, given the actual values and the model predictions (yhat).
# NOTE(review): presumably reports whether the model's error beats the baseline's — confirm in evaluate.py.
evaluate.better_than_baseline(train_copy.value, train_copy.yhat)