In [1]:
# Data Science Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
from env import host, user, password
import os
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor

# My Zillow Functions
import wrangle

# Block Warning Boxes
import warnings
warnings.filterwarnings("ignore")


<hr style="border:2px solid blue"> </hr>


# Modeling

In [2]:
# Using the functions from my compiled wrangle file
train, validate, test = wrangle.wrangle_zillow()
#train, validate, test = wrangle.Min_Max_Scaler(train, validate, test)

In [3]:
# Assigning my features to X, and target variable to y, for each split dataframe

X_train = train.drop(columns=['tax_value'])
y_train = train.tax_value

X_validate = validate.drop(columns=['tax_value'])
y_validate = validate.tax_value

X_test = test.drop(columns=['tax_value'])
y_test = test.tax_value

In [4]:
# how many observations and features we have in each dataframe
X_train.shape, X_validate.shape, X_test.shape

((25529, 7), (10942, 7), (9118, 7))

In [5]:
# We need y_train and y_validate to be dataframes to append the new columns with predicted values. 
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

In [6]:
# The Project specs told us to use mean for our baseline
# 1. Predict tax_value_pred_mean
tax_value_pred_mean = y_train.tax_value.mean()
y_train['tax_value_pred_mean'] = tax_value_pred_mean
y_validate['tax_value_pred_mean'] = tax_value_pred_mean

y_train.head()

Unnamed: 0,tax_value,tax_value_pred_mean
36317,52725.0,372511.860237
26167,56275.0,372511.860237
635,683000.0,372511.860237
49331,361881.0,372511.860237
10086,465989.0,372511.860237


In [7]:
# Why am i using rmse? describe
# 3. RMSE of tax_value_pred_mean
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_mean) ** (0.5)
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_mean) ** (0.5)

In [8]:
metric_df = pd.DataFrame(data=[
            {
                'model': 'mean_baseline', 
                'RMSE_train': rmse_train,
                'RMSE_validate': rmse_validate
                }
            ])

In [9]:
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate
0,mean_baseline,243722.125753,245317.957751


# LinearRegression (OLS)

In [10]:
# Create the model 
lr = LinearRegression()

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lr.fit(X_train, y_train.tax_value)

# predict train using the model
y_train['tax_value_pred_LR'] = lr.predict(X_train)

# predict validate
y_validate['tax_value_pred_LR'] = lr.predict(X_validate)


y_train.head()

Unnamed: 0,tax_value,tax_value_pred_mean,tax_value_pred_LR
36317,52725.0,372511.860237,360868.50637
26167,56275.0,372511.860237,292741.942524
635,683000.0,372511.860237,494296.34544
49331,361881.0,372511.860237,511864.105429
10086,465989.0,372511.860237,262734.543629


In [11]:
# Evaluate tax_value_pred_LR using RMSE
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_LR) ** (1/2)
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_LR) ** (1/2)


In [12]:
# Add model results to results dataframe
metric_df = metric_df.append({
    'model': 'OLS Regressor', 
    'RMSE_train': rmse_train,
    'RMSE_validate': rmse_validate,
    }, ignore_index=True)

In [13]:
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate
0,mean_baseline,243722.125753,245317.957751
1,OLS Regressor,215650.064606,217342.952446


# LassoLars

In [14]:
# create the model object
lars = LassoLars(alpha=0.01)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series!
lars.fit(X_train, y_train.tax_value)

# predict train
y_train['tax_value_lassolars'] = lars.predict(X_train)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_lassolars) ** (1/2)

# predict validate
y_validate['tax_value_lassolars'] = lars.predict(X_validate)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_lassolars) ** (1/2)

metric_df = metric_df.append({
    'model': 'lasso_alpha0.01', 
    'RMSE_train': rmse_train,
    'RMSE_validate': rmse_validate,
    }, ignore_index=True)

metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate
0,mean_baseline,243722.125753,245317.957751
1,OLS Regressor,215650.064606,217342.952446
2,lasso_alpha0.01,215650.064679,217342.981245


# TweedieRegressor (Generalized Linear Model)

In [15]:
# create the model object
glm = TweedieRegressor(power=1, alpha=0)


# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
glm.fit(X_train, y_train.tax_value)

# predict train
y_train['tax_value_pred_glm'] = glm.predict(X_train)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_glm) ** (1/2)

# predict validate
y_validate['tax_value_pred_glm'] = glm.predict(X_validate)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_glm) ** (1/2)

metric_df = metric_df.append({
    'model': 'glm_poisson', 
    'RMSE_train': rmse_train,
    'RMSE_validate': rmse_validate,
    }, ignore_index=True)

metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate
0,mean_baseline,243722.125753,245317.957751
1,OLS Regressor,215650.064606,217342.952446
2,lasso_alpha0.01,215650.064679,217342.981245
3,glm_poisson,243722.125753,245317.957751


In [16]:
def Regression_Modeling(target, X_train, y_train, X_validate, y_validate):
    # Baseline
    tax_value_pred_mean = y_train.tax_value.mean()
    y_train['tax_value_pred_mean'] = tax_value_pred_mean
    y_validate['tax_value_pred_mean'] = tax_value_pred_mean
    rmse_train = mean_squared_error(y_train[target], y_train[target + '_pred']) ** (1/2)
    rmse_validate = mean_squared_error(y_validate[target], y_validate[target + '_pred']) ** (1/2)

    # Add model results to results to new dataframe
    metric_df = pd.DataFrame(data=[
            {
                'model': 'mean_baseline', 
                'RMSE_train': rmse_train,
                'RMSE_validate': rmse_validate
                }
            ]) 

    # Linear Regression Modeling
    lr = LinearRegression() 
    lr.fit(X_train, y_train[target])
    y_train[target + '_pred'] = lr.predict(X_train)
    y_validate[target + '_pred'] = lr.predict(X_validate)
    rmse_train = mean_squared_error(y_train[target], y_train[target + '_pred']) ** (1/2)
    rmse_validate = mean_squared_error(y_validate[target], y_validate[target + '_pred']) ** (1/2)

    # Add model results to results dataframe
    metric_df = metric_df.append({
        'model': 'OLS Regressor', 
        'RMSE_train': rmse_train,
        'RMSE_validate': rmse_validate,
        }, ignore_index=True)

    # LassoLars
    lars = LassoLars(alpha=0.01)
    lars.fit(X_train, y_train[target])
    y_train[target + '_pred'] = lars.predict(X_train)
    rmse_train = mean_squared_error(y_train[target], y_train[target + '_pred']) ** (1/2)
    y_validate[target + '_pred'] = lars.predict(X_validate)
    rmse_validate = mean_squared_error(y_validate[target], y_validate[target + '_pred']) ** (1/2)

    # Add model results to results dataframe
    metric_df = metric_df.append({
        'model': 'LassoLars', 
        'RMSE_train': rmse_train,
        'RMSE_validate': rmse_validate,
        }, ignore_index=True)

    # Tweedie Regressor
    glm = TweedieRegressor(power=1, alpha=0)
    glm.fit(X_train, y_train.tax_value)
    y_train[target + '_pred'] = glm.predict(X_train)
    rmse_train = mean_squared_error(y_train[target], y_train[target + '_pred']) ** (1/2)
    y_validate[target + '_pred'] = glm.predict(X_validate)
    rmse_validate = mean_squared_error(y_validate[target], y_validate[target + '_pred']) ** (1/2)

    # Add model results to results dataframe
    metric_df = metric_df.append({
        'model': 'glm_poisson', 
        'RMSE_train': rmse_train,
        'RMSE_validate': rmse_validate,
        }, ignore_index=True)
    
    return metric_df

In [17]:
example = Regression_Modeling('tax_value', X_train, y_train, X_validate, y_validate)
example

KeyError: 'tax_value_pred'

### What does this mean?
Is bigger number better? 
- For RMSE lower is actually better! We want a lower error

What if my model isn't better than baseline? 
- That means your data has CRAPPY FEATURES! If this is the case you are better off just using baseline.

So which of my models is the best? 
- So far OLS is my best model because lowest numbers. However, Scaling has made it so I cant tell my dollar error. I might want to consider unscaling my target variable so I can provide my RMSE in dollars.

Note: 
- Scaling simply helps our code run faster.
- For the future, scale the features but avoid scaling the target so you can see your error in the unit of your target variable.

check your copy paste shouldnt be so close of numbers

 <span style="color: red">
    
# Modeling (.ipynb) - Evaluate Baseline
Having a baseline tells you whether a model you build using the features you selected is any better than predicting by using only the target variable. One way a baseline is created in classification is by making predictions purely based on the most common outcome class, like predicting that all titanic passengers will die, becuase the majroity did die. By doing that, you end up with the highest accuracy without using extra information from features. The baseline is based on the training dataset. For a continuous target variable, the baseline could be predicting that all salaries will be the median salary of our labeled train data. The predictions should be made on the training data using this information (like the predicted value, y_hat, for all passengers "survived" == 0) and then performance evaluated to measure your models against. If any model you build does not perform as well as a baseline that uses no features, then your features are not significant drivers of the outcome.

# Modeling (.ipynb) - Evaluate on Train
All models should be evaluated on train: the training smaple is our largest sample, and it is a sample of data we have to both fit the model AND see how the model performs. We should never skip straight to validate. We would be missing out on valuable observations.

# Modeling (.ipynb) - Evaluate on Validate
The top models should be evaluated with the validation sample dataset. It is important to use the validate sample for checking for any overfitting that may have occurred when fitting the model on train. If you are creating 10's of models, it is also important to only validate a handful of your top models with the Validate dataset. Otherwise, your data will have seen validate as much as train and you could accidentally introduce some implicit bias based on data and results you see while validating on so many models.

# Modeling (.ipynb) - Select Evaluation Metric
Clear communication as to how you evaluated and compared models. What metric(s) did you use and why? For example, in one case, you may decide to use precision over accuracy. If so, why? If you use multiple metrics, how will you decide which to select if metric is better for model A but another is better for model B? Will you rank them? Find a way to aggregate them into a single metric you can use to rank?
    
# Modeling (.ipynb) - Evaluate Top Model on Test
Your top performing model, and only your top performing model should be evaluated on your test dataset. The purpose of having a test dataset to evaluate only the final model on is to have an estimate of how the model will perform in the future on data it has never seen.

# Modeling (.ipynb) - Develope 3 Models
The 3 models can differ based on the features used, the hyperparameters selected, and/or the algorithm used to fit the data.
 
</span>