In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#These imports were constructed
from wrangle import train_validate, wrangle_zillow, scale_zillow, find_baseline, train_val_test

#Removes warnings and imporves asthenics
import warnings
warnings.filterwarnings("ignore")

#Sklearn
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error

In [2]:
#Sets options to show more stuff
pd.options.display.max_columns = None
pd.options.display.width = 100
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 200

# Select a dataset with a continuous target variable.

In [3]:
df = wrangle_zillow()

In [4]:
target_col = 'tax_value'

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2131116 entries, 0 to 2131115
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   bedrooms    float64
 1   bathrooms   float64
 2   sqft        float64
 3   tax_value   float64
 4   year_built  float64
 5   tax_amount  float64
 6   fips        float64
dtypes: float64(7)
memory usage: 113.8 MB


# Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [6]:
#Create dummy columns of my categorical features
df = pd.get_dummies(columns=['bedrooms', 'bathrooms', 'fips'], data=df)

In [7]:
train, val, test = train_validate(df)

In [8]:
train_rscaled1, val_rscaled1, test_rscaled1 = scale_zillow(train, val, test)

In [9]:
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test(train_rscaled1, val_rscaled1, test_rscaled1, target_col)

# Work through all of the steps outlined in the lesson, from setting the baseline to selected a model and evaluating the final model on your test data.

In [10]:
y_train.head()

826977     133059.0
1188647    248917.0
400580     340689.0
1390551    240199.0
1625307    409199.0
Name: tax_value, dtype: float64

In [11]:
baseline = find_baseline(y_train)

RMSE Mean training baseline: 435,471
RMSE Mean validate baseline: 438,630
The difference:   -3,159
*****************************************
RMSE Median training baseline: 448,737
RMSE Median validate baseline: 452,039
The difference:   -3,302


In [12]:
#metrics
rmse_scores = []

## LinearRegression(OLS)

In [13]:
lm = LinearRegression(normalize=True)

In [14]:
lm.fit(X_train, y_train)

LinearRegression(normalize=True)

In [15]:
lm_preds = pd.DataFrame({'actual':y_train})

In [16]:
lm_preds['pred_lm'] = lm.predict(X_train)

In [17]:
lm_val_preds = pd.DataFrame({'actual':y_val})

In [18]:
lm_val_preds['lm_val_preds'] = lm.predict(X_val)

In [19]:
lm_val_preds.head()

Unnamed: 0,actual,lm_val_preds
1894038,65327.0,91192.0
1141135,217742.0,214400.0
90824,176120.0,170712.0
96180,616000.0,581552.0
1731006,336257.0,325344.0


In [20]:
rmse_train = mean_squared_error(lm_preds['actual'], lm_preds['pred_lm'], squared=False) 

In [21]:
rmse_val = mean_squared_error(lm_val_preds['actual'], lm_val_preds['lm_val_preds'], squared=False) 

In [22]:
rmse_scores.append({'Model':'OLS Linear Regression',
                    'RMSE on Train': rmse_train,
                    'RMSE on Validate': rmse_val})

In [23]:
rmse_scores

[{'Model': 'OLS Linear Regression',
  'RMSE on Train': 69181.77739064753,
  'RMSE on Validate': 69192.57636832046}]

## LassoLars

In [24]:
lars = LassoLars(alpha=1.0)

In [25]:
lars.fit(X_train, y_train)

LassoLars()

In [26]:
ll_preds = pd.DataFrame({'actual':y_train})

In [27]:
ll_preds['pred_ll'] = lars.predict(X_train)

In [28]:
ll_val_preds = pd.DataFrame({'actual':y_val})

In [29]:
ll_val_preds['ll_val_preds'] = lars.predict(X_val)

In [30]:
ll_val_preds.head()

Unnamed: 0,actual,ll_val_preds
1894038,65327.0,93357.05639
1141135,217742.0,210541.872743
90824,176120.0,167088.063551
96180,616000.0,578365.214951
1731006,336257.0,323196.334317


In [31]:
rmse_train = mean_squared_error(ll_preds['actual'], ll_preds['pred_ll'], squared=False) 

In [32]:
rmse_val = mean_squared_error(ll_val_preds['actual'], ll_val_preds['ll_val_preds'], squared=False) 

In [33]:
rmse_scores.append({'Model':'Lasso Lars',
                    'RMSE on Train': rmse_train,
                    'RMSE on Validate': rmse_val})

In [34]:
rmse_scores

[{'Model': 'OLS Linear Regression',
  'RMSE on Train': 69181.77739064753,
  'RMSE on Validate': 69192.57636832046},
 {'Model': 'Lasso Lars',
  'RMSE on Train': 69400.93531965362,
  'RMSE on Validate': 69418.54918411924}]

## TweedieRegressor (GLM)

In [35]:
glm = TweedieRegressor(power=1, alpha=0)

In [36]:
glm.fit(X_train, y_train)

TweedieRegressor(alpha=0, power=1)

In [37]:
glm_preds = pd.DataFrame({'actual':y_train})

In [38]:
glm_preds['pred_glm'] = glm.predict(X_train)

In [39]:
glm_val_preds = pd.DataFrame({'actual':y_val})

In [40]:
glm_val_preds['glm_val_preds'] = glm.predict(X_val)

In [41]:
rmse_train = mean_squared_error(glm_preds['actual'], glm_preds['pred_glm'], squared=False) 

In [42]:
rmse_val = mean_squared_error(glm_val_preds['actual'], glm_val_preds['glm_val_preds'], squared=False) 

In [43]:
rmse_scores.append({'Model':'TweedieRegressor',
                    'RMSE on Train': rmse_train,
                    'RMSE on Validate': rmse_val})

In [44]:
rmse_scores

[{'Model': 'OLS Linear Regression',
  'RMSE on Train': 69181.77739064753,
  'RMSE on Validate': 69192.57636832046},
 {'Model': 'Lasso Lars',
  'RMSE on Train': 69400.93531965362,
  'RMSE on Validate': 69418.54918411924},
 {'Model': 'TweedieRegressor',
  'RMSE on Train': 435471.3127599482,
  'RMSE on Validate': 438631.04560702544}]

## Polynomial Regression

In [45]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree=2)

In [46]:
# fit and transform X_train_scaled
X_train_degree2 = pf.fit_transform(X_train)

In [47]:
# transform X_validate_scaled & X_test_scaled
X_validate_degree2 = pf.transform(X_val)
X_test_degree2 = pf.transform(X_test)

### Linear Regression

In [48]:
lm2 = LinearRegression(normalize=True)

In [49]:
lm2.fit(X_train_degree2, y_train)

LinearRegression(normalize=True)

In [50]:
lm2_preds = pd.DataFrame({'actual':y_train})

In [51]:
lm2_preds['pred_lm2'] = lm2.predict(X_train_degree2)

In [52]:
lm2_val_preds = pd.DataFrame({'actual':y_val})

In [54]:
lm2_val_preds['lm2_val_preds'] = lm2.predict(X_validate_degree2)

In [55]:
rmse_train = mean_squared_error(lm2_preds['actual'], lm2_preds['pred_lm2'], squared=False) 

In [56]:
rmse_val = mean_squared_error(lm2_val_preds['actual'], lm2_val_preds['lm2_val_preds'], squared=False) 

In [57]:
rmse_scores.append({'Model':'Polynomial Regression',
                    'RMSE on Train': rmse_train,
                    'RMSE on Validate': rmse_val})

In [58]:
rmse_scores

[{'Model': 'OLS Linear Regression',
  'RMSE on Train': 69181.77739064753,
  'RMSE on Validate': 69192.57636832046},
 {'Model': 'Lasso Lars',
  'RMSE on Train': 69400.93531965362,
  'RMSE on Validate': 69418.54918411924},
 {'Model': 'TweedieRegressor',
  'RMSE on Train': 435471.3127599482,
  'RMSE on Validate': 438631.04560702544},
 {'Model': 'Polynomial Regression',
  'RMSE on Train': 66661.85812112724,
  'RMSE on Validate': 1.2394215160193334e+16}]