<div class="alert alert-block alert-warning">

# Modeling Exercises

In [1]:
# Imports

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import time

#from scipy import stats
#from math import sqrt

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

from wrangle import zillow_pipeline, split_train_val_test, scale_train_val_test, xy_split

from explore import plot_variable_pairs, plot_categorical_and_continuous_vars

from evaluate import create_model, dataframe_model, plot_residuals, regression_errors, baseline_mean_errors, better_than_baseline

from model import train_model, estimate_t_time

from features import make_features

import os

<div class="alert alert-block alert-success">

Do your work for this exercise in a jupyter notebook named modeling within the regression-exercises repo. Add, commit, and push your work.


<div class="alert alert-block alert-info">

1. Select a dataset with a continuous target variable.


In [2]:
df = zillow_pipeline()
df.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,taxamount,fips
4,4,2.0,3633,296425,2005,6941.39,6037
6,3,4.0,1620,847770,2011,10244.94,6037
7,3,2.0,2077,646760,1926,7924.68,6037
18,3,1.0,1244,169471,1950,2532.88,6037
19,3,2.0,1300,233266,1950,3110.99,6037


<div class="alert alert-block alert-info">

2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.


In [4]:
train, val, test = split_train_val_test(df)

train.shape, val.shape, test.shape

((1494436, 7), (320236, 7), (320237, 7))

In [5]:
train, val, test =scale_train_val_test(train, val, test)

<div class="alert alert-block alert-info">

3. Work through all of the steps outlined in the lesson, from setting the baseline to selecting a model and evaluating the final model on your test data.

In [6]:
# Separate the features (X) from the target (y) for the train and validation samples.
# xy_split comes from the project's wrangle module; tax_value no longer appears in
# X_train afterwards, so it is presumably the target — confirm in wrangle.py.
# NOTE(review): taxamount stays in the feature set. It is likely computed from the
# assessed value, which would leak the target into the features — confirm before
# trusting the RMSEs reported further down.
X_train, y_train = xy_split(train)
X_val, y_val = xy_split(val)

In [7]:
# One-hot encode the county code (fips) into county_<code> indicator columns.
X_train = pd.get_dummies(X_train, columns=['fips'], prefix='county')
# Encode validation the same way, then force it onto the training columns: if a
# county were missing from one of the splits, independent get_dummies calls would
# produce different column sets and the fitted models could not score X_val.
# Indicator columns absent from validation are filled with False.
X_val = (
    pd.get_dummies(X_val, columns=['fips'], prefix='county')
    .reindex(columns=X_train.columns, fill_value=False)
)
X_train.shape, X_val.shape

((1494436, 8), (320236, 8))

In [8]:
X_train.head()

Unnamed: 0,bedrooms,bathrooms,area,year_built,taxamount,county_6037,county_6059,county_6111
177512,3,2.0,0.00131,0.693023,0.006076,True,False,False
1273481,4,4.0,0.002767,0.962791,0.011938,True,False,False
1374916,4,3.0,0.002647,0.865116,0.005639,True,False,False
1754843,4,2.0,0.001553,0.739535,0.002749,False,True,False
435736,5,3.0,0.002611,0.693023,0.006787,True,False,False


In [9]:
y_train.mean(), y_train.median()

(460730.7356527814, 328884.0)

In [10]:
# Line up the actual target values with the two constant baseline predictions
# (training mean and training median) so each can be scored with the same metric.
baselines = y_train.to_frame(name='y_actual').assign(
    y_mean=y_train.mean(),
    y_median=y_train.median(),
)

baselines.head()

Unnamed: 0,y_actual,y_mean,y_median
177512,414000,460730.735653,328884.0
1273481,896029,460730.735653,328884.0
1374916,365817,460730.735653,328884.0
1754843,191464,460730.735653,328884.0
435736,376667,460730.735653,328884.0


In [12]:
# RMSE of the mean baseline. np.sqrt is used because a bare `sqrt` is never
# imported in this notebook (the `from math import sqrt` line is commented out),
# so the original call fails with NameError on a fresh kernel.
np.sqrt(mean_squared_error(baselines.y_actual, baselines.y_mean))

663705.714944513

In [13]:
# RMSE of the median baseline. np.sqrt is used because a bare `sqrt` is never
# imported in this notebook (the `from math import sqrt` line is commented out),
# so the original call fails with NameError on a fresh kernel.
np.sqrt(mean_squared_error(baselines.y_actual, baselines.y_median))

676674.839012285

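We will evaluate our models using RMSE. The baseline RMSE is 663705.714944513, obtained by predicting the mean of `y_train` for every home; the median baseline is slightly worse at 676674.839012285.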

In [16]:
lm = LinearRegression()

In [17]:
train_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 86600.38682625978.
The validate RMSE is 90958.15007374868.


[LASSO LARS](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html) is next. Let's play around with alpha.

In [27]:
# alpha=0 switches the L1 penalty off, which the scikit-learn docs describe as
# equivalent to ordinary least squares — so this run is expected to reproduce the
# LinearRegression scores. Increase alpha to see the effect of regularisation.
ll = LassoLars(alpha=0)

train_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 86600.38514962741.
The validate RMSE is 90958.14289195553.


Let's do some [polynomial regression](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html) next.

In [28]:
# Expand the features with PolynomialFeatures at its default settings
# (degree 2 per the scikit-learn docs — confirm if the installed version differs).
# The transformer is fitted on the training features only and then applied,
# unchanged, to the validation features.
poly = PolynomialFeatures()
X_train_s = poly.fit_transform(X_train)
X_val_s = poly.transform(X_val)

In [29]:
lm = LinearRegression()

train_model(lm, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 83255.2027147614.
The validate RMSE is 87801.11598442799.


The [TweedieRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.TweedieRegressor.html) is the most flexible algorithm from the curriculum.

In [30]:
# TweedieRegressor with every hyperparameter left at its default, trained on the
# polynomial features.
# NOTE(review): the RMSEs this cell reports are far worse than every other model
# in the notebook; presumably the default power/alpha/link do not suit this
# target — try tuning them before drawing conclusions about the algorithm.
tweedie = TweedieRegressor()

train_model(tweedie, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 530425.9403463455.
The validate RMSE is 554858.4469507411.


Let's have some fun with the [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) next.

First, on the original `X_train`.

In [31]:
rf = RandomForestRegressor()

train_model(rf, X_train, y_train, X_val, y_val)

The train RMSE is 29227.908420992466.
The validate RMSE is 90497.76511628271.


In [None]:
# Same random forest, this time on the polynomial features.
# NOTE(review): this cell has no execution count, so it does not appear to have
# been run to completion; presumably the fit is too slow on the full data.
# Note that it also rebinds `rf`, replacing the forest fitted on X_train.
rf = RandomForestRegressor()

train_model(rf, X_train_s, y_train, X_val_s, y_val)

!pip install xgboost

Finally, a little [xgboost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor) to finish things off.

In [32]:
xgbr = XGBRegressor()

train_model(xgbr, X_train, y_train, X_val, y_val)

The train RMSE is 61071.651161159985.
The validate RMSE is 89336.06681075679.


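Now on the polynomial features, `X_train_s`. Fitting on them is slow, so first estimate the full training time from a small subsample.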

In [83]:
def estimate_full_training_time(model, X_train, y_train, X_val, y_val, subsample_fraction=0.1, random_state=None):
    """Estimate how long `train_model` would take on the full data set.

    `train_model` is run once on a random subsample of the training and
    validation data, the call is timed, and the elapsed time is extrapolated
    linearly by dividing it by `subsample_fraction`.

    Parameters
    ----------
    model : object
        Passed through to `train_model` unchanged. `train_model` presumably
        fits it in place, so the object is left trained on the subsample only.
    X_train, y_train : DataFrame/array-like and Series/array-like
        Training features and target. Non-pandas inputs are wrapped in a
        DataFrame / Series so rows can be selected by position.
    X_val, y_val : DataFrame/array-like and Series/array-like
        Validation features and target, handled the same way.
    subsample_fraction : float, default 0.1
        Fraction of rows to sample, in the interval (0, 1]. The same fraction
        is applied to the training and the validation data (at least one row
        is always drawn from each).
    random_state : int or None, default None
        Seed for the row sampling. None draws a fresh random sample each call.

    Returns
    -------
    str
        Estimated full run time formatted as "HH:MM" (leftover seconds are
        dropped, so anything under a minute shows as "00:00").

    Raises
    ------
    ValueError
        If `subsample_fraction` is outside (0, 1], if features and target
        differ in length, or if either data set is empty.

    Notes
    -----
    The extrapolation assumes run time grows linearly with the number of
    rows, which is only a rough approximation for most estimators.
    """

    def format_time(seconds):
        # Break the duration into whole hours and minutes; seconds are discarded.
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return f"{int(hours):02d}:{int(minutes):02d}"

    if not 0 < subsample_fraction <= 1:
        raise ValueError("subsample_fraction must be in the interval (0, 1]")

    # Wrap array inputs (e.g. the output of PolynomialFeatures) so that .iloc works.
    if not isinstance(X_train, pd.DataFrame):
        X_train = pd.DataFrame(X_train)
    if not isinstance(y_train, pd.Series):
        y_train = pd.Series(y_train)
    if not isinstance(X_val, pd.DataFrame):
        X_val = pd.DataFrame(X_val)
    if not isinstance(y_val, pd.Series):
        y_val = pd.Series(y_val)

    if len(X_train) != len(y_train) or len(X_val) != len(y_val):
        raise ValueError("features and target must have the same number of rows")
    if len(X_train) == 0 or len(X_val) == 0:
        raise ValueError("training and validation data must not be empty")

    # Size each subsample from its own data set. Sizing the validation sample
    # from the training length would request more rows than X_val holds
    # whenever the training sample outgrows the validation set.
    train_size = max(1, int(len(X_train) * subsample_fraction))
    val_size = max(1, int(len(X_val) * subsample_fraction))

    # Draw row positions without replacement; features and target share the
    # same positions so they stay paired even when their indexes differ.
    rng = np.random.default_rng(random_state)
    train_rows = rng.choice(len(X_train), size=train_size, replace=False)
    val_rows = rng.choice(len(X_val), size=val_size, replace=False)

    X_subset_train = X_train.iloc[train_rows]
    y_subset_train = y_train.iloc[train_rows]
    X_subset_val = X_val.iloc[val_rows]
    y_subset_val = y_val.iloc[val_rows]

    # perf_counter is a monotonic clock, so the measurement cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    train_model(model, X_subset_train, y_subset_train, X_subset_val, y_subset_val)
    elapsed_time_subset = time.perf_counter() - start_time

    # Linear extrapolation from the subsample to the full data.
    estimated_full_time = elapsed_time_subset / subsample_fraction

    return format_time(estimated_full_time)

In [84]:
# Time a random-forest fit on a 0.5% subsample of the polynomial features and
# extrapolate to the full data; the result is an "HH:MM" string.
estimate_full_training_time(rf, X_train_s, y_train, X_val_s, y_val, subsample_fraction=0.005)

The train RMSE is 83816.13514616969.
The validate RMSE is 96208.43081623212.


'00:49'