In [1]:
import pandas as pd
import numpy as np
import wrangle
import prepare
import model
import matplotlib.pyplot as plt
from pydataset import data

# modeling methods
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore")

# Exercises

- Do your work for this exercise in a jupyter notebook named modeling within the regression-exercises repo. Add, commit, and push your work.

1. Select a dataset with a continuous target variable.

In [2]:
df = data('iris')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [3]:
df.isna().sum()

Sepal.Length    0
Sepal.Width     0
Petal.Length    0
Petal.Width     0
Species         0
dtype: int64

3. Work through all of the steps outlined in the lesson, from setting the baseline to selected a model and evaluating the final model on your test data.

In [4]:
df = pd.get_dummies(df)
df = df.iloc[:,:-2]
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa
1,5.1,3.5,1.4,0.2,1
2,4.9,3.0,1.4,0.2,1
3,4.7,3.2,1.3,0.2,1
4,4.6,3.1,1.5,0.2,1
5,5.0,3.6,1.4,0.2,1


In [5]:
train, val, test = prepare.train_val_test(df)
train.shape, val.shape, test.shape

((105, 5), (22, 5), (23, 5))

Scale before splitting

In [6]:
to_scale = ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']
train, val, test = wrangle.scale_data(train, val, test, to_scale)
train.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa
82,0.352941,0.181818,0.464286,0.375,0
134,0.588235,0.363636,0.714286,0.583333,0
138,0.617647,0.5,0.785714,0.708333,0
76,0.676471,0.454545,0.589286,0.541667,0
110,0.852941,0.727273,0.892857,1.0,0


In [7]:
X_train, y_train = wrangle.xysplit(train, 'Species_setosa')
X_val, y_val = wrangle.xysplit(val, 'Species_setosa')

In [8]:
X_train.shape, X_val.shape

((105, 4), (22, 4))

Same Shape!

In [9]:
X_train.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
82,0.352941,0.181818,0.464286,0.375
134,0.588235,0.363636,0.714286,0.583333
138,0.617647,0.5,0.785714,0.708333
76,0.676471,0.454545,0.589286,0.541667
110,0.852941,0.727273,0.892857,1.0


In [10]:
y_train.head()

82     0
134    0
138    0
76     0
110    0
Name: Species_setosa, dtype: uint8

In [11]:
y_train.mean(), y_train.median()
bl = pd.DataFrame({"y_actual" : y_train,
                   "y_mean" : y_train.mean(),
                   "y_median" : y_train.median()})
bl.head()

Unnamed: 0,y_actual,y_mean,y_median
82,0,0.295238,0.0
134,0,0.295238,0.0
138,0,0.295238,0.0
76,0,0.295238,0.0
110,0,0.295238,0.0


In [12]:
lm = LinearRegression()

trained_model, train_rmse, val_rmse = model.train_and_evaluate_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 0.14.

The validation RMSE is 0.15.




In [13]:
ll = LassoLars(alpha=0)

trained_model, train_rmse, val_rmse = model.train_and_evaluate_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 0.14.

The validation RMSE is 0.15.




In [14]:
ll = LassoLars(alpha=0.5)

trained_model, train_rmse, val_rmse = model.train_and_evaluate_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 0.46.

The validation RMSE is 0.57.




In [15]:
poly = PolynomialFeatures()
X_train_s = poly.fit_transform(X_train)
X_val_s = poly.transform(X_val)

In [16]:
len(X_train_s[0])

15

In [17]:
X_train_s.shape

(105, 15)

In [18]:
lm = LinearRegression()
trained_model, train_rmse, val_rmse = model.train_and_evaluate_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 0.14.

The validation RMSE is 0.15.




In [19]:
tweedie = TweedieRegressor()

trained_model, train_rmse, val_rmse = model.train_and_evaluate_model(tweedie, X_train, y_train, X_val, y_val)

The train RMSE is 0.39.

The validation RMSE is 0.48.




In [20]:
rfr = RandomForestRegressor()

trained_model, train_rmse, val_rmse = model.train_and_evaluate_model(rfr, X_train, y_train, X_val, y_val)

The train RMSE is 0.00.

The validation RMSE is 0.00.




In [21]:
xgboost = XGBRegressor()

trained_model, train_rmse, val_rmse = model.train_and_evaluate_model(xgboost, X_train, y_train, X_val, y_val)

The train RMSE is 0.00.

The validation RMSE is 0.00.





## Notes

1
 Regularization = "Regularizations are techniques used to reduce the error by fitting a function appropriately on the given training set and avoid overfitting." Towards Data Science

2
 TweedieRegressor: Requires sklearn v0.23 or greater installed. To update, run in terminal: conda install scikit-learn=0.23