In [1]:
# Turn off Jupyter's periodic autosave for this notebook (the output below confirms "Autosave disabled").
%autosave 0

Autosave disabled


In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from adam_wrangle import train_val_test, xy_split, scale_data
from adam_model import eval_model, train_model, train_and_evaluate_model

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [3]:
# Load the diamonds dataset; the first CSV column is used as the row index.
df = pd.read_csv('diamonds.csv', index_col=0)
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


Dropped `x`, `y`, and `z` because they have a strong linear relationship with `carat`.

In [4]:
# Drop the physical-dimension columns by name instead of by position.
# A positional slice (`iloc[:, :-3]`) silently removes `depth`, `table` and
# `price` if this cell is executed a second time, and depends on the CSV's
# column order. `errors='ignore'` keeps the cell safe to re-run.
df = df.drop(columns=['x', 'y', 'z'], errors='ignore')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
1,0.23,Ideal,E,SI2,61.5,55.0,326
2,0.21,Premium,E,SI1,59.8,61.0,326
3,0.23,Good,E,VS1,56.9,65.0,327
4,0.29,Premium,I,VS2,62.4,58.0,334
5,0.31,Good,J,SI2,63.3,58.0,335


splitting data as always

In [5]:
# Split the frame into train / validate / test sets with the project helper.
# The shapes shown below work out to roughly a 70 / 15 / 15 split.
train, val, test = train_val_test(df)
train.shape, val.shape, test.shape

((37758, 7), (8091, 7), (8091, 7))

scaling our numerical columns

In [6]:
# Numeric feature columns to rescale; the target (`price`) is left untouched.
to_scale = ['carat', 'depth', 'table']

# NOTE(review): this rebinds train/val/test, so re-running the cell passes
# already-scaled frames back into scale_data. The output suggests the values
# land in [0, 1] -- presumably min-max scaling; confirm in adam_wrangle.
train, val, test = scale_data(train, val, test, to_scale)
train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
19498,0.209979,Ideal,H,VVS2,0.508333,0.269231,8131
31230,0.022869,Ideal,E,VS2,0.527778,0.25,756
22312,0.209979,Ideal,E,VS1,0.538889,0.269231,10351
279,0.126819,Ideal,F,SI2,0.544444,0.230769,2795
6647,0.122661,Ideal,I,VVS2,0.519444,0.25,4092


Splitting into X and y dataframes. We'll leave test alone for the lesson.

In [7]:
# Separate features from the target for train and validate.
# `y_train` is later displayed as the `price` series; the exact behaviour of
# xy_split lives in adam_wrangle -- confirm there.
X_train, y_train = xy_split(train)
X_val, y_val = xy_split(val)

In [43]:
# NOTE(review): the saved output already shows dummy-encoded columns and the
# execution count is out of sequence, so this cell was presumably run after
# the get_dummies cell that follows it. On a fresh Restart & Run All it may
# show the raw categorical columns instead -- consider moving it below.
X_train.head()

Unnamed: 0,carat,depth,table,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
19498,0.209979,0.508333,0.269231,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
31230,0.022869,0.527778,0.25,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
22312,0.209979,0.538889,0.269231,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
279,0.126819,0.544444,0.230769,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6647,0.122661,0.519444,0.25,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


Getting some dummies for my categorical data and making sure the shape is the same

In [8]:
# One-hot encode the categorical columns.
X_train = pd.get_dummies(X_train)
# Encode validate separately, then force it onto the training column layout.
# Encoding each split on its own can yield missing or differently ordered
# columns when a category level is absent from one split; any level missing
# from validate is filled with 0, and any level unseen in train is dropped.
X_val = pd.get_dummies(X_val).reindex(columns=X_train.columns, fill_value=0)
X_train.shape, X_val.shape

((37758, 23), (8091, 23))

In [9]:
# Show every feature name after encoding as a plain Python list.
X_train.columns.to_list()

['carat',
 'depth',
 'table',
 'cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2']

In [47]:
# Peek at the target series (diamond price).
y_train.head()

19498     8131
31230      756
22312    10351
279       2795
6647      4092
Name: price, dtype: int64

Now we're ready for some modeling. Let's generate a baseline and evaluate it first.

In [45]:
# Candidate baseline predictions: the training target's mean and median.
y_train.mean(), y_train.median()

(3951.495312251708, 2404.0)

In [46]:
# Build the baseline table: start from the actual prices, then attach the
# constant mean and median predictions as extra columns (scalars broadcast
# down the training index).
baselines = pd.DataFrame({"y_actual": y_train}).assign(
    y_mean=y_train.mean(),
    y_median=y_train.median(),
)
baselines.head()

Unnamed: 0,y_actual,y_mean,y_median
19498,8131,3951.495312,2404.0
31230,756,3951.495312,2404.0
22312,10351,3951.495312,2404.0
279,2795,3951.495312,2404.0
6647,4092,3951.495312,2404.0


Let's evaluate each baseline (mean first, then median) with `eval_model`.

In [11]:
# Score the mean baseline against the actual training prices.
# The notebook's prose reports this value as RMSE.
eval_model(baselines.y_actual, baselines.y_mean)

4006.3752404199363

In [23]:
# Score the median baseline the same way, for comparison with the mean.
eval_model(baselines.y_actual, baselines.y_median)

4294.855563169839

We are going to evaluate our models using RMSE. Our baseline is 4,006 using the mean.

Let's create a linear regression model. You've seen this one before!

In [13]:
# Ordinary least squares with default settings.
lm = LinearRegression()

In [14]:
# Fit `lm` via the project helper, which prints train and validate RMSE (see output).
train_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 1161.38.

The validate RMSE is 1117.48.




In [15]:
# Same model through the helper that also hands back its results; judging by
# the names it returns the fitted model plus train and validation RMSE.
trained_model, train_rmse, val_rmse = train_and_evaluate_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 1161.38.

The validation RMSE is 1117.48.




[LASSO LARS](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html) is next. Let's play around with alpha.

In [16]:
# LassoLars with no regularisation (alpha=0).
ll = LassoLars(alpha=0)

# Train the LassoLars model that was just created. The original cell passed
# `lm` here, so it silently re-fitted the linear regression and `ll` was
# never trained.
train_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 1161.38.

The validate RMSE is 1117.48.




In [17]:
# LassoLars again, now with some L1 regularisation (alpha=0.5).
ll = LassoLars(alpha=0.5)

# Fit and score; the three result names are overwritten by every later model cell.
trained_model, train_rmse, val_rmse = train_and_evaluate_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 1161.66.

The validation RMSE is 1116.92.




Let's do some [polynomial regression](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html) next.

- **Warning: with a degree of 4 or higher, this will take far too long to run.**

In [18]:
# Learn the polynomial expansion from the training features only, then apply
# that same fitted transformer to both splits so their columns line up.
poly = PolynomialFeatures().fit(X_train)
X_train_s = poly.transform(X_train)
X_val_s = poly.transform(X_val)

In [19]:
# Number of generated features = length of a single transformed row.
len(X_train_s[0])

300

In [20]:
# Confirm the row count is unchanged and only the feature count grew.
X_train_s.shape

(37758, 300)

In [21]:
# Fresh LinearRegression (rebinds `lm`) fitted on the polynomial features.
lm = LinearRegression()

trained_model, train_rmse, val_rmse = train_and_evaluate_model(lm, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 768.23.

The validation RMSE is 741.47.




The [TweedieRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.TweedieRegressor.html) is the most flexible algorithm from the curriculum.

In [29]:
# TweedieRegressor with default hyperparameters, fitted on the polynomial features.
# NOTE(review): the RMSE below is only slightly better than the mean baseline,
# so the defaults look like a poor fit here -- worth tuning before drawing conclusions.
tweedie = TweedieRegressor()
trained_model, train_rmse, val_rmse = train_and_evaluate_model(tweedie, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 3875.34.

The validation RMSE is 3777.57.




In [40]:
# Display the estimator's repr to inspect its configured hyperparameters.
tweedie

Let's have some fun with the [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) next.

In [30]:
# Random forest on the original (non-polynomial) features.
# A fixed random_state makes the bootstrap samples and feature draws
# repeatable, so the reported RMSEs survive a Restart & Run All.
rfr = RandomForestRegressor(random_state=42)

trained_model, train_rmse, val_rmse = train_and_evaluate_model(rfr, X_train, y_train, X_val, y_val)

The train RMSE is 213.94.

The validation RMSE is 548.54.




In [34]:
# Random forest on the polynomial features (rebinds `rfr`).
# A fixed random_state makes the bootstrap samples and feature draws
# repeatable, so the reported RMSEs survive a Restart & Run All.
rfr = RandomForestRegressor(random_state=42)

trained_model, train_rmse, val_rmse = train_and_evaluate_model(rfr, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 215.68.

The validation RMSE is 549.06.




In [41]:
# Display the estimator's repr to inspect its configured hyperparameters.
rfr

### Instructor's 2 cents

Hyperparameters I would adjust to reduce overfitting in my model:

- Reduce max depth
- Increase min_samples_split and min_samples_leaf
- Decrease max_features
- Define max_samples at 0.50 or a similar proportion

Finally, a little [xgboost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor) to finish things off.

In [37]:
# Gradient-boosted trees with default settings on the original features.
# NOTE(review): the variable shares its name with the `xgboost` package; that is
# harmless here because only XGBRegressor is imported, but a distinct name would be clearer.
xgboost = XGBRegressor()

trained_model, train_rmse, val_rmse = train_and_evaluate_model(xgboost, X_train, y_train, X_val, y_val)

The train RMSE is 415.03.

The validation RMSE is 532.99.




In [38]:
# Same default XGBRegressor, this time fitted on the polynomial features.
xgboost = XGBRegressor()

trained_model, train_rmse, val_rmse = train_and_evaluate_model(xgboost, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 377.41.

The validation RMSE is 543.55.




In [42]:
# Display the estimator's repr to inspect its configured hyperparameters.
xgboost