In [1]:
%autosave 0

Autosave disabled


In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from adam_wrangle import train_val_test, xy_split, scale_data
from adam_model import eval_model, train_model


from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [3]:
# Read the diamonds dataset; the first CSV column holds the row labels,
# so it is used as the DataFrame index instead of a data column.
df = pd.read_csv(
    'diamonds.csv',
    index_col=0,
)
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# Drop the physical-dimension columns (x, y, z) by name rather than by position.
# A positional slice (`df.iloc[:, :-3]`) removes three *more* columns every time
# the cell is re-run, silently eating `depth`, `table` and the `price` target.
# `errors='ignore'` makes a re-run a no-op once the columns are already gone.
df = df.drop(columns=['x', 'y', 'z'], errors='ignore')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
1,0.23,Ideal,E,SI2,61.5,55.0,326
2,0.21,Premium,E,SI1,59.8,61.0,326
3,0.23,Good,E,VS1,56.9,65.0,327
4,0.29,Premium,I,VS2,62.4,58.0,334
5,0.31,Good,J,SI2,63.3,58.0,335


In [5]:
# Split the full frame into train / validate / test subsets with the project helper.
# NOTE(review): whether the split is seeded is decided inside adam_wrangle — confirm.
train, val, test = train_val_test(df)
tuple(subset.shape for subset in (train, val, test))

((37758, 7), (8091, 7), (8091, 7))

In [6]:
# Continuous feature columns handed to the project scaler.
# NOTE(review): the preview values fall in [0, 1], which looks like min-max
# scaling — confirm against adam_wrangle.scale_data.
to_scale = [
    'carat',
    'depth',
    'table',
]
train, val, test = scale_data(train, val, test, to_scale)
train.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price
19498,0.209979,Ideal,H,VVS2,0.508333,0.269231,8131
31230,0.022869,Ideal,E,VS2,0.527778,0.25,756
22312,0.209979,Ideal,E,VS1,0.538889,0.269231,10351
279,0.126819,Ideal,F,SI2,0.544444,0.230769,2795
6647,0.122661,Ideal,I,VVS2,0.519444,0.25,4092


In [7]:
# Separate the feature frame (X) from the target (y) for the train and validate subsets.
(X_train, y_train), (X_val, y_val) = map(xy_split, (train, val))

In [8]:
# One-hot encode the categorical columns (cut, color, clarity).
X_train = pd.get_dummies(X_train)
# Encode validate separately, then force it onto the exact column set and order
# learned from train. Without this, a category level missing from (or only
# present in) the validate split would yield a different number/order of
# columns and break or silently corrupt model.predict. Levels absent from
# validate are filled with False; levels unseen in train are dropped.
X_val = pd.get_dummies(X_val).reindex(columns=X_train.columns, fill_value=False)
X_train.shape, X_val.shape

((37758, 23), (8091, 23))

In [9]:
# Preview the encoded training features to sanity-check the dummy columns.
X_train.head(n=5)

Unnamed: 0,carat,depth,table,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
19498,0.209979,0.508333,0.269231,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
31230,0.022869,0.527778,0.25,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
22312,0.209979,0.538889,0.269231,False,False,True,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
279,0.126819,0.544444,0.230769,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
6647,0.122661,0.519444,0.25,False,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True


Now we're ready for some modeling. Let's generate a baseline and evaluate it first.

In [10]:
# Candidate constant predictions for the baseline: the mean and the median of the training target.
(
    y_train.mean(),
    y_train.median(),
)

(3951.495312251708, 2404.0)

In [11]:
# Baseline table: the actual training target next to two constant predictions
# (the training mean and the training median), broadcast to every row.
baseline = pd.DataFrame({'y_actual': y_train}).assign(
    y_mean=y_train.mean(),
    y_median=y_train.median(),
)
baseline.head(n=5)

Unnamed: 0,y_actual,y_mean,y_median
19498,8131,3951.495312,2404.0
31230,756,3951.495312,2404.0
22312,10351,3951.495312,2404.0
279,2795,3951.495312,2404.0
6647,4092,3951.495312,2404.0


In [12]:
# Score the mean-prediction baseline with the project metric helper.
eval_model(baseline['y_actual'], baseline['y_mean'])

4006.3752404199363

In [13]:
# Score the median-prediction baseline with the project metric helper.
eval_model(baseline['y_actual'], baseline['y_median'])

4294.855563169839

We are going to evaluate our models using RMSE. Our baseline RMSE is about 4006, using the mean as the prediction (the median does worse, at about 4295).

Let's create a linear regression model. You've seen this one before!

In [16]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
lm = LinearRegression()


In [17]:
# Fit the linear model on the training data; the helper prints the train and validate RMSE.
train_model(
    lm,
    X_train, y_train,
    X_val, y_val,
)

The train RMSE is 1161.3716938344942.
The validate RMSE is 1117.4208997997828.


[LASSO LARS](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html) is next. Let's play around with alpha.

In [18]:
# alpha=0 switches the L1 penalty off, so this fit is expected to land very close
# to the plain LinearRegression result (a sanity check rather than a new model).
ll = LassoLars(alpha=0)
train_model(
    ll,
    X_train, y_train,
    X_val, y_val,
)

The train RMSE is 1161.361922018855.
The validate RMSE is 1117.4413546284518.


In [21]:
# Same model with a non-zero L1 penalty (alpha=0.5) to see how regularisation moves the RMSE.
ll = LassoLars(alpha=0.5)
train_model(
    ll,
    X_train, y_train,
    X_val, y_val,
)

The train RMSE is 1161.6605760513964.
The validate RMSE is 1116.9165070915144.


Let's do some [polynomial regression](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html) next.

In [29]:
# Learn the polynomial expansion on the training features only, then apply the
# same fitted transformer to both subsets so validate sees no refitting.
poly = PolynomialFeatures().fit(X_train)
X_train_s = poly.transform(X_train)
X_val_s = poly.transform(X_val)

In [30]:
# Preview the first rows of the polynomial-expanded training matrix.
# (The cell previously referenced `X_train_second`, a name that is never
# defined, so it raised NameError on a fresh Restart & Run All.)
X_train_s[:5]

In [31]:
# Number of columns produced by the polynomial expansion.
X_train_s.shape[1]

300

In [32]:
# Shape of the un-expanded training features, for comparison with the polynomial feature count.
X_train.shape

(37758, 23)

In [36]:
# Fresh linear regression fitted on the polynomial-expanded features
# (note: this rebinds `lm`, replacing the earlier raw-feature model).
lm = LinearRegression()
train_model(
    lm,
    X_train_s, y_train,
    X_val_s, y_val,
)


The train RMSE is 767.8790852724211.
The validate RMSE is 741.0877677128647.


The [TweedieRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.TweedieRegressor.html) is the most flexible algorithm from the curriculum.

In [37]:
# TweedieRegressor with all-default hyperparameters, trained on the polynomial features.
# NOTE(review): the defaults are untuned here; power/alpha/link are the obvious knobs to explore.
tweedie = TweedieRegressor()
train_model(
    tweedie,
    X_train_s, y_train,
    X_val_s, y_val,
)


The train RMSE is 3875.3432845540615.
The validate RMSE is 3777.574842188569.


Let's have some fun with the [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) next.

In [38]:
# Random forest on the raw (one-hot + scaled) features.
# A fixed random_state pins the bootstrap sampling and feature selection so the
# reported RMSE is reproducible across Restart & Run All; without it every run
# prints slightly different numbers.
rf = RandomForestRegressor(random_state=42)
train_model(rf, X_train, y_train, X_val, y_val)


The train RMSE is 213.87536522661136.
The validate RMSE is 548.277137498848.


In [39]:
# Random forest on the polynomial-expanded features (rebinds `rf`).
# A fixed random_state pins the bootstrap sampling and feature selection so the
# reported RMSE is reproducible across Restart & Run All.
rf = RandomForestRegressor(random_state=42)
train_model(rf, X_train_s, y_train, X_val_s, y_val)


The train RMSE is 215.27900186348626.
The validate RMSE is 547.41596920199.


Hyperparameters I would adjust to reduce overfitting in my model:
- Reduce `max_depth`
- Increase `min_samples_split` and `min_samples_leaf`
- Decrease `max_features`
- Set `max_samples` to 0.50 or a similar proportion
  

Finally, a little [xgboost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor) to finish things off.

In [40]:
# Gradient-boosted trees (XGBoost) with default hyperparameters on the raw features.
xgbr = XGBRegressor()
train_model(
    xgbr,
    X_train, y_train,
    X_val, y_val,
)


The train RMSE is 415.03430616923123.
The validate RMSE is 532.9928630196524.
