# Models

In [1]:
'''
This isn't strictly needed. However it solves this annoying pandas error:

/opt/homebrew/Caskroom/miniforge/base/envs/supervised/lib/python3.9/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index
  
The problem is solved with xgboost 1.6 but I don't want to use pip in this case and the conda package is currently 1.5.1  
'''

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb


from sklearn import preprocessing 
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [3]:
df = pd.read_pickle("../data/diamonds.pkl")

## Preparing the model -- 1

In [4]:
categoricals = df.select_dtypes(exclude=np.number).columns.to_list()
#categoricals

#### Is it faster to use the pandas `get_dummies` or scikit's `label_encoder`?

<h4>
    
```
%timeit pd.get_dummies(df,df.columns[df.dtypes == 'object'])

212 ms ± 2.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```
vs
```
%timeit label_encoder = preprocessing.LabelEncoder()

for col in categoricals:
    df[(col)]=label_encoder.fit_transform(df[(col)])
    
71.8 ns ± 0.0599 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
```
#### Pandas takes about 3 million times as long to do the operation. Wow! That's a _huge_ difference! 

In [5]:
label_encoder = preprocessing.LabelEncoder()

for col in categoricals:
    df[(col)]=label_encoder.fit_transform(df[(col)])

df.head(4)

Unnamed: 0,shape,size,color,fancy_color_dominant_color,fancy_color_secondary_color,fancy_color_overtone,fancy_color_intensity,clarity,cut,symmetry,...,meas_depth,girdle_min,girdle_max,culet_size,culet_condition,fluor_color,fluor_intensity,lab,total_sales_price,eye_clean
0,10,0.09,1,12,10,8,9,10,0,4,...,1.79,0,0,3,3,5,2,2,200,4
1,10,0.09,1,12,10,8,9,10,5,4,...,1.78,1,1,3,3,5,2,2,200,4
2,10,0.09,1,12,10,8,9,10,0,4,...,1.77,4,0,8,3,5,2,2,200,4
3,10,0.09,1,12,10,8,9,10,0,4,...,1.78,0,1,8,3,5,2,2,200,4


## Preparing the model -- 2

In [6]:
X = df.drop(columns=['total_sales_price'],axis=1)
y = df['total_sales_price']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=314)

### Linear Regression for a baseline
```
72.4 ms ± 5.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

RMSE: 16057.10618458078
R2  : 0.7728174575988863
```

In [8]:
lin_reg_model = LinearRegression()
model = lin_reg_model.fit(X_train, y_train)
y_hat= model.predict(X_test)

print("RMSE: {}".format(np.sqrt(mean_squared_error((y_test),(y_hat)))))
print("R2  : {}".format(np.sqrt(r2_score((y_test),(y_hat)))))

RMSE: 16057.10618458078
R2  : 0.7728174575988863


In [9]:
# leave this commented unless you have a few extra minutes to spare

# rr  = RandomForestRegressor()
# rr.fit(X_train,y_train)
# y_pred = rr.predict(X_test)

# print("RMSE: {}".format(np.sqrt(mean_squared_error((y_test),(y_pred)))))
# print("R2  : {}".format(np.sqrt(r2_score((y_test),(y_pred)))))

In [10]:
print(f"""
Random Forest Regression has a {round((16057.1061-9536.7866)/16057.1061 *100 , 1)} % improvement over baseline in RMSE 
and a {round((.9262440179416279-.7728174575988863)/.7728174575988863 *100 , 1)} % improvement in R2
      
... but takes about 1,000x longer to run.""")

# Check these numbers because it's 3AM.


Random Forest Regression has a 40.6 % improvement over baseline in RMSE 
and a 19.9 % improvement in R2
      
... but takes about 1,000x longer to run.


### Decision Trees - marginally better than LR. Fast but max out at depth of 3.

In [11]:
# 

tree=DecisionTreeRegressor(max_depth=3)
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

print("RMSE: {}".format(np.sqrt(mean_squared_error((y_test),(y_pred)))))
print("R2  : {}".format(np.sqrt(r2_score((y_test),(y_pred)))))

RMSE: 13921.257325084407
R2  : 0.8350244922620658


In [12]:
print(f"""
Decision Trees has a {round((16057.1061-13921.2573)/16057.1061 *100 , 1)} % improvement over baseline in RMSE 
and a {round((.9262440179416279-.8350244922620658)/.8350244922620658 *100 , 1)} % improvement in R2
""")


Decision Trees has a 13.3 % improvement over baseline in RMSE 
and a 10.9 % improvement in R2



In [13]:
xgb_classifier = xgb.XGBClassifier()

In [14]:
xgb_r = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators = 100, seed = 123,)
xgb_r.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=10,
             num_parallel_tree=1, predictor='auto', random_state=123,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [15]:
y_pred = xgb_r.predict(X_test)

In [16]:
print("RMSE: {}".format(np.sqrt(mean_squared_error((y_test),(y_pred)))))
print("R2  : {}".format(np.sqrt(r2_score((y_test),(y_pred)))))

RMSE: 9098.612575706364
R2  : 0.9331040969124664


### XGBoost regression is strong for two reasons. Good performance and so much faster than Random Forrest Regression. And this is for a basically untuned model.

In [17]:
# params = {  'max_depth': [2,3,4],
#             'learning_rate': [0.1, 0.2, 0.3, 0.4],
#             'n_estimators': [100],
#             'colsample_bytree': [0.3, 0.7],
#             'subsample':}

# clf = GridSearchCV(estimator=xgbr, 
#                    param_grid=params,
#                    scoring='neg_mean_squared_error', 
#                    verbose=1)
# clf.fit(X, y)

# print("Best parameters:", clf.best_params_)
# print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

Yeah, I don't get that.
```
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000}
Lowest RMSE:  25154.080799724477
```

## Rational Quadratic Kernel seems a good contender

In [18]:
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process import GaussianProcessClassifier

In [19]:
# Definition of the Rational Quadratic Kernel
kernel = 1.0 * RationalQuadratic (length_scale=1.0, alpha=0.1) 

In [20]:
# # Train Gaussian Process
# gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)

In [21]:
# kernel = RationalQuadratic(length_scale=1.0, alpha=1.5)
# gpc = GaussianProcessClassifier(kernel=kernel,
#         random_state=0).fit(X, y)
# gpc.score(X, y)

# gpc.predict_proba(X[:2,:])