# Models

In [4]:
'''
This isn't strictly needed. However it solves this annoying pandas error:

/opt/homebrew/Caskroom/miniforge/base/envs/supervised/lib/python3.9/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index
  
The problem is solved with xgboost 1.6 but I don't want to use pip in this case and the conda package is currently 1.5.1  
'''

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb


from sklearn import preprocessing 
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [6]:
df = pd.read_pickle("../data/diamonds.pkl")

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=314)

NameError: name 'X' is not defined

### Linear Regression for a baseline
```
72.4 ms ± 5.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

RMSE: 16057.10618458078
R2  : 0.7728174575988863
```

In [None]:
lin_reg_model = LinearRegression()
model = lin_reg_model.fit(X_train, y_train)
y_hat= model.predict(X_test)

print("RMSE: {}".format(np.sqrt(mean_squared_error((y_test),(y_hat)))))
print("R2  : {}".format(np.sqrt(r2_score((y_test),(y_hat)))))

In [None]:
# leave this commented unless you have a few extra minutes to spare

# rr  = RandomForestRegressor()
# rr.fit(X_train,y_train)
# y_pred = rr.predict(X_test)

# print("RMSE: {}".format(np.sqrt(mean_squared_error((y_test),(y_pred)))))
# print("R2  : {}".format(np.sqrt(r2_score((y_test),(y_pred)))))

In [None]:
print(f"""
Random Forest Regression has a {round((16057.1061-9536.7866)/16057.1061 *100 , 1)} % improvement over baseline in RMSE 
and a {round((.9262440179416279-.7728174575988863)/.7728174575988863 *100 , 1)} % improvement in R2
      
... but takes about 1,000x longer to run.""")

# Check these numbers because it's 3AM.

In [None]:
### Decision Trees - marginally better than LR. Fast but max out at depth of 3.

In [None]:
# 

tree=DecisionTreeRegressor(max_depth=3)
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

print("RMSE: {}".format(np.sqrt(mean_squared_error((y_test),(y_pred)))))
print("R2  : {}".format(np.sqrt(r2_score((y_test),(y_pred)))))

In [2]:
print(f"""
Decision Trees has a {round((16057.1061-13921.2573)/16057.1061 *100 , 1)} % improvement over baseline in RMSE 
and a {round((.9262440179416279-.8350244922620658)/.8350244922620658 *100 , 1)} % improvement in R2
""")


Decision Trees has a 13.3 % improvement over baseline in RMSE 
and a 10.9 % improvement in R2



In [None]:
xgb_classifier = xgb.XGBClassifier()

In [None]:
xgb_r = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators = 100, seed = 123,)
xgb_r.fit(X_train, y_train)

In [None]:
y_pred = xgb_r.predict(X_test)

In [None]:
print("RMSE: {}".format(np.sqrt(mean_squared_error((y_test),(y_pred)))))
print("R2  : {}".format(np.sqrt(r2_score((y_test),(y_pred)))))

In [None]:
### XGBoost regression is strong for two reasons. Good performance and so much faster than Random Forrest Regression. And this is for a basically untuned model.

In [None]:
# params = {  'max_depth': [2,3,4],
#             'learning_rate': [0.1, 0.2, 0.3, 0.4],
#             'n_estimators': [100],
#             'colsample_bytree': [0.3, 0.7],
#             'subsample':}

# clf = GridSearchCV(estimator=xgbr, 
#                    param_grid=params,
#                    scoring='neg_mean_squared_error', 
#                    verbose=1)
# clf.fit(X, y)

# print("Best parameters:", clf.best_params_)
# print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

Yeah, I don't get that.
```
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000}
Lowest RMSE:  25154.080799724477
```

In [None]:
## Rational Quadratic Kernel seems a good contender

In [None]:
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process import GaussianProcessClassifier

In [None]:
# Definition of the Rational Quadratic Kernel
kernel = 1.0 * RationalQuadratic (length_scale=1.0, alpha=0.1) 

In [None]:
# Train Gaussian Process
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)

In [None]:
kernel = RationalQuadratic(length_scale=1.0, alpha=1.5)
gpc = GaussianProcessClassifier(kernel=kernel,
        random_state=0).fit(X, y)
gpc.score(X, y)

gpc.predict_proba(X[:2,:])