In [2]:
import pandas as pd 
import numpy as np
from sklearn.datasets import load_diabetes 
from sklearn.model_selection import cross_val_score 

from xgboost import (XGBRegressor, XGBClassifier, 
                     XGBRFRegressor, XGBRFClassifier)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso, Ridge 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as MSE 


In [5]:
# Load the diabetes regression data as (features, target). X and y are
# module-level names that regression_model and grid_search read directly.
X, y = load_diabetes(return_X_y=True)
# Shared 5-fold splitter, shuffled with a fixed seed; every cross-validation
# call in this notebook passes this same object as `cv`.
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

def regression_model(model):
  """Return the mean cross-validated RMSE of `model`.

  The estimator is scored on the module-level X, y with the shared `kfold`
  splitter, so the result depends on whatever X and y are currently bound to.

  Args:
    model: an unfitted scikit-learn compatible regressor.

  Returns:
    The average, over the folds, of each fold's root mean squared error.
  """
  neg_mse_per_fold = cross_val_score(model, X, y,
                                     cv=kfold,
                                     scoring='neg_mean_squared_error',
                                     n_jobs=-1)
  # The scorer reports negated MSE; flip the sign before taking the root.
  rmse_per_fold = (-neg_mse_per_fold) ** .5
  mean_rmse = rmse_per_fold.mean()
  return mean_rmse

# Mean cross-validated RMSE of XGBRegressor with the gblinear booster.
regression_model(XGBRegressor(booster='gblinear'))


55.49822381666432

In [6]:
# Baseline: LinearRegression, scored with the same regression_model helper.
regression_model(LinearRegression())

55.50936875436024

In [7]:
# Baseline: Lasso with its default settings, scored the same way.
regression_model(Lasso())


62.64904114426351

In [8]:
# Baseline: Ridge with its default settings, scored the same way.
regression_model(Ridge())


58.835292374356676

In [9]:
# XGBRegressor with the gbtree booster, for comparison with gblinear.
regression_model(XGBRegressor(booster="gbtree"))


65.9125519300286

In [11]:
def grid_search(params, reg=None):
  """Grid-search `params` for `reg` and print the best setting and its score.

  Fits a GridSearchCV on the module-level X, y using the shared `kfold`
  splitter and negative-MSE scoring.

  Args:
    params: parameter grid handed to GridSearchCV (name -> candidate values).
    reg: estimator to tune. When None (the default) a fresh
      XGBRegressor(booster='gblinear') is created for this call, rather than
      one instance being built at definition time and shared by every call.

  Returns:
    None. The best parameters and the best score are printed.
  """
  if reg is None:
    reg = XGBRegressor(booster='gblinear')
  grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', 
                          cv=kfold)
  grid_reg.fit(X, y)
  best_params = grid_reg.best_params_
  print(f"Best params: {best_params}")
  # best_score_ is the mean negated MSE of the winning candidate, so this is
  # sqrt(mean MSE). regression_model instead averages per-fold RMSEs, which
  # is a slightly different quantity; compare the two numbers with care.
  best_score = np.sqrt(-grid_reg.best_score_)
  print(f"Best score: {best_score}")


In [12]:
# Search candidate reg_alpha values for grid_search's default gblinear regressor.
grid_search(params={'reg_alpha':[0.001, 0.01, 0.1, 0.5, 1, 5]})


Best params: {'reg_alpha': 0.01}
Best score: 55.47298609263182


In [13]:
# Search the same candidate values for reg_lambda on the default gblinear regressor.
grid_search(params={'reg_lambda': [0.001, 0.01, 0.1, 0.5, 1, 5]})


Best params: {'reg_lambda': 0.001}
Best score: 56.17170015281348


In [14]:
# Single-candidate grid: only reports the score for feature_selector='shuffle'.
grid_search(params={'feature_selector': ['shuffle']})

Best params: {'feature_selector': 'shuffle'}
Best score: 55.54499520553863


In [15]:
# Compare the random, greedy and thrifty feature selectors, each combined
# with updater='coord_descent'.
grid_search(params={'feature_selector':['random', 'greedy', 'thrifty'], 
                    'updater':['coord_descent']})


Best params: {'feature_selector': 'thrifty', 'updater': 'coord_descent'}
Best score: 55.488143951136536


In [16]:
# Search top_k jointly with the greedy and thrifty selectors under
# updater='coord_descent' (2 x 1 x 4 = 8 candidate combinations).
grid_search(params={'feature_selector': ['greedy', 'thrifty'], 
                   'updater': ['coord_descent'], 
                   'top_k': [3, 5, 7, 9]})


Best params: {'feature_selector': 'thrifty', 'top_k': 3, 'updater': 'coord_descent'}
Best score: 55.47871836076556


In [17]:
# Construct a synthetic one-feature dataset: X holds 1..99 as a column and
# each target is its X value scaled by a factor drawn uniformly from
# [-0.2, 0.2).
# NOTE: this rebinds the module-level X and y that regression_model reads.
np.random.seed(2)
X = np.arange(1, 100).reshape(-1, 1)
# One draw per row, taken in row order from the seeded global generator.
y = X * np.random.uniform(-0.2, 0.2, size=X.shape)
# Score the gblinear booster on the synthetic X, y built just above.
regression_model(XGBRegressor(booster='gblinear', 
                              objective='reg:squarederror'))


6.214946302686011

In [18]:
# gbtree booster on the current X, y (the synthetic data, assuming the cells
# are run in order).
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror'))


9.372359516507444

In [19]:
# LinearRegression baseline on the current X, y (the synthetic data, assuming
# the cells are run in order).
regression_model(LinearRegression())


6.214962315808842

In [20]:
# Rebind the module-level X, y to the diabetes data; the synthetic-data cell
# earlier in the notebook had replaced them.
X, y = load_diabetes(return_X_y=True)
# Mean cross-validated RMSE of XGBRegressor with the dart booster.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror'))


65.91255196051148

In [21]:
# Read the census table from a CSV located in the working directory.
df_census = pd.read_csv('census_cleaned.csv')
# Every column except the last is used as a feature ...
X_census = df_census.iloc[:, :-1]
# ... and the last column is the classification target.
y_census = df_census.iloc[:, -1]

def classification_model(model):
  """Return the mean cross-validated accuracy of `model` on the census data.

  Reads the module-level X_census and y_census and splits them with the
  shared `kfold` object.

  Args:
    model: an unfitted scikit-learn compatible classifier.

  Returns:
    The accuracy averaged over the folds.
  """
  accuracy_per_fold = cross_val_score(model, X_census, y_census,
                                      cv=kfold, scoring='accuracy')
  mean_accuracy = accuracy_per_fold.mean()
  return mean_accuracy

# Mean cross-validated accuracy of XGBClassifier with the gbtree booster.
classification_model(XGBClassifier(booster='gbtree'))


0.8703664915491263

In [22]:
# Same scoring with the dart booster and no other dart settings changed.
classification_model(XGBClassifier(booster='dart'))


0.8703664915491263

In [23]:
# Same scoring with the gblinear booster.
classification_model(XGBClassifier(booster='gblinear'))


0.8499126062748819

In [24]:
# Baseline: LogisticRegression, with the iteration limit set to 1000.
classification_model(LogisticRegression(max_iter=1000))

0.798286266250338

In [25]:
# dart booster again, this time with one_drop=1.
classification_model(XGBClassifier(booster='dart', one_drop=1))

0.8726700350951848

In [None]:
# dart regressor with sample_type='weighted', scored on the current X, y.
regression_model(XGBRegressor(booster='dart', 
                              objective='reg:squarederror', 
                              sample_type='weighted'))


In [None]:
# dart regressor with forest-style normalization. The XGBoost parameter is
# spelled `normalize_type`; `normalized_type` is not a documented parameter,
# so under that spelling the setting would not be applied.
regression_model(XGBRegressor(booster='dart', 
                              objective='reg:squarederror', 
                              normalize_type='forest'))


In [None]:
# dart regressor with one_drop=1, scored on the current X, y.
regression_model(XGBRegressor(booster='dart', 
                              objective='reg:squarederror', 
                              one_drop=1))


In [None]:
# Search rate_drop values for a dart regressor that also sets one_drop=1.
# The estimator is passed explicitly, overriding grid_search's default.
grid_search(params={'rate_drop':[0.01, 0.1, 0.2, 0.4]}, 
            reg=XGBRegressor(booster='dart', 
                             objective='reg:squarederror', 
                             one_drop=1))


In [None]:
# Search skip_drop values for a dart regressor with no other dart settings.
grid_search(params={'skip_drop': [0.01, 0.1, 0.2, 0.4]},
            reg=XGBRegressor(booster='dart', objective='reg:squarederror'))


In [None]:
# gbtree regressor with num_parallel_tree=25, scored on the current X, y.
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', 
                              num_parallel_tree=25))


In [None]:
# Compare XGBoost's random-forest regressor with scikit-learn's. A notebook
# cell only displays its last expression, so both scores are gathered into
# one labelled result instead of silently discarding the first.
# The scikit-learn forest is seeded so its score is reproducible on re-run.
{
    'XGBRFRegressor': regression_model(
        XGBRFRegressor(objective='reg:squarederror')),
    'RandomForestRegressor': regression_model(
        RandomForestRegressor(random_state=2)),
}

In [None]:
# Compare XGBoost's random-forest classifier with scikit-learn's. A notebook
# cell only displays its last expression, so both accuracies are gathered
# into one labelled result instead of silently discarding the first.
# The scikit-learn forest is seeded so its score is reproducible on re-run.
{
    'XGBRFClassifier': classification_model(XGBRFClassifier()),
    'RandomForestClassifier': classification_model(
        RandomForestClassifier(random_state=2)),
}