In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold#, KFold
from sklearn.metrics import mean_squared_error
import wandb
from wandb.xgboost import wandb_callback
# import timm
from pathlib import Path
import os
import math

In [2]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torchinfo import summary
# # from fastai.callback.wandb import *
# from torch.cuda.amp import autocast, GradScaler

In [3]:
# from fastai.vision.all import *
# from fastai.tabular.all import *
# from fastai.callback.wandb import WandbCallback

In [4]:
# import xgboost as xgb
from xgboost import XGBRegressor, XGBRFRegressor
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier

In [5]:
# %config Completer.use_jedi = False

In [6]:
# set_seed(42, reproducible=True)

In [7]:
# Root directory of the competition data. The default is the original local
# mount point; set the KAGGLE_TPS_DATA_DIR environment variable to run the
# notebook on a machine where the data lives elsewhere.
datapath = Path(os.environ.get(
    'KAGGLE_TPS_DATA_DIR',
    '/media/sf/easystore/kaggle_data/tabular_playgrounds/202108_august/',
))

In [8]:
# Load the training table, using the `id` column as the row index.
df = pd.read_csv(
    datapath / 'train.csv',
    index_col='id',
    low_memory=False,
)

In [9]:
# df.isnull().sum().any() # False

In [10]:
# df.info(verbose=True, null_counts=True)

In [11]:
# df.describe()

In [12]:
# Regression target: the `loss` column of the training frame.
y = df['loss']

In [13]:
# Every column except the target is used as a model input.
features = [col for col in df.columns if col != 'loss']

In [14]:
# Feature matrix restricted to the selected input columns.
X = df.loc[:, features]

In [15]:
# Number of rows in the feature matrix.
len(X)

250000

In [16]:
# Preview the first rows of the training frame.
df.head()

          f0   f1        f2        f3          f4        f5        f6  \
id                                                                      
0  -0.002350   59  0.766739 -1.350460     42.2727  16.68570   30.3599   
1   0.784462  145 -0.463845 -0.530421  27324.9000   3.47545  160.4980   
2   0.317816   19 -0.432571 -0.382644   1383.2600  19.71290   31.1026   
3   0.210753   17 -0.616454  0.946362   -119.2530   4.08235  185.2570   
4   0.439671   20  0.968126 -0.092546     74.3020  12.30650   72.1860   

          f7         f8       f9  ...        f91        f92      f93  \
id                                ...                                  
0   1.267300   0.392007  1.09101  ...  -42.43990  26.854000  1.45751   
1   0.828007   3.735860  1.28138  ... -184.13200   7.901370  1.70644   
2  -0.515354  34.430800  1.24210  ...    7.43721  37.218100  3.25339   
3   1.383310 -47.521400  1.09130  ...    9.66778   0.626942  1.49425   
4  -0.233964  24.399100  1.10151  ...  290.65700  15.604

In [17]:
# (number of target rows, number of distinct values reported by value_counts())
len(y), len(y.value_counts())

(250000, 43)

In [18]:
# # first tree sweep
# sweep_config = {
#     "method": "grid", # try grid or random
#     "metric": {
#       "name": "rmse",
#       "goal": "minimize"   
#     },
#     "parameters": {
#         "booster": {
#             "values": ["gbtree", "dart"]
#         },
#         "max_depth": {
#             "values": [3, 6, 9, 12]
#         },
#         "learning_rate": {
#             "values": [0.1, 0.2, 0.3]
#         },
#         "subsample": {
#             "values": [1, 0.5]
#         },
#         "n_estimators": {
#             "values": [50, 125, 200]
#         }
#     }
# }

In [19]:
# Second tree sweep: Bayesian search that minimises the logged `rmse` metric
# over three XGBoost hyperparameters, each sampled from a normal distribution.
# NOTE(review): normal draws are continuous and unbounded, while max_depth and
# n_estimators must be positive integers for XGBoost -- confirm the consumer
# rounds/clamps them.
sweep_config = dict(
    method="bayes",
    metric=dict(name="rmse", goal="minimize"),
    parameters=dict(
        max_depth=dict(distribution="normal", mu=3, sigma=1),
        learning_rate=dict(distribution="normal", mu=0.2, sigma=0.1),
        n_estimators=dict(distribution="normal", mu=225, sigma=50),
    ),
)

In [20]:
# Tell W&B the file name of this notebook (presumably used by `save_code=True`
# when the name cannot be auto-detected -- confirm against the W&B docs).
os.environ.update(WANDB_NOTEBOOK_NAME='20210810_XGBRegressor_tree_sweep2.ipynb')

In [21]:
# Register the sweep with W&B and keep the id that agents are started with.
sweep_id = wandb.sweep(
    sweep_config,
    project="202108_Kaggle_tabular_playground",
)

In [22]:
def train():
    """Run one W&B sweep trial: fit an XGBRegressor and log hold-out MSE/RMSE.

    Meant to be passed to ``wandb.agent(sweep_id, function=train)``. Reads the
    notebook-level ``X`` and ``y``, splits them into train/validation parts,
    fits the model with the hyperparameters in ``wandb.config`` and logs the
    ``mse`` and ``rmse`` of the validation predictions (``rmse`` is the metric
    the sweep minimises).

    Returns:
        None. Results are reported through ``wandb.log`` and ``print``.
    """
    # Every key read from ``wandb.config`` below needs a default here: the
    # sweep itself only supplies max_depth, learning_rate and n_estimators.
    config_defaults = {
        "library": "xgboost",
        "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
        "booster": 'dart',
        "n_estimators": 200,
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
        "test_size": 0.2,
        "seed": 42,
    }

    wandb.init(
        save_code=True,
        tags=['XGBoost'],
        name='20210810_XGBRegressor_tree_sweep2',
        notes='XGBRegressor sweep using dart trees, second sweep, using Bayes strategy with normal distributions on max_depth, learning_rate, and n_estimators',
        config=config_defaults)

    config = wandb.config

    # The sweep samples from continuous, unbounded normal distributions, so
    # the integral parameters arrive as floats and any draw may fall outside
    # XGBoost's valid range. Round and clamp before building the model.
    n_estimators = max(1, int(round(config.n_estimators)))
    max_depth = max(1, int(round(config.max_depth)))
    learning_rate = min(max(float(config.learning_rate), 1e-3), 1.0)

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=config.test_size, random_state=config.seed
    )

    # ``test_size`` belongs to the split only; it is not an XGBoost parameter.
    model = XGBRegressor(
        tree_method=config.tree_method,
        booster=config.booster,
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=config.subsample,
        random_state=config.seed,
        n_jobs=-1,
        verbosity=2,
    )
    # Record the values actually used (after rounding/clamping).
    wandb.log({'params': model.get_params()})
    model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
    y_preds = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_preds)
    rmse = math.sqrt(mse)
    wandb.log({'mse': mse, 'rmse': rmse})
    print(f"MSE is {mse}\nRMSE is {rmse}")
    

In [23]:
def train():
    """Run one W&B sweep trial: fit an XGBRegressor and log hold-out MSE/RMSE.

    Meant to be passed to ``wandb.agent(sweep_id, function=train)``. Reads the
    notebook-level ``X`` and ``y``, splits them into train/validation parts,
    fits the model with the hyperparameters in ``wandb.config`` and logs the
    ``mse`` and ``rmse`` of the validation predictions (``rmse`` is the metric
    the sweep minimises).

    Returns:
        None. Results are reported through ``wandb.log`` and ``print``.
    """
    # Every key read from ``wandb.config`` below needs a default here: the
    # sweep itself only supplies max_depth, learning_rate and n_estimators.
    config_defaults = {
        "library": "xgboost",
        "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
        "booster": 'dart',
        "n_estimators": 200,
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
        "test_size": 0.2,
        "seed": 42,
    }

    wandb.init(
        save_code=True,
        tags=['XGBoost'],
        name='20210810_XGBRegressor_tree_sweep2',
        notes='XGBRegressor sweep using dart trees, second sweep, using Bayes strategy with normal distributions on max_depth, learning_rate, and n_estimators',
        config=config_defaults)

    config = wandb.config

    # The sweep samples from continuous, unbounded normal distributions, so
    # the integral parameters arrive as floats and any draw may fall outside
    # XGBoost's valid range. Round and clamp before building the model.
    n_estimators = max(1, int(round(config.n_estimators)))
    max_depth = max(1, int(round(config.max_depth)))
    learning_rate = min(max(float(config.learning_rate), 1e-3), 1.0)

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=config.test_size, random_state=config.seed
    )

    # ``test_size`` belongs to the split only; it is not an XGBoost parameter.
    model = XGBRegressor(
        tree_method=config.tree_method,
        booster=config.booster,
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=config.subsample,
        random_state=config.seed,
        n_jobs=-1,
        verbosity=2,
    )
    # Record the values actually used (after rounding/clamping).
    wandb.log({'params': model.get_params()})
    model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
    y_preds = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_preds)
    rmse = math.sqrt(mse)
    wandb.log({'mse': mse, 'rmse': rmse})
    print(f"MSE is {mse}\nRMSE is {rmse}")
    

In [24]:
def train():
    """Run one W&B sweep trial: fit an XGBRegressor and log hold-out MSE/RMSE.

    Meant to be passed to ``wandb.agent(sweep_id, function=train)``. Reads the
    notebook-level ``X`` and ``y``, splits them into train/validation parts,
    fits the model with the hyperparameters in ``wandb.config`` and logs the
    ``mse`` and ``rmse`` of the validation predictions (``rmse`` is the metric
    the sweep minimises).

    Returns:
        None. Results are reported through ``wandb.log`` and ``print``.
    """
    # Every key read from ``wandb.config`` below needs a default here: the
    # sweep itself only supplies max_depth, learning_rate and n_estimators.
    config_defaults = {
        "library": "xgboost",
        "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
        "booster": 'dart',
        "n_estimators": 200,
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
        "test_size": 0.2,
        "seed": 42,
    }

    wandb.init(
        save_code=True,
        tags=['XGBoost'],
        name='20210810_XGBRegressor_tree_sweep2',
        notes='XGBRegressor sweep using dart trees, second sweep, using Bayes strategy with normal distributions on max_depth, learning_rate, and n_estimators',
        config=config_defaults)

    config = wandb.config

    # The sweep samples from continuous, unbounded normal distributions, so
    # the integral parameters arrive as floats and any draw may fall outside
    # XGBoost's valid range. Round and clamp before building the model.
    n_estimators = max(1, int(round(config.n_estimators)))
    max_depth = max(1, int(round(config.max_depth)))
    learning_rate = min(max(float(config.learning_rate), 1e-3), 1.0)

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=config.test_size, random_state=config.seed
    )

    # ``test_size`` belongs to the split only; it is not an XGBoost parameter.
    model = XGBRegressor(
        tree_method=config.tree_method,
        booster=config.booster,
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=config.subsample,
        random_state=config.seed,
        n_jobs=-1,
        verbosity=2,
    )
    # Record the values actually used (after rounding/clamping).
    wandb.log({'params': model.get_params()})
    model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
    y_preds = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_preds)
    rmse = math.sqrt(mse)
    wandb.log({'mse': mse, 'rmse': rmse})
    print(f"MSE is {mse}\nRMSE is {rmse}")
    

In [25]:
# Start an agent in this kernel that calls `train` for the trials of `sweep_id`.
# NOTE(review): no `count=` is passed -- presumably the agent keeps pulling
# trials until the sweep is stopped; confirm that is intended.
wandb.agent(sweep_id, function=train)  

In [26]:
def train():
    """Run one W&B sweep trial: fit an XGBRegressor and log hold-out MSE/RMSE.

    Meant to be passed to ``wandb.agent(sweep_id, function=train)``. Reads the
    notebook-level ``X`` and ``y``, splits them into train/validation parts,
    fits the model with the hyperparameters in ``wandb.config`` and logs the
    ``mse`` and ``rmse`` of the validation predictions (``rmse`` is the metric
    the sweep minimises).

    Returns:
        None. Results are reported through ``wandb.log`` and ``print``.
    """
    # Every key read from ``wandb.config`` below needs a default here: the
    # sweep itself only supplies max_depth, learning_rate and n_estimators.
    config_defaults = {
        "library": "xgboost",
        "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
        "booster": 'dart',
        "n_estimators": 200,
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
        "test_size": 0.2,
        "seed": 42,
    }

    wandb.init(
        save_code=True,
        tags=['XGBoost'],
        name='20210810_XGBRegressor_tree_sweep2',
        notes='XGBRegressor sweep using dart trees, second sweep, using Bayes strategy with normal distributions on max_depth, learning_rate, and n_estimators',
        config=config_defaults)

    config = wandb.config

    # The sweep samples from continuous, unbounded normal distributions, so
    # the integral parameters arrive as floats and any draw may fall outside
    # XGBoost's valid range. Round and clamp before building the model.
    n_estimators = max(1, int(round(config.n_estimators)))
    max_depth = max(1, int(round(config.max_depth)))
    learning_rate = min(max(float(config.learning_rate), 1e-3), 1.0)

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=config.test_size, random_state=config.seed
    )

    # ``test_size`` belongs to the split only; it is not an XGBoost parameter.
    model = XGBRegressor(
        tree_method=config.tree_method,
        booster=config.booster,
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=config.subsample,
        random_state=config.seed,
        n_jobs=-1,
        verbosity=2,
    )
    # Record the values actually used (after rounding/clamping).
    wandb.log({'params': model.get_params()})
    model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
    y_preds = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_preds)
    rmse = math.sqrt(mse)
    wandb.log({'mse': mse, 'rmse': rmse})
    print(f"MSE is {mse}\nRMSE is {rmse}")
    

In [27]:
# Start an agent in this kernel that calls `train` for the trials of `sweep_id`.
# NOTE(review): no `count=` is passed -- presumably the agent keeps pulling
# trials until the sweep is stopped; confirm that is intended.
wandb.agent(sweep_id, function=train)  

In [28]:
# Register a fresh sweep from the same config; this replaces the earlier
# `sweep_id`, so agents started afterwards attach to the new sweep.
sweep_id = wandb.sweep(
    sweep_config,
    project="202108_Kaggle_tabular_playground",
)

In [29]:
def train():
    """Run one W&B sweep trial: fit an XGBRegressor and log hold-out MSE/RMSE.

    Meant to be passed to ``wandb.agent(sweep_id, function=train)``. Reads the
    notebook-level ``X`` and ``y``, splits them into train/validation parts,
    fits the model with the hyperparameters in ``wandb.config`` and logs the
    ``mse`` and ``rmse`` of the validation predictions (``rmse`` is the metric
    the sweep minimises).

    Returns:
        None. Results are reported through ``wandb.log`` and ``print``.
    """
    # Every key read from ``wandb.config`` below needs a default here: the
    # sweep itself only supplies max_depth, learning_rate and n_estimators.
    config_defaults = {
        "library": "xgboost",
        "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
        "booster": 'dart',
        "n_estimators": 200,
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
        "test_size": 0.2,
        "seed": 42,
    }

    wandb.init(
        save_code=True,
        tags=['XGBoost'],
        name='20210810_XGBRegressor_tree_sweep2',
        notes='XGBRegressor sweep using dart trees, second sweep, using Bayes strategy with normal distributions on max_depth, learning_rate, and n_estimators',
        config=config_defaults)

    config = wandb.config

    # The sweep samples from continuous, unbounded normal distributions, so
    # the integral parameters arrive as floats and any draw may fall outside
    # XGBoost's valid range. Round and clamp before building the model.
    n_estimators = max(1, int(round(config.n_estimators)))
    max_depth = max(1, int(round(config.max_depth)))
    learning_rate = min(max(float(config.learning_rate), 1e-3), 1.0)

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=config.test_size, random_state=config.seed
    )

    # ``test_size`` belongs to the split only; it is not an XGBoost parameter.
    model = XGBRegressor(
        tree_method=config.tree_method,
        booster=config.booster,
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=config.subsample,
        random_state=config.seed,
        n_jobs=-1,
        verbosity=2,
    )
    # Record the values actually used (after rounding/clamping).
    wandb.log({'params': model.get_params()})
    model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
    y_preds = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_preds)
    rmse = math.sqrt(mse)
    wandb.log({'mse': mse, 'rmse': rmse})
    print(f"MSE is {mse}\nRMSE is {rmse}")
    

In [30]:
# Start an agent in this kernel that calls `train` for the trials of `sweep_id`.
# NOTE(review): no `count=` is passed -- presumably the agent keeps pulling
# trials until the sweep is stopped; confirm that is intended.
wandb.agent(sweep_id, function=train)  

In [31]:
# Register a fresh sweep from the same config; this replaces the earlier
# `sweep_id`, so agents started afterwards attach to the new sweep.
sweep_id = wandb.sweep(
    sweep_config,
    project="202108_Kaggle_tabular_playground",
)

In [32]:
def train():
    """Run one W&B sweep trial: fit an XGBRegressor and log hold-out MSE/RMSE.

    Meant to be passed to ``wandb.agent(sweep_id, function=train)``. Reads the
    notebook-level ``X`` and ``y``, splits them into train/validation parts,
    fits the model with the hyperparameters in ``wandb.config`` and logs the
    ``mse`` and ``rmse`` of the validation predictions (``rmse`` is the metric
    the sweep minimises).

    Returns:
        None. Results are reported through ``wandb.log`` and ``print``.
    """
    # Every key read from ``wandb.config`` below needs a default here: the
    # sweep itself only supplies max_depth, learning_rate and n_estimators.
    config_defaults = {
        "library": "xgboost",
        "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
        "booster": 'dart',
        "n_estimators": 200,
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
        "test_size": 0.2,
        "seed": 42,
    }

    wandb.init(
        save_code=True,
        tags=['XGBoost'],
        name='20210810_XGBRegressor_tree_sweep2',
        notes='XGBRegressor sweep using dart trees, second sweep, using Bayes strategy with normal distributions on max_depth, learning_rate, and n_estimators',
        config=config_defaults)

    config = wandb.config

    # The sweep samples from continuous, unbounded normal distributions, so
    # the integral parameters arrive as floats and any draw may fall outside
    # XGBoost's valid range. Round and clamp before building the model.
    n_estimators = max(1, int(round(config.n_estimators)))
    max_depth = max(1, int(round(config.max_depth)))
    learning_rate = min(max(float(config.learning_rate), 1e-3), 1.0)

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=config.test_size, random_state=config.seed
    )

    # ``test_size`` belongs to the split only; it is not an XGBoost parameter.
    model = XGBRegressor(
        tree_method=config.tree_method,
        booster=config.booster,
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=config.subsample,
        random_state=config.seed,
        n_jobs=-1,
        verbosity=2,
    )
    # Record the values actually used (after rounding/clamping).
    wandb.log({'params': model.get_params()})
    model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
    y_preds = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_preds)
    rmse = math.sqrt(mse)
    wandb.log({'mse': mse, 'rmse': rmse})
    print(f"MSE is {mse}\nRMSE is {rmse}")
    

In [33]:
# Start an agent in this kernel that calls `train` for the trials of `sweep_id`.
# NOTE(review): no `count=` is passed -- presumably the agent keeps pulling
# trials until the sweep is stopped; confirm that is intended.
wandb.agent(sweep_id, function=train)  