20211202

In [1]:
# notebook configuration
COLAB = False # will trigger manual installation of packages
SAGE = False # if notebook will be used on Amazon SageMaker
USE_GPU = True 
%config Completer.use_jedi = False

## Imports

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import requests # for telegram notifications
from tqdm.notebook import tqdm

from joblib import dump, load

Now, non-stdlib imports

In [3]:
# model selection
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

# metrics
from sklearn.metrics import accuracy_score#, log_loss, roc_auc_score

# eda
import missingno
import doubtlab 

# data cleaning
# from sklearn.impute import SimpleImputer #, KNNImputer
import cleanlab

# normalization
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
from gauss_rank_scaler import GaussRankScaler

# feature generation
from sklearn.preprocessing import PolynomialFeatures
import category_encoders as ce

# models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier

# feature reduction
from sklearn.decomposition import PCA
from umap import UMAP

# clustering
from sklearn.cluster import DBSCAN, KMeans
import hdbscan

# feature selection
# from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
# import featuretools as ft
# from BorutaShap import BorutaShap
# from boruta import BorutaPy

# tracking 
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
os.environ['WANDB_NOTEBOOK_NAME'] = f"nb_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from torch.utils.data import DataLoader
from torchinfo import summary

# widedeep
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT#, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

## Routing

Now, datapath setup

In [5]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # handling datapath
    # datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/dec2021/')
    
else:
    # if on local machine
    if SAGE:
        root = Path('/home/studio-lab-user/sagemaker-studiolab-notebooks')
    else:
        root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/dec2021/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    studypath = root/'studies'
    checkpath = root/'checkpoints'
    
    for pth in [datapath, predpath, subpath, studypath, checkpath]:
        pth.mkdir(exist_ok=True)

  and should_run_async(code)


## Helpers

In [6]:
SEED = 42

# Function to seed everything but the models
def seed_everything(seed, pytorch=True, reproducible=True):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    if pytorch:
        torch.manual_seed(seed) # set torch CPU seed
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed) # set torch GPU(s) seed(s)
        if reproducible and torch.backends.cudnn.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

seed_everything(seed=SEED)

In [7]:
def reduce_memory_usage(df, verbose=True):
    """
    Function to reduce memory usage by downcasting datatypes in a Pandas DataFrame when possible.
    
    h/t to Bryan Arnold (https://www.kaggle.com/puremath86/label-correction-experiments-tps-nov-21)
    """
    
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [8]:
tg_api_token = 'your_api_token' # for Galileo (jupyter_watcher_bot) on Telegram
tg_chat_id = 'your_chat_id'

import requests

def send_tg_message(text='Cell execution completed.'):  
    """
    h/t Ivan Dembicki Jr. for the base version 
    (https://medium.com/@ivan.dembicki.jr/notifications-in-jupyter-notebook-with-telegram-f2e892c55173)
    """
    requests.post('https://api.telegram.org/' +  'bot{}/sendMessage'.format(tg_api_token),
                  params=dict(chat_id=tg_chat_id, text=text))

In [9]:
def reset_weights(m):
    '''
    Try resetting model weights to avoid weight leakage.
    h/t Christian Versloot (https://www.machinecurve.com/index.php/2021/02/03/how-to-use-k-fold-cross-validation-with-pytorch/)
    '''
    # print("Resetting weights")
    for layer in m.children():
        if hasattr(layer, 'reset_parameters'):
            # print(f'Reset trainable parameters of layer = {layer}')
            layer.reset_parameters()

## Data Setup

In [10]:
if USE_GPU: 
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [11]:
# # dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
# dataset_params = {
#     'train_source': str(datapath/'X_orig.feather'),
#     'target_source': str(datapath/'y_orig.joblib'),
#     'test_source': str(datapath/'X_test_orig-no_scaling.feather'),
#     # 'scaler': str(RobustScaler()),
#     # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
#     # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
# }   

# # referring back to the already-entered attributes, specify how the pipeline was sequenced
# # dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# # dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# # now, load the datasets and generate more metadata from them
# # X = load(dataset_params['train_source'])
# X = pd.read_feather(dataset_params['train_source'])
# y = load(dataset_params['target_source'])
# # X_test = load(dataset_params['test_source'])
# X_test = pd.read_feather(dataset_params['test_source'])

# # reduce memory usage
# X = reduce_memory_usage(X)
# X_test = reduce_memory_usage(X)

# # metadata logging
# dataset_params['feature_count'] = X.shape[1]
# dataset_params['instance_count'] = X.shape[0]
    

In [12]:
# data_df = pd.read_csv(datapath/'train.csv', low_memory=False)
# test_df = pd.read_csv(datapath/'test.csv', low_memory=False)
# data_df = reduce_memory_usage(data_df)
# test_df = reduce_memory_usage(test_df)
# data_df.to_feather(datapath/'train.feather')
# test_df.to_feather(datapath/'test.feather')

## Metadata

In [13]:
# baseline -- alter as needed later
exmodel_config = {
    'general_random_state': SEED,
#     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
    'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
    'kfolds': 5, # if 1, that means just doing holdout
    'test_size': 0.2,
    # **dataset_params
}

## Dataset Class

In [14]:
class ForestDataset(torch.utils.data.Dataset):
    def __init__(self, feather_file, root_dir=datapath):
        '''
        Loads the data from a CSV (or in this case a feather) file
        - feather_file:pathlib.Path - Path to the .feather file containing data
        - root_dir:pathlib.Path - Path to the root directory for the data
                '''
        self.df = pd.read_feather(root_dir/feather_file) # makes the digits available
        self.feather_file = feather_file # attribute for later inspection
        self.root_dir = root_dir # attribute for later inspection
        # self.transforms = transforms 
        # CONSIDER: whether to do feature engineering at all; whether to use GaussScaler
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx): # handles the case where idx is a tensor
            idx = idx.tolist() # converts it to a list
        # data = np.array(self.df.iloc[idx, :-1]) # fetches all but the label as a np.ndarray
        
        # create torch tensors on the GPU (if available)
        data = torch.tensor(self.df.iloc[idx, :-1], device=device, dtype=torch.float32)
        label = torch.tensor(self.df.iloc[idx, -1], device=device, dtype=torch.float32)
        # label = self.df.iloc[idx, -1] # fetches the label as a np.int8
        # img = np.expand_dims(img, axis=0) # adding a channel dimension before the others
            
        # if self.transforms: # expects transforms to be in A.Compose(transforms=[...]) form
        #     augmented = self.transforms(image=img) # returns dict of {'image': ...}
        #     data_tensor = augmented['image'] 
        
        # return {'label': label, 'image': data_tensor}
        return {'label': label, 'data':data}
        
    # def viz(self, idx:int):
    #     '''
    #     Visualizes a single item from the dataset using matplotlib
    #     '''
    #     if not isinstance(idx, int):
    #         print("Argument must be an integer.")
    #         return None
    #     item = self.__getitem__(idx)
    #     print(item['label'])
    #     plt.imshow(np.transpose(item['image'].numpy(), axes=(1,2,0)))        

In [15]:
dataset = ForestDataset(root_dir=datapath, feather_file=datapath/'train.feather')

In [16]:
testset = ForestDataset(root_dir=datapath, feather_file=datapath/'test.feather')

In [17]:
dataset[0]

{'label': tensor(1., device='cuda:0'),
 'data': tensor([0.0000e+00, 3.1890e+03, 4.0000e+01, 8.0000e+00, 3.0000e+01, 1.3000e+01,
         3.2700e+03, 2.0600e+02, 2.3400e+02, 1.9300e+02, 4.8730e+03, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00], device='cuda:0')}

In [18]:
dataset[0]['data'].shape

torch.Size([55])

In [19]:
# data_df.iloc[:,-1].nunique()

## Model

In [20]:
class ForestModel(nn.Module):
    def __init__(self, out_dim=7, num_features=55):
        super(ForestModel, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=40)
        self.fc2 = nn.Linear(in_features=40, out_features=28)
        self.fc3 = nn.Linear(in_features=28, out_features=15)
        self.fc4 = nn.Linear(in_features=15, out_features=7)
        self.drop = nn.AlphaDropout(p=0.3)
        
    def forward(self, x):
        x = x
        x = F.selu(self.fc1(x))
        x = F.selu(self.fc2(x))
        x = F.selu(self.fc3(x))
        scores = self.drop(self.fc4(x))
        return scores
                    

In [21]:
# class MNISTModel(nn.Module):
#     """
#     Based on Jean-Francois Puget's STFT Transformer for the 2021 BirdCLEF competition.
#     - backbone_name:str -- a string that will be passed to timm.create_model to create a model meant to do the lion's share of the processing
#     - out_dim:int=10 -- the number of dimensions in the output; here, 10, for the probability of each digit-class
#     - embedding_size:int=5 -- the number of dimensions in the embedding; various rules of thumb converge to 5 given the 10 categories
#     - pretrained:bool=True -- whether or not to use a pretrained backbone model
    
#     """
#     def __init__(self, backbone_name:str='resnext50_32x4d', out_dim=10, embedding_size=5, pretrained=True):
#         super(MNISTModel, self).__init__()
#         self.backbone_name = backbone_name # this should be a string that will be passed to timm.create_model -- NOT an instance of the Backbone class
#         self.out_dim = out_dim # WHAT DOES THIS REALLY REPRESENT?
#         self.embedding_size = embedding_size
#         self.backbone = Backbone(backbone_name, pretrained=pretrained) # a constructor-wrapper around timm.create_model that handles differing head-layers
#         self.global_pool = nn.AdaptiveAvgPool2d(1) # presumably 1 is fine b/c we want a probability at the end
#         self.neck = nn.Sequential(
#             nn.Dropout(0.3),
#             nn.Linear(in_features=self.backbone.out_features, out_features=self.embedding_size, bias=False),#bias=True), (Karpathy - A Recipe for Training Neural Networks (20190425))
# #             nn.BatchNorm1d(self.embedding_size), # won't work with shape (1,512) i.e. single-channel data, b/c needs non-1 value for all dims
#             nn.LayerNorm(self.embedding_size), # will work with (1,512) shape
#             nn.PReLU()
#         )
        
#         self.head = nn.Linear(self.embedding_size, out_dim) # TODO: inspect this, see if it's outputting logits, probs, or preds
#         # self.head = nn.Linear(
        
#     def forward(self, data, get_embeddings=False, get_attentions=False):
# #         print(f"Input data has shape {data.shape}")# and the value is:\n{data}")
# #         x = data[0] # the first item in the incoming tuple, i.e. the tensor
# #         print(f"x.shape is {x.shape}")
# #         x = x.unsqueeze(1) # adding in a channel -- assumes input doesn't have that dim
# #         print(f"After unsqueeze(1), x.shape is {x.shape}")
# #         print(x.shape)
# #         x = x.expand(-1, 3, -1, -1) # inflates input to be three-channel
# #         print(f"After x.expand(-1,3,-1,-1), x.shape is {x.shape}")
# #         x = x.expand(3,-1,-1)
#         x = data
#         x = self.backbone(x)
# #         print(f"After the backbone, x.shape is {x.shape}")
#         if 'vit' not in self.backbone_name:
#             x = self.global_pool(x) 
#             x = x[:,:,0,0] # in timm implementations, this is replaced with the layer torch.nn.Flatten
#             # tensor shape here is (1,2048)
#         x = self.neck(x)
#         # tensor shape here is (1,5)
#         # return x
#         logits = self.head(x)
#         return logits # this outputs the unnormalized scores for the 10 classes; pass the tensor to F.softmax for probabilities


# Training

In [22]:
model = ForestModel().to(device)

In [23]:
summary(model)

Layer (type:depth-idx)                   Param #
ForestModel                              --
├─Linear: 1-1                            2,240
├─Linear: 1-2                            1,148
├─Linear: 1-3                            435
├─Linear: 1-4                            112
├─AlphaDropout: 1-5                      --
Total params: 3,935
Trainable params: 3,935
Non-trainable params: 0

In [24]:
test_tensor = torch.randn(1,55, device=device)

In [25]:
test_tensor

tensor([[ 0.1940,  2.1614, -0.1721,  0.8491, -1.9244,  0.6530, -0.6494, -0.8175,
          0.5280, -1.2753, -1.6621, -0.3033, -0.0926,  0.1992, -1.1204,  1.8577,
         -0.7145,  0.6881,  0.7968, -0.0334,  1.4917, -0.5165, -0.2541,  1.4746,
         -0.3260, -1.1600,  2.3551, -0.6924,  0.1837, -1.1835, -1.8029, -1.5808,
          0.8387,  1.4192,  0.6469,  0.4253, -1.5892,  0.6223,  1.6898, -0.6648,
          0.9425,  0.0783,  0.0847, -0.1408,  0.3316, -0.5890, -1.0723,  0.0954,
         -0.3347, -0.5258, -0.8776,  0.3938,  0.1640, -0.1977,  1.0104]],
       device='cuda:0')

In [26]:
model(test_tensor)

tensor([[-1.0595,  0.0101,  0.5060,  0.6401, -1.0595,  0.4837, -1.0595]],
       device='cuda:0', grad_fn=<AddBackward0>)

In [27]:
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

In [28]:
loss_function = nn.CrossEntropyLoss()
bs = 512

In [29]:
trainloader = DataLoader(dataset=dataset, batch_size=bs, num_workers=4, pin_memory=True)

In [30]:
trainloader.num_workers

4

In [31]:
data = trainloader.dataset[0]

In [33]:
optimizer = Adam(model.parameters(), lr=1e-4)

In [42]:
data['label']


tensor(1., device='cuda:0')

In [45]:
data = data.unsqueeze(0)

  and should_run_async(code)


AttributeError: 'dict' object has no attribute 'unsqueeze'

In [46]:
inputs = data['data']#.to(dtype=torch.float32)
print(f"inputs has shape {inputs.shape}, dtype {inputs.dtype}, and device {inputs.device}")
targets = data['label'].to(dtype=torch.long).unsqueeze(0)
# print(f"type(inputs) = {inputs.dtype}, type(targets) = {targets.dtype}")
# inputs = inputs.to(device)
# targets = targets.to(device)
optimizer.zero_grad()
outputs = model(inputs)
# outputs = outputs.to(device)
# print(f"Targets are: {targets}")
# print(f"Outputs are: {outputs}")
# print(f"Outputs is type {outputs.dtype}")

loss = loss_function(outputs, targets)
loss.backward()

inputs has shape torch.Size([55]), dtype torch.float32, and device cuda:0


  and should_run_async(code)


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [35]:
outputs

tensor([ -1.0595, -17.9349,  -1.0595, -24.7569,  -1.0595, -36.4029,  -1.0595],
       device='cuda:0', grad_fn=<AddBackward0>)

In [38]:
outputs.shape

torch.Size([7])

In [39]:
targets.shape

torch.Size([])

In [37]:
F.cross_entropy(target=targets, input=outputs)

  and should_run_async(code)


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [43]:
alt_targets = torch.tensor([1], device='cuda')

  and should_run_async(code)


In [44]:
F.cross_entropy(target=alt_targets, input=outputs)

  and should_run_async(code)


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [36]:
targets

tensor(1, device='cuda:0')

In [36]:
for fold, (train_ids, valid_ids) in enumerate(kfold.split(dataset)):
    print(f'FOLD {fold}')
    print('--------------------------------')
    
    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_ids)
    
    trainloader = DataLoader(dataset=dataset, sampler=train_subsampler, batch_size=bs, num_workers=0,)# pin_memory=True,)
    validloader = DataLoader(dataset=dataset, sampler=valid_subsampler, batch_size=bs, num_workers=0,)# pin_memory=True,)
    
    model.apply(reset_weights) # resetting weights so that you don't have fold leakage
    
    optimizer = Adam(model.parameters(), lr=1e-4)
    
    for epoch in range(5):
        print(f"Beginning epoch {epoch}")
        train_loss = 0.0
        valid_loss = 0.0
        model.train()
        
        for i, data in enumerate(trainloader):
            inputs = data['data']#.to(dtype=torch.float32)
            print(f"inputs has shape {inputs.shape}, dtype {inputs.dtype}, and device {inputs.device}")
            targets = data['label'].to(dtype=torch.long)
            # print(f"type(inputs) = {inputs.dtype}, type(targets) = {targets.dtype}")
            # inputs = inputs.to(device)
            # targets = targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            # outputs = outputs.to(device)
            # print(f"Targets are: {targets}")
            # print(f"Outputs are: {outputs}")
            # print(f"Outputs is type {outputs.dtype}")
            
            loss = loss_function(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() # because it's a tensor
        train_loss /= len(trainloader.dataset)
        print(f"Training loss is {train_loss}")
        
        model.eval()
        num_correct = 0
        num_examples = 0
        for i, data in enumerate(validloader):
            inputs = data['data']#.to(dtype=torch.float32)
            targets = data['label'].to(dtype=torch.long)
            # print(f"type(inputs) = {inputs.dtype}, type(targets) = {targets.dtype}")
            inputs = inputs.to(device)
            targets = targets.to(device)
            # optimizer.zero_grad() # only for training
            outputs = model(inputs)
            outputs = outputs.to(device)
            # print(f"Targets are: {targets}")
            # print(f"Outputs are: {outputs}")
            # print(f"Outputs is type {outputs.dtype}")
            loss = loss_function(outputs, targets)
            # loss.backward() # only for training
            # optimizer.step() # only for training
            valid_loss += loss.item() # because it's a tensor
            correct = torch.eq(torch.max(F.softmax(outputs, dim=1), dim=1)[1], targets)
            num_correct += torch.sum(correct).item()
            num_examples += correct.shape[0]
        valid_loss /= len(validloader.dataset)
        
        print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}, accuracy = {:.2f}'.format(epoch, train_loss, valid_loss, num_correct / num_examples))
    torch.save(model.state_dict(), checkpath)/f'resnext50_fold{fold}.pth'
    

FOLD 0
--------------------------------
Beginning epoch 0
inputs has shape torch.Size([512, 55]), dtype torch.float32, and device cuda:0


RuntimeError: cuda runtime error (710) : device-side assert triggered at /tmp/pip-req-build-1_ic8ial/aten/src/THC/generic/THCTensorMath.cu:29