In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
import time
# Traverse the Kaggle input directory tree. The per-file print is disabled,
# so the loop body is a no-op and nothing is displayed.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass  # print(os.path.join(dirname, filename))

from pathlib import Path
import sys

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset 
from torch.utils.data import DataLoader
from torch.nn import functional as F 
import torchvision
import torch.nn.utils.prune as prune

from tqdm import tqdm
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Dataset location (parquet copy of the competition data).
# Alternative CSV source: Path('../input/ubiquant-market-prediction')
input_path = Path('../input/ubiquant-parquet')

output_path = Path('../output')

model_name = 'my_model'
result_path = Path('../working')

# Per-model checkpoint directory. `exist_ok=True` already makes the call a
# no-op when the directory is present, so no separate exists() check is
# needed (and none can race with another process creating it).
checkpoint_path = result_path / model_name
checkpoint_path.mkdir(parents=True, exist_ok=True)

# Show what is available one level up (displayed as the cell output).
os.listdir('../')

In [None]:
# Read the training table from parquet, index it by
# (investment_id, time_id) and cast every remaining column to float16.
# (An earlier CSV-based variant read input_path/'train.csv' with per-column
# dtypes for the f_0 … f_299 features.)
features = (
    pd.read_parquet(input_path / 'train.parquet', engine='pyarrow')
    .set_index(['investment_id', 'time_id'])
    .astype(np.float16)
)



In [None]:
# Split the label off the feature frame and drop the identifier column.
# Both steps are guarded so the cell can be re-run on the same kernel:
# once the columns are gone, `target` keeps its previous value instead of
# the cell failing with a KeyError.
if 'target' in features.columns:
    target = features.pop('target')
if 'row_id' in features.columns:
    del features['row_id']

features.info()

In [None]:
# Sanity check: feature matrix shape followed by label vector shape.
print(f'{features.shape} {target.shape}')

In [None]:
class MultipleTimeSeriesCV:
    """Walk-forward train/test splitter for panel time series.

    ``split`` yields ``(train_idx, test_idx)`` pairs of positional row
    indices (NumPy arrays). ``X`` must carry an index with a level named
    ``date_idx`` that identifies the time period of each row.

    Splits are laid out backwards from the most recent period: split ``i``
    tests on ``test_period_length`` consecutive periods ending
    ``i * test_period_length`` periods before the latest one. The training
    window ends ``lookahead - 1`` periods before the test window starts and
    spans ``train_period_length + lookahead - 1`` periods.

    Parameters
    ----------
    n_splits : int
        Number of (train, test) pairs to generate.
    train_period_length : int
        Base length of the training window, in periods.
    test_period_length : int
        Length of the test window, in periods.
    lookahead : int
        Forecast horizon used to purge overlapping outcomes. Must be set to
        an integer; the default ``None`` makes ``split`` fail in its index
        arithmetic.
    date_idx : str
        Name of the index level that holds the period.
    shuffle : bool
        If True, the training indices of each split are returned in random
        order (drawn from NumPy's global random state).
    """

    def __init__(self,
                 n_splits=3,
                 train_period_length=126,
                 test_period_length=21,
                 lookahead=None,
                 date_idx='date',
                 shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle
        self.date_idx = date_idx

    def split(self, X, y=None, groups=None):
        """Yield ``(train_idx, test_idx)`` positional index arrays for ``X``.

        ``y`` and ``groups`` are accepted for API compatibility and ignored.
        Raises ``IndexError`` if ``X`` has too few distinct periods for the
        requested windows.
        """
        # Read the period level straight from the index; resetting the index
        # would copy the entire feature frame just to obtain one column.
        periods = X.index.get_level_values(self.date_idx)
        # Most recent period first, so offset 0 is the latest period.
        days = sorted(periods.unique(), reverse=True)

        split_idx = []
        for i in range(self.n_splits):
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
            split_idx.append([train_start_idx, train_end_idx,
                              test_start_idx, test_end_idx])

        for train_start, train_end, test_start, test_end in split_idx:
            # Windows are half-open: (start, end] in calendar order.
            train_mask = (periods > days[train_start]) & (periods <= days[train_end])
            test_mask = (periods > days[test_start]) & (periods <= days[test_end])
            train_idx = np.flatnonzero(train_mask)
            test_idx = np.flatnonzero(test_mask)
            if self.shuffle:
                # permutation() returns a shuffled copy; shuffling a throwaway
                # list (as the previous code did) left train_idx untouched.
                train_idx = np.random.permutation(train_idx)
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of splits (arguments are ignored)."""
        return self.n_splits

In [None]:
class UbiquantDataset(Dataset):
    """Map-style dataset over a 2-D feature array and a 1-D target array.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix of shape ``(n_samples, n_features)``.
    y : numpy.ndarray
        Targets, one per row of ``X``.
    transform : callable, optional
        Applied to each sample's feature array (shape ``(n_features, 1)``)
        before it is converted to a tensor; must return a NumPy array.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y, transform=None):
        if X.shape[0] != len(y):
            raise ValueError(
                'X and y must have the same number of samples, got '
                '{} and {}'.format(X.shape[0], len(y)))
        self._feature = X
        self._k = X.shape[1]
        self._n = X.shape[0]
        self._target = y
        self._transform = transform

    def __len__(self):
        return self._n

    def __getitem__(self, idx):
        """Return ``(features, target)`` float32 tensors for sample ``idx``.

        Features have shape ``(n_features, 1)``; the target has shape ``(1,)``.
        """
        X = self._feature[idx, :].reshape(self._k, 1)
        # Index with a list so the target stays a 1-element array.
        y = self._target[[idx]]
        if self._transform is not None:
            # Use the transformed sample; previously the result was assigned
            # to an unused variable, so the transform had no effect.
            X = self._transform(X)
        return torch.from_numpy(X).float(), torch.from_numpy(y).float()

def get_train_valid_data(X, y, train_idx, test_idx):
    """Build the training and validation datasets for one CV split.

    ``X`` and ``y`` are sliced positionally (``.iloc``) with ``train_idx`` and
    ``test_idx``, converted to NumPy arrays and wrapped in ``UbiquantDataset``.
    No feature scaling is applied.

    Returns a ``(train_dataset, validation_dataset)`` tuple.
    """
    def _subset(rows):
        # Positional slice of features and targets -> dataset.
        return UbiquantDataset(X.iloc[rows, :].to_numpy(), y.iloc[rows].to_numpy())

    return _subset(train_idx), _subset(test_idx)

In [None]:
class torch_model(nn.Module):
    """Base module that adds a checked weight-loading helper."""

    def load_model(self, model_path, cuda=False):
        """Load a saved state dict from ``model_path`` into this module.

        The checkpoint is mapped to ``"cuda"`` when ``cuda`` is true and to
        ``"cpu"`` otherwise, loaded with ``strict=True`` and then compared
        tensor-by-tensor against the module's own state dict.

        Raises ``ValueError`` if the module has a state-dict key that the
        checkpoint lacks, and ``AssertionError`` if a loaded tensor differs
        from the checkpoint.
        """
        target_device = "cuda" if cuda else "cpu"
        # NOTE(review): torch.load unpickles the file - only point this at
        # checkpoints from a trusted source.
        checkpoint = torch.load(f=model_path, map_location=target_device)
        with torch.no_grad():
            self.load_state_dict(checkpoint, strict=True)

        # Verify that every tensor now held by the module matches the file.
        for key, tensor in self.state_dict().items():
            if key not in checkpoint:
                raise ValueError("state_dict() keys do not match")
            assert torch.equal(checkpoint[key].cpu(), tensor.cpu())
        print('model weights have been loaded correctly in current model.')

In [None]:
class nn_regressor(torch_model):
    """Fully connected regressor with three hidden layers and one output.

    ``param`` is a dict providing ``'n_in'``, ``'n_hidden1'``, ``'n_hidden2'``,
    ``'n_hidden3'`` (layer widths) and ``'act_f'`` (an activation class that is
    instantiated once per hidden layer). Each hidden stage is
    Linear -> activation -> BatchNorm1d; the head is a Linear layer to 1 unit.
    """

    def __init__(self, param):
        super().__init__()
        self.n_in = param['n_in']

        widths = [param['n_in'], param['n_hidden1'],
                  param['n_hidden2'], param['n_hidden3']]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(param['act_f']())
            layers.append(nn.BatchNorm1d(num_features=fan_out))
        layers.append(nn.Linear(widths[-1], 1))
        self.cls = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` to ``(-1, n_in)`` and return the network output."""
        return self.cls(x.view(-1, self.n_in))

In [None]:
def test(model, test_dataloader, params):
    device = torch.device("cpu")
    test_loss = 0.0
    model.eval()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_dataloader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = params['criterion'](output, target)
            test_loss += loss.item()
    return test_loss / len(test_dataloader)
    

def train(NN_model, train_dataloader, val_dataloader, params):
    """Train a fresh ``NN_model(params)`` on CPU and return the fitted model.

    Parameters
    ----------
    NN_model : callable
        Model class/factory; called as ``NN_model(params)``.
    train_dataloader, val_dataloader : iterable of (data, target) batches
        ``train_dataloader`` must also expose ``.dataset`` (used for logging).
    params : dict
        Reads ``'lr'``, ``'T_max'``, ``'epoch'``, ``'criterion'``, ``'clip'``
        (max gradient norm, falsy to disable) and ``'log_interval'``.

    Returns
    -------
    The trained model. Earlier versions returned nothing, which made the
    training result unreachable for callers.
    """
    # model, optimizer
    device = torch.device("cpu")
    model = NN_model(params).to(device)
    optimizer = optim.Adam(model.parameters(), lr=params['lr'])
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=params['T_max'], eta_min=4e-08)
    # train
    for epoch in range(1, params['epoch'] + 1):
        start_time = time.time()
        train_loss = 0.0
        model.train()
        for batch_idx, (data, target) in enumerate(train_dataloader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = params['criterion'](output, target)
            loss.backward()
            if params['clip']:
                torch.nn.utils.clip_grad_norm_(model.parameters(), params['clip'])
            train_loss += loss.item()
            optimizer.step()
            # NOTE(review): the scheduler advances once per batch, so T_max is
            # measured in batches, not epochs - confirm this is intended.
            scheduler.step()

            if batch_idx % params['log_interval'] == 0:
                print('Train : [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    batch_idx * len(data), len(train_dataloader.dataset),
                    100. * batch_idx / len(train_dataloader), loss.item()))

        train_loss /= len(train_dataloader)
        val_loss = test(model, val_dataloader, params)
        print("| Epoch {} | running {:.2f} seconds | train Loss {:.6f} | val loss {:.6f} |".format(
            epoch, time.time()-start_time, train_loss, val_loss))

    return model

In [None]:
# Cross-validation window sizes, in time_id periods.
train_period_length = 5 * 12 * 5
test_period_length = 5 * 5
# (previously tried: train_period_length = 120, test_period_length = 24)
n_splits = 1
lookahead = 1

# Model / optimisation hyper-parameters shared by the model and train/test.
params = dict(
    lr=5e-4,
    n_in=300,
    batch_size=256,
    n_hidden1=525,
    n_hidden2=210,
    n_hidden3=128,
    n_hidden4=64,
    epoch=2,
    clip=None,
    log_interval=1000,
    T_max=10,
    act_f=nn.ReLU,
    criterion=nn.MSELoss(reduction='mean'),
)

In [None]:
# Splitter over the 'time_id' index level, configured from the settings above.
cv = MultipleTimeSeriesCV(
    date_idx='time_id',
    n_splits=n_splits,
    lookahead=lookahead,
    train_period_length=train_period_length,
    test_period_length=test_period_length,
)

In [None]:
# One fitted model per CV fold, collected for the inference stage.
models = []

for fold, (train_idx, test_idx) in tqdm(enumerate(cv.split(features)), total=cv.n_splits):
    print(f'fold: {fold}')
    # train-val split
    train_set, val_set = get_train_valid_data(features, target, train_idx, test_idx)
    train_loader = DataLoader(train_set, batch_size=params['batch_size'], shuffle=True, drop_last=True, num_workers=4)
    # Validation needs no shuffling; a fixed order keeps the batches stable.
    val_loader = DataLoader(val_set, batch_size=params['batch_size'], shuffle=False, drop_last=False, num_workers=4)
    # train
    fold_model = train(nn_regressor, train_loader, val_loader, params)
    # Keep the fitted network whenever train() hands one back, so `models`
    # is not left empty after training.
    if fold_model is not None:
        models.append(fold_model)

In [None]:
# NOTE(review): `ubiquant` is presumably the competition's submission API
# module, available only inside the Kaggle environment - confirm.
import ubiquant
# Environment object whose iter_test() / predict() drive the inference loop.
env = ubiquant.make_env()


In [None]:
# Fail fast instead of averaging an empty list, which would submit NaN.
if not models:
    raise RuntimeError('no trained models available: populate `models` before running inference')

iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # Select only the f_<n> feature columns. A dedicated name keeps the
    # training `features` frame from being overwritten.
    # NOTE(review): assumes test_df lists the feature columns in the same
    # order as the training frame - confirm.
    test_features = test_df.filter(regex=r'^f_\d+$')
    inputs = torch.from_numpy(test_features.to_numpy(dtype=np.float32))

    y_preds = []
    with torch.no_grad():
        for model in models:
            # eval mode so BatchNorm uses its running statistics.
            model.eval()
            # The network emits shape (n, 1); flatten to one value per row.
            y_preds.append(model(inputs).squeeze(-1).cpu().numpy())
    # Ensemble by averaging the per-model predictions.
    sample_prediction_df['target'] = np.mean(y_preds, axis=0)
    env.predict(sample_prediction_df)