## Import

In [1]:
import pandas as pd
import numpy as np
np.random.seed(2023)
import pickle
from sklearn.metrics import mean_squared_error
import os
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
# pip install pytorch-tabnet
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
# ignore warnings
import warnings ; warnings.filterwarnings('ignore')

## Read data

In [2]:
symphony_feature, chorus_feature, voice_feature, solo_feature, chamber_feature = pickle.load(open(f'../data/Features.pkl','rb'))

## TABNET

In [3]:
def TabNet_modeling(X, y):
    # Road model
    reg = TabNetRegressor(optimizer_fn=torch.optim.Adam,
                          optimizer_params=dict(lr=1e-2),
                          scheduler_params={"step_size":50,"gamma":0.9},
                          scheduler_fn=torch.optim.lr_scheduler.StepLR,
                          mask_type='sparsemax', verbose=0, seed=2023)  # "sparsemax", entmax
    
    # Split data 
    X['SET'] = np.random.choice(["train", "val", "test"], p =[.8, .1, .1], size=(X.shape[0],))
    X_train, X_val, X_test = X.query('SET=="train"').drop('SET', axis=1), X.query('SET=="val"').drop('SET', axis=1), X.query('SET=="test"').drop('SET', axis=1)
    y_train, y_val, y_test = y.loc[X_train.index], y.loc[X_val.index], y.loc[X_test.index]
    
    # Tunning
    reg.fit(X_train=X_train.values, y_train=y_train.values,
            eval_set=[(X_train.values, y_train.values), (X_val.values, y_val.values)],
            eval_name=['train', 'valid'],
            eval_metric=['rmse'],
            max_epochs=1000 , patience=300,
            batch_size=1024, virtual_batch_size=128,
            num_workers=0,
            drop_last=False)
    
    # Predict
    preds = reg.predict(X_test.values)
    test_mse = mean_squared_error(y_pred=preds, y_true=y_test)   
    
    # Selection
    fimportance = pd.DataFrame({'feature': X_train.columns, 'importance':reg.feature_importances_})
    unuse = fimportance.query('importance==0').feature.values.tolist()
    if len(unuse) != 0:
        X = X.drop(unuse, axis=1)
        return X, reg.best_cost, test_mse, reg
    else:
        return None, 'Done!', 'Done!', None

- 교향곡

In [4]:
X = symphony_feature.drop([i for i in symphony_feature.columns if 'TARGET' in i], axis=1)
y = symphony_feature[[i for i in symphony_feature.columns if 'TARGET' in i]]

In [5]:
first_selected_symphony_feature, val_rmse, test_rmse, first_model_symphony = TabNet_modeling(X, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 539 with best_epoch = 239 and best_valid_rmse = 164.8821
BEST VALID SCORE FOR DATA : 164.8821016529471
FINAL TEST SCORE FOR DATA : 42275.14319664677


In [6]:
# 최종 모델
second_selected_symphony_feature, val_rmse, test_rmse, second_model_symphony = TabNet_modeling(first_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 511 with best_epoch = 211 and best_valid_rmse = 200.51576
BEST VALID SCORE FOR DATA : 200.51576071092336
FINAL TEST SCORE FOR DATA : 31722.005369391303


In [7]:
third_selected_symphony_feature, val_rmse, test_rmse, third_model_symphony = TabNet_modeling(second_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 566 with best_epoch = 266 and best_valid_rmse = 168.1597
BEST VALID SCORE FOR DATA : 168.1597002912043
FINAL TEST SCORE FOR DATA : 26678.12271861361


In [8]:
fourth_selected_symphony_feature, val_rmse, test_rmse, fourth_model_symphony = TabNet_modeling(third_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 556 with best_epoch = 256 and best_valid_rmse = 185.09874
BEST VALID SCORE FOR DATA : 185.09874135531803
FINAL TEST SCORE FOR DATA : 53888.50646498374


In [9]:
fifth_selected_symphony_feature, val_rmse, test_rmse, fifth_model_symphony = TabNet_modeling(fourth_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 656 with best_epoch = 356 and best_valid_rmse = 184.65765
BEST VALID SCORE FOR DATA : 184.65765240565392
FINAL TEST SCORE FOR DATA : 55746.25784871587


In [10]:
sixth_selected_symphony_feature, val_rmse, test_rmse, sixth_model_symphony = TabNet_modeling(fifth_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 391 with best_epoch = 91 and best_valid_rmse = 218.71002
BEST VALID SCORE FOR DATA : 218.71002205434337
FINAL TEST SCORE FOR DATA : 38695.22940554545


In [11]:
seventh_selected_symphony_feature, val_rmse, test_rmse, seventh_model_symphony = TabNet_modeling(sixth_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")

Stop training because you reached max_epochs = 1000 with best_epoch = 862 and best_valid_rmse = 160.5521
BEST VALID SCORE FOR DATA : Done!
FINAL TEST SCORE FOR DATA : Done!


- 합창

In [12]:
# 최종 모델
X = chorus_feature.drop([i for i in chorus_feature.columns if 'TARGET' in i], axis=1)
y = chorus_feature[[i for i in chorus_feature.columns if 'TARGET' in i]]

In [13]:
first_selected_chorus_feature, val_rmse, test_rmse, first_model_chorus = TabNet_modeling(X, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 396 with best_epoch = 96 and best_valid_rmse = 104.18992
BEST VALID SCORE FOR DATA : 104.18991759238911
FINAL TEST SCORE FOR DATA : 36038.10356112092


In [14]:
second_selected_chorus_feature, val_rmse, test_rmse, second_model_chorus = TabNet_modeling(first_selected_chorus_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 537 with best_epoch = 237 and best_valid_rmse = 117.22899
BEST VALID SCORE FOR DATA : Done!
FINAL TEST SCORE FOR DATA : Done!


- 성악

In [15]:
# 최종 모델
X = voice_feature.drop([i for i in voice_feature.columns if 'TARGET' in i], axis=1)
y = voice_feature[[i for i in voice_feature.columns if 'TARGET' in i]]

In [16]:
first_selected_voice_feature, val_rmse, test_rmse, first_model_voice = TabNet_modeling(X, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 575 with best_epoch = 275 and best_valid_rmse = 61.16147
BEST VALID SCORE FOR DATA : 61.161470206090875
FINAL TEST SCORE FOR DATA : 18832.786757205522


In [17]:
second_selected_voice_feature, val_rmse, test_rmse, second_model_voice = TabNet_modeling(first_selected_voice_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 668 with best_epoch = 368 and best_valid_rmse = 33.9581
BEST VALID SCORE FOR DATA : Done!
FINAL TEST SCORE FOR DATA : Done!


- 독주

In [18]:
X = solo_feature.drop([i for i in solo_feature.columns if 'TARGET' in i], axis=1)
y = solo_feature[[i for i in solo_feature.columns if 'TARGET' in i]]

In [19]:
first_selected_solo_feature, val_rmse, test_rmse, first_model_solo = TabNet_modeling(X, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 350 with best_epoch = 50 and best_valid_rmse = 158.7263
BEST VALID SCORE FOR DATA : 158.7262962824763
FINAL TEST SCORE FOR DATA : 88863.69298169497


In [20]:
# 최종 모델
second_selected_solo_feature, val_rmse, test_rmse, second_model_solo = TabNet_modeling(first_selected_solo_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 664 with best_epoch = 364 and best_valid_rmse = 129.77311
BEST VALID SCORE FOR DATA : 129.77311255960308
FINAL TEST SCORE FOR DATA : 159624.66722288562


In [21]:
third_selected_solo_feature, val_rmse, test_rmse, third_model_solo = TabNet_modeling(second_selected_solo_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 654 with best_epoch = 354 and best_valid_rmse = 55.04906
BEST VALID SCORE FOR DATA : 55.049061505458965
FINAL TEST SCORE FOR DATA : 20754.88604744343


In [24]:
fourth_selected_solo_feature, val_rmse, test_rmse, fourth_model_solo = TabNet_modeling(third_selected_solo_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")

Stop training because you reached max_epochs = 1000 with best_epoch = 934 and best_valid_rmse = 89.97768
BEST VALID SCORE FOR DATA : Done!
FINAL TEST SCORE FOR DATA : Done!


- 실내악

In [25]:
X = chamber_feature.drop([i for i in chamber_feature.columns if 'TARGET' in i], axis=1)
y = chamber_feature[[i for i in chamber_feature.columns if 'TARGET' in i]]

In [26]:
first_selected_chamber_feature, val_rmse, test_rmse, first_model_chamber = TabNet_modeling(X, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 891 with best_epoch = 591 and best_valid_rmse = 49.20571
BEST VALID SCORE FOR DATA : 49.20571073031093
FINAL TEST SCORE FOR DATA : 26829.03361086408


In [27]:
# 최종 모델
second_selected_chamber_feature, val_rmse, test_rmse, second_model_chamber = TabNet_modeling(first_selected_chamber_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 980 with best_epoch = 680 and best_valid_rmse = 122.10502
BEST VALID SCORE FOR DATA : 122.1050167227347
FINAL TEST SCORE FOR DATA : 21873.032723672866


In [28]:
third_selected_chamber_feature, val_rmse, test_rmse, third_model_chamber = TabNet_modeling(second_selected_chamber_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 961 with best_epoch = 661 and best_valid_rmse = 28.47508
BEST VALID SCORE FOR DATA : 28.4750808843294
FINAL TEST SCORE FOR DATA : 31782.73019420714


In [29]:
fourth_selected_chamber_feature, val_rmse, test_rmse, fourth_model_chamber = TabNet_modeling(third_selected_chamber_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")

Stop training because you reached max_epochs = 1000 with best_epoch = 908 and best_valid_rmse = 82.70715
BEST VALID SCORE FOR DATA : 82.70715033844876
FINAL TEST SCORE FOR DATA : 40919.92804516618


In [30]:
fifth_selected_chamber_feature, val_rmse, test_rmse, fifth_model_chamber = TabNet_modeling(fourth_selected_chamber_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 793 with best_epoch = 493 and best_valid_rmse = 91.4251
BEST VALID SCORE FOR DATA : 91.4250996432538
FINAL TEST SCORE FOR DATA : 45383.83921841585


In [31]:
sixth_selected_chamber_feature, val_rmse, test_rmse, sixth_model_chamber = TabNet_modeling(fifth_selected_chamber_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}")
print(f"FINAL TEST SCORE FOR DATA : {test_rmse}")


Early stopping occurred at epoch 741 with best_epoch = 441 and best_valid_rmse = 92.13339
BEST VALID SCORE FOR DATA : Done!
FINAL TEST SCORE FOR DATA : Done!


## Save data

In [33]:
pickle.dump((second_selected_symphony_feature,
             chorus_feature.drop([i for i in chorus_feature.columns if 'TARGET' in i], axis=1),
             voice_feature.drop([i for i in voice_feature.columns if 'TARGET' in i], axis=1) ,
             second_selected_solo_feature, 
             second_selected_chamber_feature), 
            open(f'../data/Final_useful_features.pkl', 'wb'))