## Import

In [1]:
import pandas as pd
import numpy as np
np.random.seed(2023)
import pickle
from sklearn.metrics import mean_squared_error
import os
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
# pip install pytorch-tabnet
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
# ignore warnings
import warnings ; warnings.filterwarnings('ignore')

## Read data

In [2]:
# Load the five per-genre feature tables (symphony, chorus, voice, solo, chamber).
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open('../data/SaleRatePredictionFeatures.pkl', 'rb') as feature_file:
    # The context manager closes the handle even if unpickling fails.
    symphony_feature, chorus_feature, voice_feature, solo_feature, chamber_feature = pickle.load(feature_file)

## TABNET

In [3]:
def TabNet_modeling(X, y):
    """Fit a TabNet regressor on a random 80/10/10 split and prune unused features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is NOT modified by this function. A column named
        ``SET`` (the split label that earlier versions of this function wrote
        into the frame) is ignored if present.
    y : pandas.DataFrame or pandas.Series
        Targets, addressable with the index labels of ``X``.

    Returns
    -------
    tuple
        ``(selected_X, reg.best_cost, test_mse, reg)`` when at least one
        feature received an importance of exactly 0; ``selected_X`` is the
        feature table without those columns (and without any ``SET`` column).
        ``(None, 'Done!', 'Done!', None)`` when every feature has non-zero
        importance, i.e. there is nothing left to prune.

    Notes
    -----
    * The split is drawn from NumPy's global random state, so the result
      depends on how many random draws happened before the call.
    * The third element is a mean *squared* error, while the eval metric used
      during fitting is RMSE; the two numbers are not on the same scale.
    """
    # Build the model
    reg = TabNetRegressor(optimizer_fn=torch.optim.Adam,
                          optimizer_params=dict(lr=1e-2),
                          scheduler_params={"step_size":50,"gamma":0.9},
                          scheduler_fn=torch.optim.lr_scheduler.StepLR,
                          mask_type='sparsemax', verbose=0, seed=2023)  # "sparsemax", entmax

    # Work on a copy without any stale split label so the caller's frame is
    # never mutated and the label can never leak into the returned features.
    features = X.drop(columns=['SET'], errors='ignore')

    # Split data: one random label per row, kept in a local array
    split = np.random.choice(["train", "val", "test"], p=[.8, .1, .1], size=(features.shape[0],))
    X_train, X_val, X_test = (features[split == part] for part in ("train", "val", "test"))
    y_train, y_val, y_test = y.loc[X_train.index], y.loc[X_val.index], y.loc[X_test.index]

    # Training (early stopping is driven by the eval sets listed below)
    reg.fit(X_train=X_train.values, y_train=y_train.values,
            eval_set=[(X_train.values, y_train.values), (X_val.values, y_val.values)],
            eval_name=['train', 'valid'],
            eval_metric=['rmse'],
            max_epochs=1000 , patience=300,
            batch_size=1024, virtual_batch_size=128,
            num_workers=0,
            drop_last=False)

    # Predict on the held-out test part
    preds = reg.predict(X_test.values)
    test_mse = mean_squared_error(y_pred=preds, y_true=y_test)

    # Selection: features with exactly zero importance are dropped
    fimportance = pd.DataFrame({'feature': X_train.columns, 'importance': reg.feature_importances_})
    unuse = fimportance.query('importance==0').feature.values.tolist()
    if len(unuse) != 0:
        return features.drop(unuse, axis=1), reg.best_cost, test_mse, reg
    # Nothing to prune: signal the caller that the selection loop is finished.
    return None, 'Done!', 'Done!', None

- 교향곡

In [4]:
# Symphony: columns whose name contains 'TARGET' are the labels, the rest are features.
target_cols = [col for col in symphony_feature.columns if 'TARGET' in col]
X = symphony_feature.drop(target_cols, axis=1)
y = symphony_feature[target_cols]

In [5]:
# Symphony, round 1: fit on every feature, then drop the zero-importance ones.
(first_selected_symphony_feature,
 val_rmse,
 test_rmse,
 first_model_symphony) = TabNet_modeling(X, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 377 with best_epoch = 77 and best_valid_rmse = 0.44282
BEST VALID SCORE FOR DATA : 0.44282195050959755
FINAL TEST SCORE FOR DATA : 0.23989544964675905


In [6]:
# Symphony, round 2: refit on the features kept by round 1.
(second_selected_symphony_feature,
 val_rmse,
 test_rmse,
 second_model_symphony) = TabNet_modeling(first_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 303 with best_epoch = 3 and best_valid_rmse = 2.4785
BEST VALID SCORE FOR DATA : 2.4784997098488972
FINAL TEST SCORE FOR DATA : 50.07231011456347


In [7]:
# Symphony, round 3: refit on the features kept by round 2.
(third_selected_symphony_feature,
 val_rmse,
 test_rmse,
 third_model_symphony) = TabNet_modeling(second_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 787 with best_epoch = 487 and best_valid_rmse = 0.37664
BEST VALID SCORE FOR DATA : 0.3766399252879392
FINAL TEST SCORE FOR DATA : 0.18350811273240622


In [8]:
# Final model
# Symphony, round 4: refit on the features kept by round 3.
(fourth_selected_symphony_feature,
 val_rmse,
 test_rmse,
 fourth_model_symphony) = TabNet_modeling(third_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 334 with best_epoch = 34 and best_valid_rmse = 0.61289
BEST VALID SCORE FOR DATA : 0.6128896186300978
FINAL TEST SCORE FOR DATA : 710.3275096968365


In [9]:
# Symphony, round 5: refit on the features kept by round 4.
(fifth_selected_symphony_feature,
 val_rmse,
 test_rmse,
 fifth_model_symphony) = TabNet_modeling(fourth_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 552 with best_epoch = 252 and best_valid_rmse = 0.34518
BEST VALID SCORE FOR DATA : 0.34518276186660823
FINAL TEST SCORE FOR DATA : 0.1259805243662003


In [10]:
# Symphony, round 6: refit on the features kept by round 5.
# TabNet_modeling returns the 'Done!' placeholders once no feature has zero importance.
(sixth_selected_symphony_feature,
 val_rmse,
 test_rmse,
 sixth_model_symphony) = TabNet_modeling(fifth_selected_symphony_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 562 with best_epoch = 262 and best_valid_rmse = 0.34538
BEST VALID SCORE FOR DATA : Done!
FINAL TEST SCORE FOR DATA : Done!


- 합창

In [11]:
# Final model
# Chorus: columns whose name contains 'TARGET' are the labels, the rest are features.
target_cols = [col for col in chorus_feature.columns if 'TARGET' in col]
X = chorus_feature.drop(target_cols, axis=1)
y = chorus_feature[target_cols]

In [12]:
# Chorus, round 1: fit on every feature, then drop the zero-importance ones.
(first_selected_chorus_feature,
 val_rmse,
 test_rmse,
 first_model_chorus) = TabNet_modeling(X, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 778 with best_epoch = 478 and best_valid_rmse = 0.46555
BEST VALID SCORE FOR DATA : 0.4655470148539781
FINAL TEST SCORE FOR DATA : 0.20799941084478113


In [13]:
# Chorus, round 2: refit on the features kept by round 1.
(second_selected_chorus_feature,
 val_rmse,
 test_rmse,
 second_model_chorus) = TabNet_modeling(first_selected_chorus_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 413 with best_epoch = 113 and best_valid_rmse = 0.53243
BEST VALID SCORE FOR DATA : 0.5324326927044309
FINAL TEST SCORE FOR DATA : 0.12856383586447911


- 성악

In [15]:
# Voice: columns whose name contains 'TARGET' are the labels, the rest are features.
target_cols = [col for col in voice_feature.columns if 'TARGET' in col]
X = voice_feature.drop(target_cols, axis=1)
y = voice_feature[target_cols]

In [16]:
# Final model
# Voice, round 1: fit on every feature, then drop the zero-importance ones.
(first_selected_voice_feature,
 val_rmse,
 test_rmse,
 first_model_voice) = TabNet_modeling(X, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 504 with best_epoch = 204 and best_valid_rmse = 0.32671
BEST VALID SCORE FOR DATA : 0.32671330440768853
FINAL TEST SCORE FOR DATA : 22.099918787583867


In [17]:
# Voice, round 2: refit on the features kept by round 1.
(second_selected_voice_feature,
 val_rmse,
 test_rmse,
 second_model_voice) = TabNet_modeling(first_selected_voice_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 886 with best_epoch = 586 and best_valid_rmse = 0.12837
BEST VALID SCORE FOR DATA : 0.1283736560575514
FINAL TEST SCORE FOR DATA : 0.09643088376399896


In [19]:
# Voice, round 3: refit on the features kept by round 2.
(third_selected_voice_feature,
 val_rmse,
 test_rmse,
 third_model_voice) = TabNet_modeling(second_selected_voice_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 403 with best_epoch = 103 and best_valid_rmse = 0.1881
BEST VALID SCORE FOR DATA : 0.18810108340403497
FINAL TEST SCORE FOR DATA : 0.17642773066072176


In [20]:
# Voice, round 4: refit on the features kept by round 3.
# TabNet_modeling returns the 'Done!' placeholders once no feature has zero importance.
(fourth_selected_voice_feature,
 val_rmse,
 test_rmse,
 fourth_model_voice) = TabNet_modeling(third_selected_voice_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 623 with best_epoch = 323 and best_valid_rmse = 0.41191
BEST VALID SCORE FOR DATA : Done!
FINAL TEST SCORE FOR DATA : Done!


- 독주

In [21]:
# Final model
# Solo: columns whose name contains 'TARGET' are the labels, the rest are features.
target_cols = [col for col in solo_feature.columns if 'TARGET' in col]
X = solo_feature.drop(target_cols, axis=1)
y = solo_feature[target_cols]

In [22]:
# Solo, round 1: fit on every feature, then drop the zero-importance ones.
(first_selected_solo_feature,
 val_rmse,
 test_rmse,
 first_model_solo) = TabNet_modeling(X, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 394 with best_epoch = 94 and best_valid_rmse = 0.39046
BEST VALID SCORE FOR DATA : 0.3904626979699089
FINAL TEST SCORE FOR DATA : 0.18251669189340375


In [23]:
# Solo, round 2: refit on the features kept by round 1.
# TabNet_modeling returns the 'Done!' placeholders once no feature has zero importance.
(second_selected_solo_feature,
 val_rmse,
 test_rmse,
 second_model_solo) = TabNet_modeling(first_selected_solo_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 513 with best_epoch = 213 and best_valid_rmse = 0.32068
BEST VALID SCORE FOR DATA : Done!
FINAL TEST SCORE FOR DATA : Done!


- 실내악

In [24]:
# Chamber: columns whose name contains 'TARGET' are the labels, the rest are features.
target_cols = [col for col in chamber_feature.columns if 'TARGET' in col]
X = chamber_feature.drop(target_cols, axis=1)
y = chamber_feature[target_cols]

In [25]:
# Final model
# Chamber, round 1: fit on every feature, then drop the zero-importance ones.
(first_selected_chamber_feature,
 val_rmse,
 test_rmse,
 first_model_chamber) = TabNet_modeling(X, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 632 with best_epoch = 332 and best_valid_rmse = 0.25366
BEST VALID SCORE FOR DATA : 0.25365698862744557
FINAL TEST SCORE FOR DATA : 0.06997700386363327


In [26]:
# Chamber, round 2: refit on the features kept by round 1.
(second_selected_chamber_feature,
 val_rmse,
 test_rmse,
 second_model_chamber) = TabNet_modeling(first_selected_chamber_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 813 with best_epoch = 513 and best_valid_rmse = 0.12287
BEST VALID SCORE FOR DATA : 0.12286779365576868
FINAL TEST SCORE FOR DATA : 0.05712674240821076


In [27]:
# Chamber, round 3: refit on the features kept by round 2.
# TabNet_modeling returns the 'Done!' placeholders once no feature has zero importance.
(third_selected_chamber_feature,
 val_rmse,
 test_rmse,
 third_model_chamber) = TabNet_modeling(second_selected_chamber_feature, y)
print(f"BEST VALID SCORE FOR DATA : {val_rmse}",
      f"FINAL TEST SCORE FOR DATA : {test_rmse}",
      sep="\n")


Early stopping occurred at epoch 616 with best_epoch = 316 and best_valid_rmse = 0.22706
BEST VALID SCORE FOR DATA : Done!
FINAL TEST SCORE FOR DATA : Done!


## Save data

In [28]:
# Feature tables to persist, in the order: symphony, chorus, voice, solo, chamber.
# Chorus and solo keep every non-TARGET column; the others use a pruned table.
useful_features = (
    fourth_selected_symphony_feature,
    chorus_feature.drop([i for i in chorus_feature.columns if 'TARGET' in i], axis=1),
    first_selected_voice_feature,
    solo_feature.drop([i for i in solo_feature.columns if 'TARGET' in i], axis=1),
    first_selected_chamber_feature,
)

# The pruned tables may still carry the random train/val/test label column
# 'SET'; it is not a feature, so strip it and keep all five tables consistent.
useful_features = tuple(
    frame.drop(columns=['SET'], errors='ignore') for frame in useful_features
)

# Use a context manager so the file is flushed and closed deterministically.
with open('../data/SaleRatePredictionFeatures_useful.pkl', 'wb') as output_file:
    pickle.dump(useful_features, output_file)