# Create missing mask for train, val, test tabular data

In [1]:
'''
* Licensed under the Apache License, Version 2.
* By Siyi Du, 2024
'''
import numpy as np
import pandas as pd
from os.path import join, dirname
import torch
from sklearn.metrics import roc_auc_score, accuracy_score

In [2]:
'''
create_missing_mask(data_tabular_path, mask_path, random_seed, missing_strategy, missing_rate)
: 값 단위 또는 특징 단위로 무작위로 결측값을 생성하는 마스크 생성
'''
def create_missing_mask(data_tabular_path, mask_path, random_seed, missing_strategy, missing_rate):
    '''
    Build a random boolean missingness mask for a tabular CSV and save it.

    Args:
        data_tabular_path: path of a header-less CSV; only its (rows, columns)
            shape is used to size the mask.
        mask_path: destination passed to ``np.save``.
        random_seed: seed given to ``np.random.seed`` (NumPy's global RNG) so
            the same mask is reproduced for the same seed and shape.
        missing_strategy: 'value' (random value missingness: individual cells
            are masked) or 'feature' (random feature missingness: whole
            columns are masked for every row).
        missing_rate: 0.0-1.0. The number of masked cells/columns is
            ``int(total * missing_rate)``, i.e. rounded down, so the realised
            rate can be lower than requested.

    Returns:
        Boolean array of shape (rows, columns); True marks a missing entry.

    Raises:
        ValueError: if ``missing_strategy`` is neither 'value' nor 'feature'.
    '''
    # Reject unsupported strategies up front, before touching any file.
    if missing_strategy not in ('value', 'feature'):
        raise ValueError('Only support value and feature missing strategy')

    data_tabular = np.array(pd.read_csv(data_tabular_path, header=None))
    print(f'data tabular shape: {data_tabular.shape}')
    np.random.seed(random_seed)
    M, N = data_tabular.shape[0], data_tabular.shape[1]

    if missing_strategy == 'value':
        # Sample flat cell indices without replacement, then fold back to (M, N).
        missing_mask_data = np.zeros((M*N), dtype=bool)
        mask_pos = np.random.choice(M*N, size=int(M*N*missing_rate), replace=False)
        missing_mask_data[mask_pos] = True
        missing_mask_data = missing_mask_data.reshape((M,N))
    else:
        # 'feature': sample column indices and mask those columns in every row.
        missing_mask_data = np.zeros((M,N), dtype=bool)
        mask_pos = np.random.choice(N, size=int(N*missing_rate), replace=False)
        missing_mask_data[:,mask_pos] = True

    np.save(mask_path, missing_mask_data)
    print(f'Real missing rate: {missing_mask_data.sum()/missing_mask_data.size}')
    print(f'Save missing mask to {mask_path}')
    return missing_mask_data

'''
create_certain_missing_mask(data_tabular_path, mask_path, mask_pos_order, missing_strategy, missing_rate)
: 특정 순서를 기반하여 결측값 생성 마스크.
'''
def create_certain_missing_mask(data_tabular_path, mask_path, mask_pos_order, missing_strategy, missing_rate):
    '''
    Create a feature-level mask according to a mask order list
    (for MI and LI feature missingness) and save it.

        MI: columns ordered by descending random-forest feature importance
            (most important first)
        LI: columns ordered by ascending random-forest feature importance
            (least important first)

    Args:
        data_tabular_path: path of a header-less CSV; only its (rows, columns)
            shape is used to size the mask.
        mask_path: destination passed to ``np.save``.
        mask_pos_order: column indices in masking priority order; must have
            exactly one entry per data column.
        missing_strategy: not used by this function; it is accepted so the
            signature parallels ``create_missing_mask``.
        missing_rate: fraction of columns to mask. The first
            ``int(columns * missing_rate)`` entries of ``mask_pos_order`` are
            masked, i.e. the count is rounded down.

    Returns:
        Boolean array of shape (rows, columns); True marks a missing entry.

    Raises:
        ValueError: if ``len(mask_pos_order)`` differs from the column count.
    '''
    data_tabular = np.array(pd.read_csv(data_tabular_path, header=None))
    print(f'data tabular shape: {data_tabular.shape}')
    M, N = data_tabular.shape[0], data_tabular.shape[1]
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if N != len(mask_pos_order):
        raise ValueError(
            f'mask_pos_order has {len(mask_pos_order)} entries but the data has {N} columns'
        )
    mask_pos = mask_pos_order[:int(N*missing_rate)]
    missing_mask_data = np.zeros((M,N), dtype=bool)
    # The selected columns are masked for every row.
    missing_mask_data[:,mask_pos] = True
    np.save(mask_path, missing_mask_data)
    print(f'Real missing rate: {missing_mask_data.sum()/missing_mask_data.size}')
    print(f'Save missing mask to {mask_path}')
    return missing_mask_data

## DVM

In [3]:
# TODO: change to your own path
# Directory that holds the feature CSVs and label files read by this notebook.
FEATURES = '/data/ephemeral/home/data/base_features'
# Sub-directory of FEATURES where the generated missing-mask .npy files are saved.
MASK_PATH = join(FEATURES, 'missing_mask')

#### Random mask

In [4]:
# Value-level random missingness: individual cells are masked at random.
missing_strategy = 'value'
# Fraction of entries to mask.
missing_rate = 0.3
target = 'dvm'

train_name = 'dvm_features_train_noOH_all_views_reordered.csv'
val_name = 'dvm_features_val_noOH_all_views_reordered.csv'
test_name = 'dvm_features_test_noOH_all_views_reordered.csv'

# Each split gets its own seed (2021 / 2022 / 2023).
csv_names = [train_name, val_name, test_name]
seeds = [2021, 2022, 2023]
splits = ['train', 'val', 'test']
for name, seed, split in zip(csv_names, seeds, splits):
    path = join(FEATURES, name)
    # name[:-4] drops the '.csv' suffix before the mask tag is appended.
    mask_filename = f'{name[:-4]}_{target}_{missing_strategy}_{missing_rate}.npy'
    save_mask_path = join(MASK_PATH, mask_filename)
    create_missing_mask(path, save_mask_path, seed, missing_strategy, missing_rate)

data tabular shape: (106676, 16)
Real missing rate: 0.29999953129101203
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_train_noOH_all_views_reordered_dvm_value_0.3.npy
data tabular shape: (26669, 16)
Real missing rate: 0.29999953129101203
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_val_noOH_all_views_reordered_dvm_value_0.3.npy
data tabular shape: (33337, 16)
Real missing rate: 0.2999988751237364
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_test_noOH_all_views_reordered_dvm_value_0.3.npy


In [5]:
# Feature-level random missingness: whole columns are masked at random.
# `missing_rate` and `target` are reused from the cell that set them earlier.
missing_strategy = 'feature'

train_name = 'dvm_features_train_noOH_all_views_reordered.csv'
val_name = 'dvm_features_val_noOH_all_views_reordered.csv'
test_name = 'dvm_features_test_noOH_all_views_reordered.csv'

# The same seed (2022) is passed for every split.
csv_names = [train_name, val_name, test_name]
seeds = [2022, 2022, 2022]
splits = ['train', 'val', 'test']
for name, seed, split in zip(csv_names, seeds, splits):
    path = join(FEATURES, name)
    # name[:-4] drops the '.csv' suffix before the mask tag is appended.
    mask_filename = f'{name[:-4]}_{target}_{missing_strategy}_{missing_rate}.npy'
    save_mask_path = join(MASK_PATH, mask_filename)
    create_missing_mask(path, save_mask_path, seed, missing_strategy, missing_rate)

data tabular shape: (106676, 16)
Real missing rate: 0.25
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_train_noOH_all_views_reordered_dvm_feature_0.3.npy
data tabular shape: (26669, 16)
Real missing rate: 0.25
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_val_noOH_all_views_reordered_dvm_feature_0.3.npy
data tabular shape: (33337, 16)
Real missing rate: 0.25
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_test_noOH_all_views_reordered_dvm_feature_0.3.npy


In [6]:
# Consistency check for the feature-level masks: load the saved train, val and
# test masks and print the first row of each so the masked columns can be
# compared across the three splits.

train_np = np.load(join(MASK_PATH, f'{train_name[:-4]}_dvm_feature_0.3.npy'))
val_np = np.load(join(MASK_PATH, f'{val_name[:-4]}_dvm_feature_0.3.npy'))
test_np = np.load(join(MASK_PATH, f'{test_name[:-4]}_dvm_feature_0.3.npy'))
for split_mask in (train_np, val_np, test_np):
    print(split_mask[0])

[False False False  True  True False  True False False False  True False
 False False False False]
[False False False  True  True False  True False False False  True False
 False False False False]
[False False False  True  True False  True False False False  True False
 False False False False]


#### Mask based on importance

Random Forest 등 트리 기반 회귀 모델(Regressor)을 훈련 데이터로 학습하고 테스트 데이터에 대한 예측을 수행하여 특징 중요도를 계산

-> Masked 값을 찾는 것이 아닌, 레이블 값 학습과 예측

In [7]:
# Fit three tree-based regressors on the training features, score them on the
# test split (MSE and R²), and report the feature importances of the model
# with the lowest test MSE.
# NOTE(review): the labels are fitted with *regressors*, while the surrounding
# notebook text mentions a classifier — confirm that regression is intended.
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Column names (presumably in the same order as the CSV columns — TODO confirm)
reordered_column_name = ['Maker', 'Genmodel', 'Color', 'Bodytype', 'Gearbox', 'Fuel_type',
                'Wheelbase', 'Height', 'Width', 'Length', 'Seat_num', 'Door_num', 'Entry_price', 
                'Year', 'First_release_year', 'Engine_size']

# Load the header-less feature CSVs and the label files
# NOTE(review): torch.load may unpickle arbitrary objects depending on the
# PyTorch version / weights_only setting — only load trusted files.
X_train = pd.read_csv(join(FEATURES, 'dvm_features_train_noOH_all_views_reordered.csv'), header=None)
X_test = pd.read_csv(join(FEATURES, f'dvm_features_test_noOH_all_views_reordered.csv'), header=None)
y_train = torch.load(join(FEATURES, 'labels_model_all_train_all_views.pt'))
y_test = torch.load(join(FEATURES, 'labels_model_all_test_all_views.pt'))

# Standardise the features: fit the scaler on train only, then apply it to test
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. RandomForest
print("=== RandomForest 결과 ===")
rf = RandomForestRegressor(random_state=2022)
rf.fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)

rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)
print(f"MSE: {rf_mse:.4f}")
print(f"R² Score: {rf_r2:.4f}")

# 2. GradientBoosting
print("\n=== GradientBoosting 결과 ===")
gb = GradientBoostingRegressor(random_state=2022)
gb.fit(X_train_scaled, y_train)
gb_pred = gb.predict(X_test_scaled)

gb_mse = mean_squared_error(y_test, gb_pred)
gb_r2 = r2_score(y_test, gb_pred)
print(f"MSE: {gb_mse:.4f}")
print(f"R² Score: {gb_r2:.4f}")

# 3. XGBoost
print("\n=== XGBoost 결과 ===")
xgb = XGBRegressor(random_state=2022)
xgb.fit(X_train_scaled, y_train)
xgb_pred = xgb.predict(X_test_scaled)

xgb_mse = mean_squared_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)
print(f"MSE: {xgb_mse:.4f}")
print(f"R² Score: {xgb_r2:.4f}")

# Side-by-side comparison of the three models
print("\n=== 모델 성능 비교 ===")
results = pd.DataFrame({
    'Model': ['RandomForest', 'GradientBoosting', 'XGBoost'],
    'MSE': [rf_mse, gb_mse, xgb_mse],
    'R² Score': [rf_r2, gb_r2, xgb_r2]
})
print(results)

# Sanity check: number of data columns vs. number of declared column names
print(f"\n데이터 컬럼 수: {X_train.shape[1]}")
print(f"정의된 컬럼명 수: {len(reordered_column_name)}")

# Pick the model with the lowest test MSE. best_model is the whole tuple:
# [0] = MSE, [1] = fitted estimator, [2] = display name.
best_model = min([(rf_mse, rf, 'RandomForest'), 
                 (gb_mse, gb, 'GradientBoosting'), 
                 (xgb_mse, xgb, 'XGBoost')], 
                key=lambda x: x[0])

# Print the selected model's feature importances, most important first
print(f"\n=== {best_model[2]} 특징 중요도 ===")
feature_importance = pd.DataFrame({
    'feature': reordered_column_name,
    'importance': best_model[1].feature_importances_
})
print(feature_importance.sort_values('importance', ascending=False))

=== RandomForest 결과 ===
MSE: 0.0004
R² Score: 0.9996

=== GradientBoosting 결과 ===
MSE: 0.1190
R² Score: 0.8821

=== XGBoost 결과 ===
MSE: 0.0028
R² Score: 0.9973

=== 모델 성능 비교 ===
              Model       MSE  R² Score
0      RandomForest  0.000370  0.999634
1  GradientBoosting  0.119050  0.882085
2           XGBoost  0.002773  0.997253

데이터 컬럼 수: 16
정의된 컬럼명 수: 16

=== RandomForest 특징 중요도 ===
               feature  importance
1             Genmodel    0.313634
12         Entry_price    0.171641
14  First_release_year    0.136155
13                Year    0.099356
6            Wheelbase    0.091945
8                Width    0.060145
0                Maker    0.057269
9               Length    0.028472
7               Height    0.024672
3             Bodytype    0.008187
15         Engine_size    0.005964
10            Seat_num    0.001499
11            Door_num    0.001006
5            Fuel_type    0.000042
4              Gearbox    0.000008
2                Color    0.000007


In [8]:
'''
Feature-importance analysis of the selected model
: rank the features by the importance the fitted model assigns to them

-> i.e. which features matter for predicting the label
'''
# Importance scores of the model stored in best_model[1]
importances = best_model[1].feature_importances_

# LI_indices: column indices sorted from lowest to highest importance
LI_indices = np.argsort(importances)
# MI_indices: the same ranking reversed (highest importance first)
MI_indices = LI_indices[::-1]

# Feature names in most-important-first order
MI_feature_name = np.asarray(reordered_column_name)[MI_indices].tolist()
print(MI_feature_name)

['Genmodel', 'Entry_price', 'First_release_year', 'Year', 'Wheelbase', 'Width', 'Maker', 'Length', 'Height', 'Bodytype', 'Engine_size', 'Seat_num', 'Door_num', 'Fuel_type', 'Gearbox', 'Color']


In [9]:
# Importance-ordered feature masks: 'MI' masks the most important columns
# first, 'LI' the least important first. `target` is reused from an earlier cell.
missing_rate = 0.3

train_name = 'dvm_features_train_noOH_all_views_reordered.csv'
val_name = 'dvm_features_val_noOH_all_views_reordered.csv'
test_name = 'dvm_features_test_noOH_all_views_reordered.csv'

# MI masks are written for train/val/test first, then the LI masks.
strategy_orders = (('MI', MI_indices), ('LI', LI_indices))
for missing_strategy, mask_pos_order in strategy_orders:
    for name, split in zip([train_name, val_name, test_name], ['train', 'val', 'test']):
        path = join(FEATURES, name)
        # name[:-4] drops the '.csv' suffix before the mask tag is appended.
        mask_filename = f'{name[:-4]}_{target}_{missing_strategy}_{missing_rate}.npy'
        save_mask_path = join(MASK_PATH, mask_filename)
        create_certain_missing_mask(path, save_mask_path, mask_pos_order, missing_strategy, missing_rate)

data tabular shape: (106676, 16)
Real missing rate: 0.25
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_train_noOH_all_views_reordered_dvm_MI_0.3.npy
data tabular shape: (26669, 16)
Real missing rate: 0.25
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_val_noOH_all_views_reordered_dvm_MI_0.3.npy
data tabular shape: (33337, 16)
Real missing rate: 0.25
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_test_noOH_all_views_reordered_dvm_MI_0.3.npy
data tabular shape: (106676, 16)
Real missing rate: 0.25
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_train_noOH_all_views_reordered_dvm_LI_0.3.npy
data tabular shape: (26669, 16)
Real missing rate: 0.25
Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_val_noOH_all_views_reordered_dvm_LI_0.3.npy
data tabular shape: (33337, 16)
Real missing rate: 0.25
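Save missing mask to /data/ephemeral/home/data/base_features/missing_mask/dvm_features_test_noOH_all_views_reordered_dvm_LI_0.3.npy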

In [10]:
# Load the saved MI masks and print the first row of each so the masked
# columns can be compared across train, val and test.
train_np = np.load(join(MASK_PATH, f'{train_name[:-4]}_dvm_MI_0.3.npy'))
val_np = np.load(join(MASK_PATH, f'{val_name[:-4]}_dvm_MI_0.3.npy'))
test_np = np.load(join(MASK_PATH, f'{test_name[:-4]}_dvm_MI_0.3.npy'))
for split_mask in (train_np, val_np, test_np):
    print(split_mask[0])

[False  True False False False False False False False False False False
  True  True  True False]
[False  True False False False False False False False False False False
  True  True  True False]
[False  True False False False False False False False False False False
  True  True  True False]
