In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import clear_output
import torch

from warnings import simplefilter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
pd.set_option('display.max_columns', 500)
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [3]:
SEED = 909
PATH = os.getcwd()
train = pd.read_csv(f'{PATH}/data/train.csv')
test = pd.read_csv(f'{PATH}/data/test.csv')

In [4]:
rename_columns = {
    "제조사": "Manufacturer",           "모델": "Model",
    "차량상태": "VehicleCondition",     "배터리용량": "BatteryCapacity",
    "구동방식": "DriveType",            "주행거리(km)": "MileageKm",
    "보증기간(년)": "WarrantyYears",    "사고이력": "AccidentHistory",
    "연식(년)": "Year",                 "가격(백만원)": "Price",
}

train = train.rename(columns=rename_columns)
test = test.rename(columns=rename_columns)

In [5]:
train['BatteryCapacity'] = train['BatteryCapacity'].fillna(train.groupby(['Manufacturer', 'Model'])['BatteryCapacity'].transform('mean'))
test['BatteryCapacity'] = test['BatteryCapacity'].fillna(test.groupby(['Manufacturer', 'Model'])['BatteryCapacity'].transform('mean'))

In [316]:
without_columns = ['ID', 'Price'] # , 'AccidentHistory', 'DriveType'
categorical_columns = [col for col in train.columns if (train[col].dtype in ['object', 'category']) and (col not in without_columns)]
numerical_columns  = [col for col in train.columns if col not in categorical_columns and (col not in without_columns)]

In [317]:
# LABEL ENCODE -> categorical features
for feat in categorical_columns:
    le = LabelEncoder()
    train[feat] = le.fit_transform(train[feat])
    test[feat] = le.transform(test[feat])

In [318]:
from itertools import combinations
target_encode_columns = []
target_encode_columns = categorical_columns.copy() # + numerical_columns.copy()
target_encode_columns += list(map(list, combinations(categorical_columns, 2)))
target_encode_columns += list(map(list, combinations(categorical_columns, 3)))
target_encode_columns += list(map(list, combinations(categorical_columns, 4)))

In [319]:
# https://github.com/rapidsai/deeplearning/blob/main/RecSys2020Tutorial/03_3_TargetEncoding.ipynb
def target_encode(train, valid, test, encode_col, target_col, smooth=0.0, agg="mean"):
    encoded_col = f'TE_{agg.upper()}_' + '_'.join(encode_col)
    
    df_tmp = train[encode_col + [target_col]].groupby(encode_col).agg([agg, 'count']).reset_index()
    if agg=="mean": mn = train[target_col].mean()
    elif agg=="median": mn = train[target_col].median()
    elif agg=="std": mn = train[target_col].std()
    elif agg=="min": mn = train[target_col].min()
    elif agg=="max": mn = train[target_col].max()
    
    df_tmp.columns = encode_col + [agg, 'count']
    df_tmp['TE_tmp'] = ((df_tmp[agg] * df_tmp['count']) + (mn * smooth)) / (df_tmp['count'] + smooth)
    
    train = train.merge(df_tmp[encode_col + ['TE_tmp']], how='left', left_on=encode_col, right_on=encode_col)
    train[encoded_col] = train['TE_tmp'].fillna(mn)
    train = train.drop(columns=['TE_tmp'])
    
    df_tmp_m = valid[encode_col].merge(df_tmp, how='left', left_on=encode_col, right_on=encode_col)
    valid[encoded_col] = df_tmp_m['TE_tmp'].fillna(mn).values

    df_tmp_m = test[encode_col].merge(df_tmp, how='left', left_on=encode_col, right_on=encode_col)
    test[encoded_col] = df_tmp_m['TE_tmp'].fillna(mn).values
    
    return train, valid, test

In [320]:
class SimpleGRU(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(SimpleGRU, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.GRU = torch.nn.GRU(input_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.fc1 = torch.nn.Linear(hidden_size, hidden_size//2)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(dropout)
        self.fc2 = torch.nn.Linear(hidden_size//2, 1)
        
    
    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        
        x, _ = self.GRU(x, h0)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x


In [321]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, X: pd.DataFrame, y: pd.DataFrame=None):
        self.X = X.reset_index(drop=True)
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        X_values = self.X.iloc[idx].values

        X_tensor = torch.tensor(X_values, dtype=torch.float32)
        if self.y is not None: y_tensor = torch.tensor(self.y[idx], dtype=torch.float32)

        # LSTM 입력 형식 (batch_size, seq_len, input_size)
        X_tensor = X_tensor.unsqueeze(0)
        
        if self.y is not None:
            return X_tensor, y_tensor   
        else:
            return X_tensor

In [322]:
X = train.drop('ID', axis=1)
X_test = test.drop('ID', axis=1)

In [323]:
X_train, X_valid = train_test_split(X, test_size=0.2, random_state=SEED)

In [324]:
for _, feat in enumerate(target_encode_columns):                    
    if isinstance(feat, list): c = feat
    else: c = [feat]    
    # TARGET ENCODE 
    X_train, X_valid, X_test = target_encode(X_train, X_valid, X_test, encode_col=c, target_col="Price", smooth=0.0, agg="mean")
    X_train, X_valid, X_test = target_encode(X_train, X_valid, X_test, encode_col=c, target_col="Price", smooth=0.0, agg="median")
    X_train, X_valid, X_test = target_encode(X_train, X_valid, X_test, encode_col=c, target_col="Price", smooth=0.0, agg="std")
    
X_train, y_train = X_train.drop('Price', axis=1), X_train[['Price']]
X_valid, y_valid = X_valid.drop('Price', axis=1), X_valid[['Price']]
            
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_valid_scaled = X_scaler.transform(X_valid)
X_test_scaled = X_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_valid_scaled = y_scaler.transform(y_valid)

X_train_reconstruct = pd.DataFrame(data=X_train_scaled, columns=X_train.columns, index=X_train.index)
X_valid_reconstruct = pd.DataFrame(data=X_valid_scaled, columns=X_valid.columns, index=X_valid.index)
test_reconstruct = pd.DataFrame(data=X_test_scaled, columns=X_test.columns, index=X_test.index)

In [325]:
train_dataset = CustomDataset(X_train_reconstruct, y_train_scaled)
valid_dataset = CustomDataset(X_valid_reconstruct, y_valid_scaled)
test_dataset = CustomDataset(test_reconstruct)

In [326]:
num_epochs = 1000
learning_rate = 0.01
batch_size = 1024
patience = 30

input_size = X_train_reconstruct.shape[1]
hidden_size = 1024
num_layers = 1
dropout = 0.0

In [327]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

In [328]:
model = SimpleGRU(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout
)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, threshold=1e-10, min_lr=1e-10)

In [329]:
best_valid_loss = float('inf') 
early_stop_counter = 0 

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0    
    train_pbar = tqdm(train_dataloader, unit='batch', desc='Train')
    for _, (X_batch, y_batch) in enumerate(train_pbar):
        optimizer.zero_grad()
        outputs = model(X_batch) 
        
        outputs = outputs[:, -1, :] 
        
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_dataloader)

    model.eval()
    valid_loss = 0.0
    valid_pbar = tqdm(valid_dataloader, unit='batch', desc='Valid')
    with torch.no_grad():
        for _, (X_val, y_val) in enumerate(valid_pbar):
            val_outputs = model(X_val)
            val_outputs = val_outputs[:, -1, :] 
            
            val_loss = criterion(val_outputs, y_val)
            valid_loss += val_loss.item()

    valid_loss /= len(valid_dataloader)

    print(f"[Epoch {epoch+1}/{num_epochs}] Train Loss: {train_loss:.8f} | Valid Loss: {valid_loss:.8f}", end=" | ")
    
    if scheduler:
        scheduler.step(valid_loss)
        lr = scheduler.optimizer.param_groups[0]['lr']
        print(f"Learnin Rate : {lr}", end=" | ")
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        early_stop_counter = 0 
        best_model_state = model.state_dict() 
        print(f"Validation loss improved")
    else:
        early_stop_counter += 1
        print(f"No improvement in validation loss. {early_stop_counter}/{patience}")
    
    if (epoch + 1) % 10 == 0:
        clear_output(wait=True)

    if early_stop_counter >= patience:
        print(f"Early stopping triggered. Best valdation loss is {best_valid_loss}")
        break

Train: 100%|██████████| 6/6 [00:01<00:00,  3.61batch/s]
Valid: 100%|██████████| 2/2 [00:00<00:00,  7.96batch/s]


[Epoch 931/1000] Train Loss: 0.00185527 | Valid Loss: 0.00230521 | Learnin Rate : 6.103515625e-07 | No improvement in validation loss. 25/30


Train: 100%|██████████| 6/6 [00:02<00:00,  2.28batch/s]
Valid: 100%|██████████| 2/2 [00:00<00:00,  7.22batch/s]


[Epoch 932/1000] Train Loss: 0.00186034 | Valid Loss: 0.00230518 | Learnin Rate : 6.103515625e-07 | No improvement in validation loss. 26/30


Train: 100%|██████████| 6/6 [00:01<00:00,  3.71batch/s]
Valid: 100%|██████████| 2/2 [00:00<00:00,  9.80batch/s]


[Epoch 933/1000] Train Loss: 0.00186279 | Valid Loss: 0.00230521 | Learnin Rate : 6.103515625e-07 | No improvement in validation loss. 27/30


Train: 100%|██████████| 6/6 [00:01<00:00,  4.24batch/s]
Valid: 100%|██████████| 2/2 [00:00<00:00,  9.00batch/s]


[Epoch 934/1000] Train Loss: 0.00186776 | Valid Loss: 0.00230519 | Learnin Rate : 6.103515625e-07 | No improvement in validation loss. 28/30


Train: 100%|██████████| 6/6 [00:01<00:00,  4.19batch/s]
Valid: 100%|██████████| 2/2 [00:00<00:00,  9.13batch/s]


[Epoch 935/1000] Train Loss: 0.00186736 | Valid Loss: 0.00230524 | Learnin Rate : 6.103515625e-07 | No improvement in validation loss. 29/30


Train: 100%|██████████| 6/6 [00:01<00:00,  4.28batch/s]
Valid: 100%|██████████| 2/2 [00:00<00:00,  7.75batch/s]

[Epoch 936/1000] Train Loss: 0.00186065 | Valid Loss: 0.00230518 | Learnin Rate : 6.103515625e-07 | No improvement in validation loss. 30/30
Early stopping triggered. Best valdation loss is 0.002305082860402763





In [330]:
model.eval()
predictions = []
with torch.no_grad():
    for X_batch in test_dataloader:
        pred = model(X_batch)
        pred = pred[:, -1, :]  # (batch_size, 1)
        predictions.append(pred.cpu().numpy())

predictions = np.concatenate(predictions, axis=0)  # (전체 검증 샘플 수, 1)

# 스케일러 역변환
predictions_inversed = y_scaler.inverse_transform(predictions)  # 원본 값으로 복원

In [331]:
submit = pd.read_csv(f'{PATH}/data/sample_submission.csv')
submit['가격(백만원)'] = list(map(float, predictions_inversed.flatten()))
submit.head()

Unnamed: 0,ID,가격(백만원)
0,TEST_000,130.792572
1,TEST_001,79.693657
2,TEST_002,55.093788
3,TEST_003,34.899097
4,TEST_004,45.692009


In [332]:
submit.to_csv(f'{PATH}/result/lstm/LSTM_CV-{best_valid_loss:.6f}_LB-.csv', index=False)