In [160]:
import os
import numpy as np
import pandas as pd
import category_encoders as ce
import torch

from tqdm import tqdm
from warnings import simplefilter
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

In [161]:
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [162]:
SEED = 999
PATH = os.getcwd()
train = pd.read_csv(f'{PATH}/data/train.csv').drop(columns=['ID'], axis=1)
test = pd.read_csv(f'{PATH}/data/test.csv').drop(columns=['ID'], axis=1)

In [163]:
rename_columns = {
    "제조사": "Manufacturer",           "모델": "Model",
    "차량상태": "VehicleCondition",     "배터리용량": "BatteryCapacity",
    "구동방식": "DriveType",            "주행거리(km)": "MileageKm",
    "보증기간(년)": "WarrantyYears",    "사고이력": "AccidentHistory",
    "연식(년)": "Year",                 "가격(백만원)": "Price",
}

train = train.rename(columns=rename_columns)
test = test.rename(columns=rename_columns)

In [164]:
# 배터리 용량 결측치 처리
upper7 = train[train['WarrantyYears']>=7]['BatteryCapacity'].mean()
lower7 = train[train['WarrantyYears']<7]['BatteryCapacity'].mean()

train.fillna(-1, inplace=True)
test.fillna(-1, inplace=True)

def fill_battery(row):
    if row['BatteryCapacity'] == -1:
        if row['WarrantyYears'] >= 7: return upper7
        else: return lower7
    return row['BatteryCapacity']

train['BatteryCapacity'] = train.apply(fill_battery, axis=1)
test['BatteryCapacity'] = test.apply(fill_battery, axis=1)

In [165]:
# bins = [0, 60, 80, 100]
# labels = ["s", "m", "l"]
# train["BatteryCapacity_cut"] = pd.cut(train["BatteryCapacity"], bins=bins, labels=labels, right=False, include_lowest=True)
# test["BatteryCapacity_cut"] = pd.cut(test["BatteryCapacity"], bins=bins, labels=labels, right=False, include_lowest=True)

In [166]:
train['Year'] = 2024 - train['Year']
test['Year'] = 2024 -  test['Year']

train['WarrantyYears'] = 2024 - train['WarrantyYears']
test['WarrantyYears'] = 2024 -  test['WarrantyYears']

In [167]:
without_columns = ['ID', 'Price']
categorical_columns = [col for col in train.columns if (train[col].dtype in ['object', 'category']) and (col not in without_columns)]
numerical_columns  = [col for col in train.columns if col not in categorical_columns and (col not in without_columns)]

for i in range(len(numerical_columns)):
    for j in range(i, len(numerical_columns)):
        train[f'{numerical_columns[i]}*{numerical_columns[j]}'] = train[numerical_columns[i]] * train[numerical_columns[j]]
        test[f'{numerical_columns[i]}*{numerical_columns[j]}'] = test[numerical_columns[i]] * test[numerical_columns[j]]
        
train['BatteryCapacity/MileageKm'] = train['BatteryCapacity'] / train['MileageKm']
test['BatteryCapacity/MileageKm'] = test['BatteryCapacity'] / test['MileageKm']

numerical_columns  = [col for col in train.columns if col not in categorical_columns and (col not in without_columns)]

In [168]:
class SimpleGRU(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(SimpleGRU, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.GRU = torch.nn.GRU(input_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.fc1 = torch.nn.Linear(hidden_size, hidden_size//2)
        self.fc2 = torch.nn.Linear(hidden_size//2, hidden_size//4)
        self.fc3 = torch.nn.Linear(hidden_size//4, 1)
        
        self.dropout = torch.nn.Dropout(p=dropout)
        self.relu = torch.nn.ReLU()
    
    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        
        x, _ = self.GRU(x)
        x = self.fc1(x[:, -1, :])
        x = self.dropout(self.relu(x))
        x = self.fc2(x)
        x = self.dropout(self.relu(x))
        x = self.fc3(x)
        
        return x

In [169]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, X, y=None):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        if isinstance(self.X, pd.DataFrame):
            X_tensor = torch.tensor(self.X.iloc[idx], dtype=torch.float32)        
        else:   
            X_tensor = torch.tensor(self.X[idx], dtype=torch.float32)        
        X_tensor = X_tensor.unsqueeze(0)
        
        if self.y is not None: 
            if isinstance(self.y, pd.DataFrame):
                y_tensor = torch.tensor(self.y.iloc[idx], dtype=torch.float32)
            else:
                y_tensor = torch.tensor(self.y[idx], dtype=torch.float32)
            return X_tensor, y_tensor
        return X_tensor

In [170]:
class WeightedMSELoss(torch.nn.Module):
    def __init__(self, scale=1.0):
        super(WeightedMSELoss, self).__init__()
        self.scale = scale

    def forward(self, preds, labels):
        residuals = labels - preds
        weights = torch.where(residuals > 0, self.scale, 1.0)
        loss = torch.mean(weights * residuals ** 2)
        return loss

In [171]:
def target_encode(X_train, X_valid, X_test, encode_col, target_col, smooth=0.0, agg="mean"):
    encoded_col = f'TE_{agg.upper()}_' + '_'.join(encode_col)
    
    df_tmp = X_train[encode_col + [target_col]].groupby(encode_col).agg([agg, 'count']).reset_index()
    if agg=="mean": mn = X_train[target_col].mean()
    elif agg=="median": mn = X_train[target_col].median()
    elif agg=="std": mn = X_train[target_col].std()
    elif agg=="min": mn = X_train[target_col].min()
    elif agg=="max": mn = X_train[target_col].max()
    
    df_tmp.columns = encode_col + [agg, 'count']
    df_tmp['TE_tmp'] = ((df_tmp[agg] * df_tmp['count']) + (mn * smooth)) / (df_tmp['count'] + smooth)
    
    X_train = X_train.merge(df_tmp[encode_col + ['TE_tmp']], how='left', left_on=encode_col, right_on=encode_col)
    X_train[encoded_col] = X_train['TE_tmp'].fillna(mn)
    X_train = X_train.drop(columns=['TE_tmp'])
    # X_train[encoded_col] = X_train[encoded_col].astype("float32")
    
    df_tmp_m = X_valid[encode_col].merge(df_tmp, how='left', left_on=encode_col, right_on=encode_col)
    X_valid[encoded_col] = df_tmp_m['TE_tmp'].fillna(mn).values
    # X_valid[encoded_col] = X_valid[encoded_col].astype("float32")

    df_tmp_m = X_test[encode_col].merge(df_tmp, how='left', left_on=encode_col, right_on=encode_col)
    X_test[encoded_col] = df_tmp_m['TE_tmp'].fillna(mn).values
    # X_test[encoded_col] = X_test[encoded_col].astype("float32")
    
    return X_train, X_valid, X_test

In [172]:
X, y = train, train[['Price']]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=SEED)
X_test = test

X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)
X_valid, y_valid = X_valid.reset_index(drop=True), y_valid.reset_index(drop=True)

## Target Encoder
encoder_columns = [
    'Manufacturer', 'Model', 
    # ['Manufacturer', 'BatteryCapacity_cut'], 
    # ['Manufacturer', 'AccidentHistory'], 
    # ['Model', 'BatteryCapacity_cut'], 
    # ['Model', 'AccidentHistory'], 
]
for column in encoder_columns:
    if not isinstance(column, list): column = [column]
    X_train, X_valid, X_test = target_encode(X_train, X_valid, X_test, encode_col=column, target_col='Price', smooth=0.0, agg="mean")

X_train = X_train.drop(columns=['Price'] + ['Manufacturer', 'Model'], axis=1) # 
X_valid = X_valid.drop(columns=['Price'] + ['Manufacturer', 'Model'], axis=1) # 
X_test = X_test.drop(columns=[] + ['Manufacturer', 'Model'], axis=1) # 

encoder_columns = ['VehicleCondition', 'DriveType', 'AccidentHistory'] # , 'BatteryCapacity_cut'
encoder = ce.PolynomialEncoder(cols=encoder_columns)
X_train = encoder.fit_transform(X_train)
X_valid = encoder.transform(X_valid)
X_test = encoder.transform(X_test)

scaler_columns = numerical_columns
X_scaler = StandardScaler()
X_train[scaler_columns] = X_scaler.fit_transform(X_train[scaler_columns])
X_valid[scaler_columns] = X_scaler.transform(X_valid[scaler_columns])
X_test[scaler_columns] = X_scaler.transform(X_test[scaler_columns])

y_scaler = StandardScaler()
y_train[['Price']] = y_scaler.fit_transform(y_train[['Price']])
y_valid[['Price']] = y_scaler.transform(y_valid[['Price']])

In [173]:
num_epochs = 1000
learning_rate = 0.001
batch_size = 128
patience = 50

input_size = X_train.shape[1]
hidden_size = 1024
num_layers = 2
dropout = 0.1

In [174]:
X_train.shape

(6747, 22)

In [175]:
train_dataset = CustomDataset(X_train, y_train)
valid_dataset = CustomDataset(X_valid, y_valid)
test_dataset = CustomDataset(X_test)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

In [176]:
model = SimpleGRU(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout
)

criterion = WeightedMSELoss(1.2) # torch.nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, threshold=1e-12, min_lr=1e-12)

In [177]:
best_valid_loss = float('inf') 
early_stop_counter = 0 

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0    
    train_pbar = tqdm(train_dataloader, unit='batch', desc='Train')
    for _, (X_batch, y_batch) in enumerate(train_pbar):
        optimizer.zero_grad()
        outputs = model(X_batch)         
        outputs = outputs
        
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_dataloader)

    model.eval()
    valid_loss = 0.0
    valid_pbar = tqdm(valid_dataloader, unit='batch', desc='Valid')
    with torch.no_grad():
        for _, (X_batch, y_batch) in enumerate(valid_pbar):
            val_outputs = model(X_batch)
            val_outputs = val_outputs
            
            val_loss = criterion(val_outputs, y_batch)
            valid_loss += val_loss.item()

    valid_loss /= len(valid_dataloader)

    print(f"[Epoch {epoch+1}/{num_epochs}] Train Loss: {train_loss:.8f} | Valid Loss: {valid_loss:.8f}", end=" | ")
    
    if scheduler:
        scheduler.step(valid_loss)
        lr = scheduler.optimizer.param_groups[0]['lr']
        print(f"Learnin Rate : {lr}", end=" | ")
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        early_stop_counter = 0 
        best_model_state = model.state_dict() 
        torch.save(best_model_state, f'{PATH}/result/lstm/weights/best.pt')
        print(f"Validation loss improved")
    else:
        early_stop_counter += 1
        print(f"No improvement in validation loss. {early_stop_counter}/{patience}")
    
    if early_stop_counter >= patience:
        print(f"Early stopping triggered. Best valdation loss is {best_valid_loss}")
        break
    
    if (epoch + 1) % 10 == 0:
        clear_output(wait=True)

Train: 100%|██████████| 53/53 [00:10<00:00,  5.26batch/s]
Valid: 100%|██████████| 6/6 [00:00<00:00, 11.23batch/s]


[Epoch 181/1000] Train Loss: 0.00349684 | Valid Loss: 0.00137491 | Learnin Rate : 1.953125e-06 | No improvement in validation loss. 46/50


Train: 100%|██████████| 53/53 [00:09<00:00,  5.81batch/s]
Valid: 100%|██████████| 6/6 [00:00<00:00, 13.61batch/s]


[Epoch 182/1000] Train Loss: 0.00354254 | Valid Loss: 0.00136294 | Learnin Rate : 1.953125e-06 | No improvement in validation loss. 47/50


Train: 100%|██████████| 53/53 [00:09<00:00,  5.85batch/s]
Valid: 100%|██████████| 6/6 [00:00<00:00, 11.56batch/s]


[Epoch 183/1000] Train Loss: 0.00341971 | Valid Loss: 0.00140964 | Learnin Rate : 1.953125e-06 | No improvement in validation loss. 48/50


Train: 100%|██████████| 53/53 [00:09<00:00,  5.86batch/s]
Valid: 100%|██████████| 6/6 [00:00<00:00,  9.66batch/s]


[Epoch 184/1000] Train Loss: 0.00354384 | Valid Loss: 0.00140778 | Learnin Rate : 1.953125e-06 | No improvement in validation loss. 49/50


Train: 100%|██████████| 53/53 [00:08<00:00,  6.05batch/s]
Valid: 100%|██████████| 6/6 [00:00<00:00, 12.18batch/s]

[Epoch 185/1000] Train Loss: 0.00359621 | Valid Loss: 0.00139146 | Learnin Rate : 1.953125e-06 | No improvement in validation loss. 50/50
Early stopping triggered. Best valdation loss is 0.0013573304361974199





In [181]:
model.eval()
predictions = []
with torch.no_grad():
    for X_batch in test_dataloader:
        pred = model(X_batch)
        pred = pred
        predictions.append(pred.cpu().numpy())

predictions = np.concatenate(predictions, axis=0)

  X_tensor = torch.tensor(self.X.iloc[idx], dtype=torch.float32)


In [183]:
submit = pd.read_csv(f'{PATH}/data/sample_submission.csv')
submit['가격(백만원)'] = list(map(float, y_scaler.inverse_transform(predictions))) # y_scaler.inverse_transform(predictions)
submit.head()

  submit['가격(백만원)'] = list(map(float, y_scaler.inverse_transform(predictions))) # y_scaler.inverse_transform(predictions)


Unnamed: 0,ID,가격(백만원)
0,TEST_000,130.929901
1,TEST_001,80.123482
2,TEST_002,64.982758
3,TEST_003,35.278687
4,TEST_004,48.143429


In [184]:
submit.to_csv(f'{PATH}/result/lstm/LSTM_CV-{best_valid_loss:.12f}_LB-.csv', index=False)