# Code Templates: LSTM Prediction Model
2024-2 SWCON314

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, ConcatDataset

import glob
from tqdm import tqdm

from LSTMModel import LSTMModel

In [None]:
# --- Hyperparameters --------------------------------------------------------
TIME_STEP = 5             # rows per input window
STRIDE = 1                # default step between the start rows of two windows
INPUT_SIZE = 24           # forwarded to LSTMModel as input_size
OUTPUT_SIZE = 1           # forwarded to LSTMModel as output_size
BATCH_SIZE = 161
HIDDEN_LAYER_SIZE = 100
EPOCHS = 50
# NOTE: the name is misspelled ("LEARNNG") but must stay as-is, because the
# optimizer setup later in this file refers to it by this exact name.
LEARNNG_RATE = 0.0005

# --- Paths ------------------------------------------------------------------
# File the best model's state_dict is written to
OUTPUT_FILENAME = 'model/free_1.pth'

# Root directory searched for dataset CSV files
DATA_PATH = 'data'

# --- Column selection -------------------------------------------------------
# Positional column indices read from each CSV: model inputs, then the target
x_features = list(range(24))
y_features = [48]

# Collect every CSV below DATA_PATH (at any depth) and list what was found
csv_file_paths = glob.glob(DATA_PATH + '/**/*.csv', recursive=True)
for path in csv_file_paths:
    print(path)

# Remember whether a CUDA device is usable; later cells branch on this flag
cuda_available = torch.cuda.is_available()
print(cuda_available)

In [None]:
def preprocess_file(file_path,  time_steps):
    """Read one CSV file and convert it into windowed inputs and targets.

    The columns listed in the module-level ``x_features`` become the input
    features and the columns listed in ``y_features`` become the target; both
    are selected by positional column index.

    Args:
        file_path: Path of the CSV file to read with ``pd.read_csv``.
        time_steps: Number of consecutive rows per input window.

    Returns:
        The ``(X, y)`` pair produced by ``create_dataset`` for this file.
    """
    table = pd.read_csv(file_path).values
    inputs = table[:, x_features]
    targets = table[:, y_features]
    return create_dataset(inputs, targets, time_steps)

def create_dataset(X, y, time_steps, stride=STRIDE):
    """Slice aligned feature/target rows into sliding-window training samples.

    Sample ``k`` starts at row ``i = k * stride``; its input is
    ``X[i:i + time_steps]`` and its target is ``y[i + time_steps]``, i.e. the
    target row immediately following the window.

    Args:
        X: Feature rows, indexed along the first axis.
        y: Target rows; must have at least as many rows as ``X``.
        time_steps: Number of consecutive rows per input window.
        stride: Step between the start rows of consecutive windows.

    Returns:
        Tuple ``(Xs, ys)`` of NumPy arrays with shapes
        ``(n_windows, time_steps, *X.shape[1:])`` and
        ``(n_windows, *y.shape[1:])``. When the input is too short to yield
        a single window, both arrays are empty but keep those trailing
        dimensions so they can still be concatenated with results from
        other files.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # A window needs time_steps rows plus one following row for its target.
    starts = range(0, len(X) - time_steps, stride)

    if len(starts) == 0:
        # np.array([]) would be 1-D with shape (0,), which np.concatenate
        # rejects next to the 3-D / 2-D arrays coming from longer files.
        empty_X = np.empty((0, time_steps) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_X, empty_y

    Xs = [X[i:(i + time_steps)] for i in starts]   # input feature sequences
    ys = [y[i + time_steps] for i in starts]       # row right after each window
    return np.array(Xs), np.array(ys)

def load_and_process_data(file_paths, time_steps, batch_size=16):
    """Build train/validation DataLoaders from a group of CSV files.

    Each file is windowed on its own via ``preprocess_file``, so no window
    spans two files. The windows of all files are then pooled and split
    90% / 10% with a fixed random seed.

    Args:
        file_paths: Iterable of CSV paths to load.
        time_steps: Number of consecutive rows per input window.
        batch_size: Batch size used by both returned loaders.

    Returns:
        Tuple ``(train_loader, val_loader)``. The training loader shuffles
        its samples; the validation loader does not.
    """
    # Process every file independently
    per_file = [preprocess_file(csv_path, time_steps) for csv_path in file_paths]

    # Merge the samples of all files into one feature array and one target array
    combined_X = np.concatenate([features for features, _ in per_file], axis=0)
    combined_y = np.concatenate([targets for _, targets in per_file], axis=0)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        combined_X, combined_y, test_size=0.1, random_state=42)

    def _as_loader(features, targets, shuffle):
        # Wrap a feature/target pair as float32 tensors inside a DataLoader
        dataset = TensorDataset(
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(targets, dtype=torch.float32))
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    # Create the DataLoaders
    train_loader = _as_loader(X_train, y_train, True)
    val_loader = _as_loader(X_val, y_val, False)
    return train_loader, val_loader

def create_final_loaders(csv_file_paths, time_steps, batch_size, files_per_subset=100):
    """Build the final train/validation DataLoaders over all CSV files.

    The file list is processed in consecutive groups of ``files_per_subset``
    files. Each group is loaded and split into train/validation parts by
    ``load_and_process_data`` on its own, and the per-group datasets are then
    concatenated into one training and one validation dataset.

    Args:
        csv_file_paths: Sequence of CSV file paths to load.
        time_steps: Number of consecutive rows per input window.
        batch_size: Batch size used by both returned loaders.
        files_per_subset: How many files are loaded and split together in
            one group. Defaults to 100.

    Returns:
        Tuple ``(final_train_loader, final_valid_loader)``. The training
        loader shuffles its samples; the validation loader does not.

    Raises:
        ValueError: If ``csv_file_paths`` is empty or ``files_per_subset``
            is smaller than 1.
    """
    if files_per_subset < 1:
        raise ValueError(
            f"files_per_subset must be at least 1, got {files_per_subset}")
    if len(csv_file_paths) == 0:
        # Without this check the failure only shows up later as a bare
        # assertion inside ConcatDataset, which hides the real cause.
        raise ValueError(
            "csv_file_paths is empty: no CSV files to build DataLoaders from")

    # Split csv_file_paths into consecutive groups
    subsets = [
        csv_file_paths[start:start + files_per_subset]
        for start in range(0, len(csv_file_paths), files_per_subset)
    ]

    # Datasets collected from each group's loaders
    train_datasets = []
    valid_datasets = []

    # Build the loaders of every group and keep only their datasets
    for subset in subsets:
        train_loader, valid_loader = load_and_process_data(
            subset,
            time_steps=time_steps,
            batch_size=batch_size)
        train_datasets.append(train_loader.dataset)
        valid_datasets.append(valid_loader.dataset)

    # Merge the per-group datasets into the final DataLoaders
    final_train_loader = DataLoader(
        ConcatDataset(train_datasets),
        batch_size=batch_size,
        shuffle=True)
    final_valid_loader = DataLoader(
        ConcatDataset(valid_datasets),
        batch_size=batch_size,
        shuffle=False)

    return final_train_loader, final_valid_loader

# Build the loaders used by the training loop from every discovered CSV file
trainloader, validloader = create_final_loaders(
    csv_file_paths,
    TIME_STEP,
    BATCH_SIZE,
)

In [None]:
# Train the LSTM on `trainloader`, validate on `validloader` every 5 epochs
# (and on the last epoch), and save the weights whenever validation RMSE improves.
from pathlib import Path

model = LSTMModel(input_size=INPUT_SIZE, hidden_layer_size=HIDDEN_LAYER_SIZE, output_size=OUTPUT_SIZE)

## if you have GPU
if cuda_available:
    model.cuda()

# Create the checkpoint directory up front so torch.save cannot fail on a
# missing folder after training time has already been spent.
Path(OUTPUT_FILENAME).parent.mkdir(parents=True, exist_ok=True)

epochs = EPOCHS
best_val_rmse = float('inf')
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNNG_RATE)
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in tqdm(trainloader, desc=f"Epoch {epoch+1}/{epochs} Training"):

        ## if you have GPU
        if cuda_available:
            X_batch, y_batch = X_batch.cuda(), y_batch.cuda()

        ####### LSTM ########
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Report the mean of the per-batch MSE losses for this epoch
    print(f"Epoch {epoch+1}, Training loss: {train_loss / max(len(trainloader), 1)}")

    # run validation every 5 epochs
    if epoch % 5 == 0 or epoch == epochs-1:

        model.eval()

        val_rmse = []          # per-batch RMSE values
        pred_chunks = []       # per-batch predictions, stacked once after the loop
        actual_chunks = []     # per-batch targets, stacked once after the loop
        squared_error_sum = 0.0
        value_count = 0
        with torch.no_grad():
            for X_batch, y_batch in tqdm(validloader, desc=f"Epoch {epoch+1}/{epochs} Validation"):

                ## if you have GPU
                if cuda_available:
                    X_batch, y_batch = X_batch.cuda(), y_batch.cuda()

                # inference the model
                y_pred = model(X_batch)

                # Accumulate the squared error so the epoch RMSE is computed
                # over all validation values. Averaging per-batch RMSEs would
                # depend on the batch size and over-weight a short last batch.
                squared_error = (y_pred - y_batch) ** 2
                squared_error_sum += squared_error.sum().item()
                value_count += squared_error.numel()
                val_rmse.append(torch.sqrt(squared_error.mean()).cpu().numpy())

                pred_chunks.append(y_pred.cpu().numpy())
                actual_chunks.append(y_batch.cpu().numpy())

        # Stack once instead of re-copying the growing array on every batch
        y_preds = np.vstack(pred_chunks) if pred_chunks else []
        y_actuals = np.vstack(actual_chunks) if actual_chunks else []

        # NaN when there was nothing to validate; NaN never counts as a new best
        epoch_val_rmse = np.sqrt(squared_error_sum / value_count) if value_count else float('nan')
        print(f"Epoch {epoch+1}, Validation RMSE: {epoch_val_rmse}")

        if epoch_val_rmse < best_val_rmse:
            best_val_rmse = epoch_val_rmse
            print(f"New best model with RMSE: {best_val_rmse}, saving model...")
            torch.save(model.state_dict(), OUTPUT_FILENAME)

In [None]:
import matplotlib.pyplot as plt

# Run the trained model over one CSV file window by window, report the
# average per-window error, and plot predictions against the actual targets.
model.eval()
data = pd.read_csv(r'data/free/free1/4_rear.csv') # choice was arbitrary selection from the dataset

X = data.values[:,x_features].astype(np.float32)
y = data.values[:,y_features].astype(np.float32)
y_pred = []

# Each window needs TIME_STEP rows plus one following row as its target
n_windows = max(X.shape[0] - TIME_STEP, 0)

error_sum = 0
with torch.no_grad():
    for i in tqdm(range(n_windows)):
        # Add a leading batch dimension of 1 to the window
        t = torch.tensor(X[i:i + TIME_STEP]).unsqueeze(0)

        ## if you have GPU
        if cuda_available:
            t = t.cuda()

        pred = model(t).squeeze().cpu().numpy()
        y_pred.append(pred)

        # The window X[i:i+TIME_STEP] predicts the row right after it, which
        # is the same alignment used for training and for the plot below.
        target = y[i + TIME_STEP]
        rmse = np.sqrt(np.sum((target - pred) ** 2) / target.size)
        error_sum += rmse

# Avoid dividing by zero when the file is too short to form a single window
error_sum = error_sum / n_windows if n_windows else float('nan')
print(f'RMSE avg: {error_sum}')

# The first TIME_STEP rows have no prediction, so skip them in the actual series
plt.plot(y[TIME_STEP:], label='actual')
y_pred_arr = np.array(y_pred)
plt.plot(y_pred_arr, label='predicted')
plt.legend(loc='best')
plt.show(block=True)