# Environment Settings

In [1]:
# Folder that holds the dataset files; "/train.mat" and "/valid.mat" are
# appended to it when the data loaders are created.
dataset_path = "data/std_dataset"

# Checkpoint file used both to reload a previously saved state_dict and to
# save the model during training.
model_saved_name = "model_colab.ckpt"


## Colab

In [2]:
# Project folder on the mounted Google Drive (example path); it becomes the
# working directory when the notebook runs on Colab.
colab_dir = '/content/drive/MyDrive/DeepLearning/MagNet_lstm_3C90_cycle_10_6'

# Platform selector. 'auto' requests automatic detection; the explicit
# alternatives are 'colab', 'windows_local', 'linux_local' and 'unknown'.
platform = 'auto'

### Path config

In [3]:
import os

# Detect where the notebook is running. `google.colab` can only be imported
# inside Colab, so an ImportError means a local machine; the local OS is then
# guessed from the presence of well-known root paths.
try:
    from google.colab import drive
except ImportError:
    drive = None
    if os.path.exists('c:/'):  # Windows system drive present
        detected_platform = 'windows_local'
    elif os.path.exists('/home/'):  # typical Linux home root present
        detected_platform = 'linux_local'
    else:
        detected_platform = 'unknown'
else:
    detected_platform = 'colab'

# Only auto-detect when requested. Previously `platform` was overwritten
# unconditionally, so a manual choice made in the config cell was ignored.
if platform == 'auto':
    platform = detected_platform

if platform == 'colab':
    # The project lives on Google Drive: mount it (when the Colab API is
    # available) and run from the project folder so relative paths resolve.
    if drive is not None:
        drive.mount("/content/drive")
    os.chdir(colab_dir)

print('\ncurrent execution path: ', os.getcwd())  # current working directory
print('\ncurrent platform: ', platform)  # platform in effect for this run

Mounted at /content/drive

current execution path:  /content/drive/MyDrive/DeepLearning/MagNet_lstm_3C90_cycle_10_6

current platform:  colab


# CUDA check

In [4]:
import torch

# Report whether CUDA can be used and, if so, list every visible GPU.
# Results are kept in `cuda_ready` (bool) and `gpu_num` (device count).
cuda_ready = torch.cuda.is_available()
gpu_num = 0

if not cuda_ready:
    print('cuda unavailable!')
else:
    print('cuda good!')
    gpu_num = torch.cuda.device_count()
    if gpu_num >= 1:
        print('GPU num: ', gpu_num)  # number of GPUs
        for gpu in range(gpu_num):
            print('GPU type: ', torch.cuda.get_device_name(gpu))  # GPU model name
            total_bytes = torch.cuda.get_device_properties(gpu).total_memory
            # total device memory, reported in units of 1e9 bytes
            print('GPU memory: {:.2f} Gbyte'.format(total_bytes / 1e9))
    else:
        print('GPU unavailable')


cuda good!
GPU num:  1
GPU type:  NVIDIA A100-SXM4-40GB
GPU memory: 42.48 Gbyte


# Start coding

In [5]:
# Sanity check of the environment set up above, one value per line:
# platform, working directory, CUDA flag, absolute path of the current dir.
for env_value in (platform, os.getcwd(), cuda_ready, os.path.abspath('')):
    print(env_value)

colab
/content/drive/MyDrive/DeepLearning/MagNet_lstm_3C90_cycle_10_6
True
/content/drive/MyDrive/DeepLearning/MagNet_lstm_3C90_cycle_10_6


In [6]:
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt


import NW_LSTM
import NN_DataLoader

In [7]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
# To force CPU execution use: device = torch.device("cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device using ",device)

# Build the network provided by the NW_LSTM module and move it to the device.
model = NW_LSTM.get_global_model().to(device)

# Print the model architecture and its total parameter count.
print(model)
print("Total number of parameters: ", sum(p.numel() for p in model.parameters()))

# Resume from a saved checkpoint when one exists.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine can still be loaded on a CPU-only one.
try:
    model.load_state_dict(torch.load(model_saved_name, map_location=device))
    print("Pre-train model loaded")
except FileNotFoundError:
    print("No model found, start training from scratch")
except Exception as load_error:
    # Still best-effort (training continues), but report the real reason
    # instead of claiming that the checkpoint file is missing.
    print(f"Could not load '{model_saved_name}' ({load_error!r}), "
          "start training from scratch")

# Define the loss function and optimizer.
# Alternatives that were tried: nn.MSELoss(), NW_LSTM.RelativeLoss()
loss_fn = NW_LSTM.RelativeLoss_abs()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


Device using  cuda
LSTMSeq2One(
  (lstm): LSTM(3, 30, batch_first=True)
  (fc1): Linear(in_features=30, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=8, bias=True)
  (fc5): Linear(in_features=8, out_features=8, bias=True)
  (fc6): Linear(in_features=8, out_features=1, bias=True)
  (relu): ReLU()
  (leaky_relu): LeakyReLU(negative_slope=0.01)
  (elu): ELU(alpha=1.0)
)
Total number of parameters:  9009
No model found, start training from scratch


In [8]:
import time

# Default parameters for a local desktop run (short training).
epochs = 10
valid_batch_size = 1000
# Validation and checkpointing run only for epoch indices greater than this.
# A fixed value of 300 would disable both for the 10-epoch desktop run, so
# the threshold is part of the per-platform settings.
valid_start_epoch = 0

if platform == "colab":
    epochs = 700
    valid_batch_size = 2000
    valid_start_epoch = 300

train_dataloader = NN_DataLoader.get_dataLoader(
    os.path.normpath(dataset_path + "/train.mat"), batch_size=128)

# Validation data: only the first batch of `valid_batch_size` samples is used,
# moved to the device once and reused for every validation step.
valid_dataloader = NN_DataLoader.get_dataLoader(
    os.path.normpath(dataset_path + "/valid.mat"), batch_size=valid_batch_size)
valid_inputs, valid_targets = next(iter(valid_dataloader))
valid_inputs, valid_targets = valid_inputs.to(device), valid_targets.to(device)

# Lowest validation loss seen so far. Starting from infinity guarantees the
# first validated epoch is saved, even when its loss is 1 or larger.
minium_loss = float("inf")

# Train the model
for epoch in range(epochs):

    # Start timing this epoch; used for the remaining-time estimate below.
    t_epoch_start = time.perf_counter()

    # Train one epoch
    for train_inputs, train_targets in train_dataloader:
        # Move data to device
        train_inputs, train_targets = train_inputs.to(device), train_targets.to(device)

        # Forward pass
        train_outputs = model(train_inputs)

        # Compute loss
        loss = loss_fn(train_outputs, train_targets)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validate and keep the checkpoint with the lowest validation loss.
    if epoch > valid_start_epoch:
        with torch.no_grad():
            valid_outputs = model(valid_inputs)
            # Compute loss
            valid_loss = loss_fn(valid_outputs, valid_targets)

        if valid_loss < minium_loss:
            minium_loss = valid_loss
            torch.save(model.state_dict(), model_saved_name)
            print(f"  Model saved , Validation Loss: {valid_loss.item():.3e}, ")

    # Duration of the epoch that just finished (training plus validation).
    t_epoch = time.perf_counter() - t_epoch_start

    # Print progress every 10 epochs. The reported training loss is that of
    # the last mini-batch of the epoch, not an average over the epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {loss.item():.3e}, "
              f"Remain time: {t_epoch/60 * (epochs - epoch - 1):.1f} min")


Epoch 10/700, Training Loss: 5.569e-01, Remain time: 10.0 min
Epoch 20/700, Training Loss: 2.036e-01, Remain time: 9.4 min
Epoch 30/700, Training Loss: 3.030e-01, Remain time: 9.6 min
Epoch 40/700, Training Loss: 3.617e-01, Remain time: 9.2 min
Epoch 50/700, Training Loss: 1.267e-01, Remain time: 9.0 min
Epoch 60/700, Training Loss: 3.810e-01, Remain time: 9.1 min
Epoch 70/700, Training Loss: 1.366e-01, Remain time: 8.9 min
Epoch 80/700, Training Loss: 1.301e-01, Remain time: 8.7 min
Epoch 90/700, Training Loss: 1.963e-01, Remain time: 8.6 min
Epoch 100/700, Training Loss: 8.413e-02, Remain time: 8.5 min
Epoch 110/700, Training Loss: 6.386e-02, Remain time: 8.4 min
Epoch 120/700, Training Loss: 1.279e-01, Remain time: 8.1 min
Epoch 130/700, Training Loss: 5.609e-02, Remain time: 7.9 min
Epoch 140/700, Training Loss: 8.806e-02, Remain time: 7.9 min
Epoch 150/700, Training Loss: 9.087e-02, Remain time: 7.8 min
Epoch 160/700, Training Loss: 9.178e-02, Remain time: 7.6 min
Epoch 170/700, T

## GPU monitor
### nvidia-smi -l 3