In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import Meshpkg as mp

import pandas as pd
import numpy as np
import tensorflow as tf

import time
import datetime
import matplotlib.pyplot as plt
from tensorflow import keras
from collections import deque
from tqdm import tqdm
import pickle

"Parameter 정의"
p = mp.params

# Seed configuration: a single source of truth for the run's random seed.
# The seeding helper receives `seed` (instead of a second hard-coded 42) so
# that changing the value here actually changes the seed that is applied.
seed = 42
mp.Initialize.my_seed.my_seed_everywhere(seed)

"Episode 수" 
# Number of training episodes.
n_episodes = 100000

# Online actor/critic networks and their DDPG target counterparts.
# keras.models.clone_model() re-creates the layers with freshly initialised
# weights, so the online weights are copied explicitly to make each target
# network start out identical to the network it tracks.
actor_model = mp.Initialize.model_definition.NNmodel().actor_dense()
actor_target = keras.models.clone_model(actor_model)
actor_target.set_weights(actor_model.get_weights())

critic_model = mp.Initialize.model_definition.NNmodel().critic_dense()
critic_target = keras.models.clone_model(critic_model)
critic_target.set_weights(critic_model.get_weights())

# print(f'Actor NN \n {actor_model.summary()}')
# print(f'Critic NN\n {critic_model.summary()}')

"Replay_memory 정의"
# Replay memory: bounded deque that holds at most p.buffer_size transitions.
replay_memory = deque(maxlen=p.buffer_size)

# Action noise: zero mean, and the same standard deviation (p.std_dev)
# for each of the p.n_actions action components.
ou_noise = mp.Agent.noise.OUActionNoise(
    mean=np.zeros(p.n_actions),
    std_deviation=np.full(p.n_actions, float(p.std_dev)),
)

# Inference period (in episodes).
episode_inference = 100

# Model checkpoint period (in episodes) and whether checkpoints are written.
episode_save = 100
save_model = True

# Per-episode training rewards, inference rewards, and the run's start time.
reward_list = []
reward_inf_list = []
start = time.time()

# Main DDPG training loop. Each episode rolls out up to p.num_layer steps and
# stores every transition in the replay memory; afterwards training, inference,
# target-network updates and checkpointing each run on their own schedule.

# Time at which the previously finished episode ended. Bound before the loop so
# the per-episode timing below never reads an undefined name, even when the
# first episode runs through all steps without any `done` flag being set.
end1 = start

for episode in range(1, n_episodes + 1):

    s = mp.Env.Step.step_class()
    state = s.reset()

    step_ended = 0
    reward_episode = 0

    for step in range(1, p.num_layer + 1):

        actions = mp.Agent.policy.DDPG_policy(state, actor_model, ou_noise())

        next_state, reward, done, info, steps = s.step_func(actions, step, episode)
        replay_memory.append((state, actions, reward, next_state, done, steps))
        state = next_state
        reward_episode += np.average(reward)

        # The episode stops as soon as any entry of `done` is truthy.
        if any(done):
            step_ended = step
            reward_list.append(reward_episode)

            # Duration since the previous finished episode and cumulative run
            # time, both with the fractional seconds cut off.
            end2 = time.time()
            short1 = str(datetime.timedelta(seconds=(end2 - end1))).split(".")[0]
            short2 = str(datetime.timedelta(seconds=(end2 - start))).split(".")[0]
            end1 = end2

            # Append this episode's record to the text log.
            with open("episode_step_record.txt", 'a') as epistep_file:
                epistep_file.write(f' \n<episode: {episode}> Step ended: {step_ended} ')
                epistep_file.write(f"  Time per episode: {short1} (Total: {short2})\n")

            # Episode ended before the last layer: let the penalty helper
            # rewrite the stored transitions.
            # NOTE(review): the meaning of the arguments 1, 2 and
            # [-5, -3, 0, 1] is defined in mp.Train.replay_penalty - confirm there.
            if step_ended != p.num_layer:
                replay_memory = mp.Train.replay_penalty.penalty_reward(
                    replay_memory, info, step_ended, 1, 2, [-5, -3, 0, 1]
                )
            break

    # Once the replay memory is full, train the models at the end of each episode
    # (the "mean" variant; a per-point weight-update variant is not called here).
    if len(replay_memory) == p.buffer_size:
        mp.Train.model_training.training_step_mean_DDPG(
            actor_model, actor_target, critic_model, critic_target, replay_memory, episode
        )

    # Every `episode_inference` episodes: run inference, render the result and
    # pickle the resulting volume mesh.
    if episode % episode_inference == 0:
        volume_mesh_inf, reward_inf_mean = mp.Inference.inference.inference_step(
            actor_model, critic_model, episode
        )
        mp.Inference.render.render(volume_mesh_inf, episode)
        mp.Inference.graph.graph_plot().createFolder('volume_mesh')
        with open(f'volume_mesh/volume_mesh_{episode}.p', 'wb') as fr:
            pickle.dump(volume_mesh_inf, fr)

        reward_inf_list.append(reward_inf_mean)

    # Every `p.episode_target` episodes: soft-update the target networks,
    # passing p.tau as the update factor.
    if episode % p.episode_target == 0:
        mp.Train.target_update.soft_update(actor_target.variables, actor_model.variables, p.tau)
        mp.Train.target_update.soft_update(critic_target.variables, critic_model.variables, p.tau)

    # Every `episode_save` episodes (when saving is enabled): store the models,
    # the replay memory and the episode-reward plots.
    if save_model and episode % episode_save == 0:
        actor_model.save(f'model_storage/DDPG_actor_{p.mesh_name}_episode_{episode}')
        critic_model.save(f'model_storage/DDPG_critic_{p.mesh_name}_episode_{episode}')

        mp.Inference.graph.graph_plot().createFolder('replay_memory')
        with open(f'replay_memory/replay_memory_{episode}.p', 'wb') as fr:
            pickle.dump(replay_memory, fr)

        mp.Inference.graph.graph_plot().Episode_Reward_train_plot(reward_list, episode)
        mp.Inference.graph.graph_plot().Episode_Reward_inf_plot(reward_inf_list, episode)

# Report the total wall-clock time of the run.
print('Finish at: ', str(datetime.timedelta(seconds=(time.time() - start))))

INFO:tensorflow:Assets written to: model_storage/DDPG_actor_spline_1_1_episode_100\assets
INFO:tensorflow:Assets written to: model_storage/DDPG_critic_spline_1_1_episode_100\assets
INFO:tensorflow:Assets written to: model_storage/DDPG_actor_spline_1_1_episode_200\assets
INFO:tensorflow:Assets written to: model_storage/DDPG_critic_spline_1_1_episode_200\assets
INFO:tensorflow:Assets written to: model_storage/DDPG_actor_spline_1_1_episode_300\assets
INFO:tensorflow:Assets written to: model_storage/DDPG_critic_spline_1_1_episode_300\assets
INFO:tensorflow:Assets written to: model_storage/DDPG_actor_spline_1_1_episode_400\assets
INFO:tensorflow:Assets written to: model_storage/DDPG_critic_spline_1_1_episode_400\assets
INFO:tensorflow:Assets written to: model_storage/DDPG_actor_spline_1_1_episode_500\assets
INFO:tensorflow:Assets written to: model_storage/DDPG_critic_spline_1_1_episode_500\assets
INFO:tensorflow:Assets written to: model_storage/DDPG_actor_spline_1_1_episode_600\assets
INFO:

KeyboardInterrupt: 

In [None]:
# Show the configured layer count, which is the upper bound on steps per episode.
layer_count = mp.params.num_layer
print(layer_count)