## TO DO:
- JSON input for the model parameters has just been added and seems to be working.
- Update the test model to read its parameters from JSON as well.

## Importing Libraries

In [7]:
import json
import os
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import EagarTsai as et
from rl import CustomEnv
pd.set_option('display.max_rows', None)

## Initialising fixed parameters

In [8]:
"""Reading input file to get model parameter settings chosen by the user"""

# Load the user-chosen training settings. The context manager closes the file
# even if json.load raises on malformed input.
with open("model_params_train.json") as f:
    data = json.load(f)

# Assigning the values to variables so they can be used in the training and testing below

timestep = data['timestep']                 # Number of timesteps in an episode
max_steps = data['maximum_steps']           # Maximum number of timesteps across episodes in a single epoch
epochs = data['epochs']                     # Total number of epochs
model_alpha = data['model_alpha']           # Learning rate of the Bellman equation
model_gamma = data['model_gamma']           # Discount factor of the Bellman equation
model_epsilon = data['model_epsilon']       # Initial value for epsilon-greedy algorithm
# NOTE(review): bool() of any non-empty string is True, so this assumes the
# JSON stores a real boolean (or 0/1) rather than the text "False" - confirm.
store_data = bool(data['store_data'])       # Boolean value whether to store data locally or not

# The remaining variables dictate the minimum, maximum, and interval values for the three process parameters
min_power = data['min_power']
max_power = data['max_power']
interval_power = data['interval_power']
min_speed = data['min_speed']
max_speed = data['max_speed']
interval_speed = data['interval_speed']
min_hatch = data['min_hatch']
max_hatch = data['max_hatch']
interval_hatch = data['interval_hatch']

## Creating parameter combinations

In [15]:
# Candidate values for each process parameter. np.arange excludes its stop
# value, so a small offset is added to each maximum (apparently so that the
# maximum itself can be part of the grid).
power = np.arange(min_power, max_power + 1, interval_power)
speed = np.arange(min_speed, max_speed + 1, interval_speed)
hatch = np.arange(min_hatch, max_hatch + 0.001, interval_hatch)

# Every (power, speed, hatch) combination: power varies slowest and hatch
# fastest, so a combination is identified by its position in this list.
parameters = [(p, s, h) for p in power for s in speed for h in hatch]

In [16]:
# Number of (power, speed, hatch) combinations in the parameter list
len(parameters)

1650

## Running training model

#### During training, the agent uses the epsilon-greedy algorithm to explore the state space in search of the state with the highest reward. The epsilon value controls how often a random action is taken; over time the agent exploits more often, choosing the best known action instead of a random one.

In [5]:
# Empty summary table; the training loop appends one row per epoch.
train_results = pd.DataFrame(
    columns = [
        'timesteps',
        'test number',
        'alpha',
        'gamma',
        'total steps',
        'steps to optimal',
        'optimal state',
        'reward',
        'reward per episode',
        'time taken',
        'no. of states visited',
    ]
)

In [6]:
# Run `epochs` independent training epochs. Each epoch creates a fresh
# CustomEnv and keeps starting new episodes until the environment's step
# counter reaches max_steps; one summary row per epoch is appended to
# train_results.
start_time = time.perf_counter()         # Beginning timer for the entire training

# DataFrame.to_excel does not create missing folders, so make sure the output
# folder exists up front instead of failing after an epoch has already run.
if store_data:
    os.makedirs('Results', exist_ok = True)

for j in range(0, epochs):               # For each epoch
    t1 = time.perf_counter()             # Beginning timer for the epoch

    # Creating instance of the environment
    env = CustomEnv(parameters, timestep, model_alpha, model_gamma, model_epsilon)

    episode = 0                          # Initialising episode number to 0
    states = set()                       # Distinct states visited in the epoch
    states_visited = []                  # States visited in the current episode, in order
    epoch_reward = 0                     # Total value of rewards in the epoch
    episode_rewards = []                 # List containing each episode's total reward

    # As long as the maximum timesteps in an epoch is not exceeded
    while(env.steps < max_steps):
        env.reset(timestep)              # Resetting the environment
        done = False                     # Setting done to be False so the episode restarts
        episode += 1
        total_reward = 0                 # Episode reward initialised to zero before start of episode

        while not done:                  # For each step of the episode
            reward, done = env.step(power, speed, hatch)
            total_reward += reward
            epoch_reward += reward
            states_visited.append(env.state)
            states.add(env.state)

        print('States visited in episode ', episode, 'of test', j+1, 'are ', states_visited)

        episode_rewards.append(total_reward)
        print('------------------')
        states_visited.clear()           # Start the next episode with an empty trace

    t2 = time.perf_counter()             # Ending timer for the epoch
    time_taken = t2 - t1                 # Calculating time taken for the epoch to run

    # Summary row for this epoch, in the column order of train_results
    row = pd.Series([timestep, j + 1, env.alpha, env.gamma, env.steps, env.optimal_steps, env.optimal_state, \
                     env.rmax, epoch_reward / episode, time_taken, len(states)], \
                    index = train_results.columns)
    train_results.loc[len(train_results)] = row

    # If the user opts to store the results, then for each epoch the individual step results and the overall
    # train results will be stored in separate excel files.
    if store_data:
        env.results.to_excel(f'Results//Epoch {j + 1} Step Results.xlsx')
        train_results.sort_values('reward', ascending = False).to_excel(f'Results//Epoch {j + 1} Train Results.xlsx')

end_time = time.perf_counter()           # Ending timer for entire training

time_elapsed = end_time - start_time     # Calculating time taken for the whole training

States visited in episode  1 of test 1 are  [632, 533, 634, 745, 744, 865, 876, 977, 1088, 989, 878, 768, 767, 758, 858, 748, 627, 516, 526, 527, 626, 727, 736, 856, 955, 1074, 1174, 1075, 956, 847, 746, 756, 657, 546, 436, 547, 648, 539, 429, 439, 429, 538, 529, 538, 419, 519, 529, 409, 519, 399, 279, 288, 408, 399, 499, 508, 617, 507, 518, 617, 517, 516, 635, 746, 745, 645, 546, 436, 545, 436, 546, 657, 648, 649, 769, 659, 658, 549, 659, 538, 637, 746, 736, 737, 738, 737, 846, 955, 1074, 1063, 1062, 942, 833, 843, 724, 734, 724, 603, 712, 832, 931, 1051, 1161, 1052, 1162, 1281, 1292, 1282, 1383, 1483, 1604, 1503, 1612, 1512, 1401, 1290, 1300, 1190, 1301, 1311, 1420, 1531, 1412, 1292, 1303, 1403, 1503, 1612, 1601, 1590, 1490, 1501, 1600, 1611, 1612, 1622, 1633, 1644, 1645, 1644, 1643, 1644, 1633, 1632, 1643, 1632, 1521, 1532, 1412, 1531, 1640, 1641, 1640, 1641, 1640, 1630, 1520, 1421, 1411, 1421, 1300, 1190, 1311, 1201, 1192, 1193, 1183, 1293, 1194, 1083, 983, 864, 985, 1096, 1205, 10

In [7]:
# Report the wall-clock duration of the run in minutes
print(f'Elapsed Time is {time_elapsed / 60} minutes')

Elapsed Time is 1.5071835900000001 minutes


In [12]:
"""Importing results from an Excel sheet"""

# Uncomment to load previously saved training results instead of re-running the training.
#train_results = pd.read_excel('Tugrul Results.xlsx')

'Exporting results to an Excel sheet'

In [13]:
"""Exporting results to an Excel sheet"""

# Uncomment to save the training results table to Excel without its index column.
#train_results.to_excel('Pre-Paper Results.xlsx', index = False)

'Importing results from an Excel sheet'

In [8]:
"""Displaying training details for each epoch"""

# Epoch summaries ordered from highest to lowest reward
train_results.sort_values(by = 'reward', ascending = False)

Unnamed: 0,timesteps,test number,alpha,gamma,total steps,steps to optimal,optimal state,reward,reward per episode,time taken,no. of states visited
0,200.0,1.0,0.2,0.5,1400.0,1008.0,1610.0,2.073876,55.486202,125.410695,763.0
2,200.0,3.0,0.2,0.5,1400.0,856.0,1580.0,1.863562,90.358565,121.899498,722.0
1,200.0,2.0,0.2,0.5,1400.0,1078.0,1350.0,1.858433,59.349341,124.341999,802.0


In [22]:
"""Printing best parameter configuration from each epoch"""

print("Optimal parameter configurations (P, v, h):\n")
# Iterate over the rows that were actually recorded rather than range(epochs):
# `epochs` is reassigned when the test settings are loaded, so once that cell
# has run it may no longer match the number of training rows.
for i, opt_state in enumerate(train_results['optimal state']):
    # The stored optimal state is used as an index into the parameter list
    print(f"Epoch {i + 1}: {parameters[int(opt_state)]}")

Optimal parameter configurations (P, v, h):

Epoch 1: (400, 2325, 0.0375)
Epoch 2: (400, 2325, 0.0375)
Epoch 3: (400, 2325, 0.0375)


In [11]:
# Step-by-step results table held by `env`, the environment created in the most recent training epoch
env.results

Unnamed: 0,power,speed,hatch,pi1,pi2,ved,state,reward,steps,episode
0,375.0,1.2,0.000125,14.578285,2000.0,83.333333,1457.0,0.272832,1.0,3.0
1,400.0,0.975,0.000125,17.599649,2000.0,109.401709,1557.0,0.507687,2.0,3.0
2,375.0,0.75,0.000138,17.73578,2000.0,121.212121,1438.0,0.510425,3.0,3.0
3,400.0,0.75,0.00015,17.034401,2000.0,118.518519,1549.0,0.465929,4.0,3.0
4,375.0,0.75,0.00015,16.257799,2000.0,111.111111,1439.0,0.421513,5.0,3.0
5,400.0,0.975,0.00015,14.666374,2000.0,91.168091,1559.0,0.313471,6.0,3.0
6,400.0,0.75,0.00015,17.034401,2000.0,118.518519,1549.0,0.465929,7.0,3.0
7,400.0,0.975,0.000138,15.999681,2000.0,99.456099,1558.0,0.400608,8.0,3.0
8,375.0,0.75,0.00015,16.257799,2000.0,111.111111,1439.0,0.421513,9.0,3.0
9,400.0,0.75,0.00015,17.034401,2000.0,118.518519,1549.0,0.465929,10.0,3.0


In [33]:
"""Storing Q table as an Excel file if required for testing"""

# Uncomment the three statements below to write the trained Q-table to
# 'Qtable_train.xlsx', the file name the test run looks for.
# NOTE(review): the table is transposed before saving and to_excel writes the
# index as an extra column by default - confirm the code that reads the file
# back expects that layout.

# df = pd.DataFrame(data=env.qtable)

# df = (df.T)

# df.to_excel('Qtable_train.xlsx')

### Test Run

In [17]:
"""If a specific Qtable is to be used, then import it here, otherwise the one from the train model is used"""
filename = "Qtable_train.xlsx"
try:
    qtable = pd.read_excel(filename)
    print("Using imported Q-table")
except FileNotFoundError:
    # Only a missing file means "no saved Q-table". Any other failure (corrupt
    # workbook, missing Excel engine, interrupt) is allowed to surface instead
    # of silently testing with a different table than the user asked for.
    # NOTE(review): the imported table is a DataFrame while env.qtable is used
    # as-is - confirm CustomEnv accepts both forms.
    qtable = env.qtable
    print("Using Q-table from train model")

Using imported Q-table


In [18]:
"""Reading input file to get model parameter settings chosen by the user for testing"""

# Load the user-chosen test settings. The context manager closes the file
# even if json.load raises on malformed input.
with open("model_params_test.json") as f:
    data = json.load(f)

# Assigning the values to variables so they can be used in the testing below.
# These overwrite the variables of the same name set from the training settings.

timestep = data['timestep']                 # Number of timesteps in an episode
max_steps = data['maximum_steps']           # Maximum number of timesteps across episodes in a single epoch
epochs = data['epochs']                     # Total number of epochs
model_alpha = data['model_alpha']           # Learning rate of the Bellman equation
model_gamma = data['model_gamma']           # Discount factor of the Bellman equation
model_epsilon = data['model_epsilon']       # Initial value for epsilon-greedy algorithm
# NOTE(review): bool() of any non-empty string is True, so this assumes the
# JSON stores a real boolean (or 0/1) rather than the text "False" - confirm.
store_data = bool(data['store_data'])       # Boolean value whether to store data locally or not

In [19]:
# Empty summary table; the test loop appends one row per test epoch.
test_results = pd.DataFrame(
    columns = [
        'timestep',
        'test number',
        'alpha',
        'gamma',
        'total steps',
        'steps to optimal',
        'optimal state',
        'reward',
        'reward per episode',
        'time taken',
        'number of states',
    ]
)

In [20]:
# Table that records only the total reward of each episode. Rows are episodes
# 1 to 5; the test loop adds one column per test epoch by assigning a plain
# list, so that list must contain exactly five rewards.
test_reward_table = pd.DataFrame(index = np.arange(1, 6, 1)).rename_axis('Episode Number')

In [22]:
# Run `epochs` independent test epochs. Each epoch creates a fresh CustomEnv
# that is handed the selected Q-table and keeps starting new episodes until
# the environment's step counter reaches max_steps; one summary row per epoch
# is appended to test_results.
start_time = time.perf_counter()         # Beginning timer for the entire test run

# DataFrame.to_excel does not create missing folders, so make sure the output
# folder exists up front instead of failing after an epoch has already run.
if store_data:
    os.makedirs('Test Results', exist_ok = True)

for j in range(0,epochs):                # For each epoch
    t1 = time.perf_counter()             # Beginning timer for the epoch

    # Creating instance of the environment, passing in the Q-table chosen earlier
    env_test = CustomEnv(parameters, timestep, model_alpha, model_gamma, model_epsilon, qtable = qtable)

    episode = 0                          # Initialising episode number to 0
    states = set()                       # Distinct states visited in the epoch
    states_visited = []                  # States visited in the current episode, in order
    epoch_reward = 0                     # Total value of rewards in the epoch
    episode_rewards = []                 # List containing each episode's total reward

    # As long as the maximum timesteps in an epoch is not exceeded
    while(env_test.steps < max_steps):
        env_test.reset(timestep)         # Resetting the environment
        done = False                     # Setting done to be False so the episode restarts
        episode += 1
        total_reward = 0                 # Episode reward initialised to zero before start of episode

        while not done:                  # For each step of the episode
            reward, done = env_test.step(power, speed, hatch, test = True)
            total_reward += reward
            epoch_reward += reward
            states_visited.append(env_test.state)
            states.add(env_test.state)
        print('States visited in episode ', episode, 'of test', j+1, 'are ', states_visited)

        episode_rewards.append(total_reward)
        print('------------------')
        states_visited.clear()           # Start the next episode with an empty trace

    t2 = time.perf_counter()             # Ending timer for the epoch
    time_taken = t2 - t1                 # Calculating time taken for the epoch to run

    # One column of per-episode rewards per test epoch; the list length must
    # equal the number of rows in test_reward_table.
    test_reward_table[f'test {j + 1} rewards'] = episode_rewards
    # Summary row for this epoch, in the column order of test_results
    row = pd.Series([timestep, j + 1, env_test.alpha, env_test.gamma, env_test.steps, env_test.optimal_steps,\
                     env_test.optimal_state, env_test.rmax, epoch_reward / episode, time_taken, \
                     len(states)], index = test_results.columns)
    test_results.loc[len(test_results)] = row

    # If the user opts to store the results, then for each epoch the individual step results and the overall
    # test results will be stored in separate excel files.
    if store_data:
        # Save the step results of the test environment, not of the training environment `env`
        env_test.results.to_excel(f'Test Results//Epoch {j + 1} Step Results.xlsx')
        test_results.sort_values('reward', ascending = False).to_excel(f'Test Results//Epoch {j + 1} Test Results.xlsx')


end_time = time.perf_counter()           # Ending timer for entire test run

time_elapsed = end_time - start_time     # Calculating time taken for the whole test run

States visited in episode  1 of test 1 are  [638, 759, 659, 549, 439, 329, 328, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 109, 219, 218, 

States visited in episode  3 of test 2 are  [1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1215, 1336, 1437, 1447, 1436, 1547, 1548, 1438, 1327, 1226, 1225, 1

In [24]:
# Report the wall-clock duration of the run in minutes
print(f'Elapsed Time is {time_elapsed / 60} minutes')

Elapsed Time is 3.4283805583333256 minutes


In [14]:
#test_reward_table.to_excel('Test Reward table 3.xlsx', index = False)

In [26]:
# Test epoch summaries ordered from highest to lowest reward
test_results.sort_values(by = 'reward', ascending = False)

Unnamed: 0,timestep,test number,alpha,gamma,total steps,steps to optimal,optimal state,reward,reward per episode,time taken,number of states
0,200.0,1.0,0.2,0.5,1000.0,604.0,1610.0,2.073876,232.245755,85.719644,50.0
1,200.0,2.0,0.2,0.5,1000.0,3.0,880.0,1.84263,269.163895,119.980732,40.0


In [25]:
"""Printing best parameter configuration from each epoch"""

print("Optimal parameter configurations (P, v, h):\n")
# Iterate over the rows that were actually recorded rather than range(epochs),
# so the listing always matches the contents of test_results even if `epochs`
# has been changed since the test run.
for i, opt_state in enumerate(test_results['optimal state']):
    # The stored optimal state is used as an index into the parameter list
    print(f"Epoch {i + 1}: {parameters[int(opt_state)]}")

Optimal parameter configurations (P, v, h):

Epoch 1: (400, 2325, 0.0375)
Epoch 2: (250, 750, 0.0375)
