# AI Workshop Project: Pong

In [None]:
import random
import gym
import time
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.optimizers import Adam
from skimage.transform import resize
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.decomposition import PCA

Sequential model

In [None]:
def cur_model(num_Conv2D,num_filters,num_kernel,encode_depth,num_encoded_units,units_decrease,num_lstm_units,num_frames):
  """Build and compile a feed-forward Keras network with one sigmoid output.

  Layer order: `num_Conv2D` ReLU Conv2D layers, a Flatten layer,
  `encode_depth` pairs of Dense + LeakyReLU whose widths shrink linearly,
  and a final single-unit sigmoid Dense layer. The model is compiled with
  mean-squared-error loss and a default Adam optimizer.

  Args:
    num_Conv2D: number of Conv2D layers to stack in front (0 for none).
    num_filters: filter count of every Conv2D layer.
    num_kernel: kernel size of every Conv2D layer.
    encode_depth: number of Dense + LeakyReLU pairs.
    num_encoded_units: width of the first Dense layer.
    units_decrease: how many units narrower each following Dense layer is.
    num_lstm_units: not used by the current architecture.
    num_frames: not used by the current architecture.

  Returns:
    The compiled Sequential model.
  """
  # Width of the k-th Dense layer is num_encoded_units - k * units_decrease.
  dense_widths = [num_encoded_units - units_decrease * depth for depth in range(encode_depth)]

  network = Sequential()

  for _ in range(num_Conv2D):
    # channels_last means inputs are shaped (batch, height, width, channels).
    conv_layer = Conv2D(num_filters, kernel_size=num_kernel, data_format='channels_last', activation='relu')
    network.add(conv_layer)

  network.add(Flatten())

  for width in dense_widths:
    network.add(Dense(width))
    network.add(LeakyReLU())

  network.add(Dense(1, activation='sigmoid'))
  network.compile(loss='mean_squared_error', optimizer=Adam())
  return network

In [None]:
def _preprocess_frame(obs):
  """Turn one RGB frame into a normalised, downsampled single-channel image."""
  # Summing the colour channels and dividing by 3 * 255 scales intensities
  # into [0, 1] (assuming 8-bit channels); keepdims keeps a trailing channel
  # axis so the frame stays 3-D for any frame height/width.
  grey = np.sum(obs, axis=2, keepdims=True).astype('float') / (255 * 3)
  return resize(grey, (26, 20))


def _reset_env(env):
  """Reset `env` and return the initial observation under either Gym reset API."""
  result = env.reset()
  # Newer Gym releases return (observation, info); older ones return only
  # the observation.
  if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
    return result[0]
  return result


def reinforcement(env, models, n_games=1, epochs=1, epsilon_decrease=0.1, backsight=2, discount_factor=0.9):
  """Play `n_games` episodes and record (state, discounted-return) samples.

  Actions are currently always sampled uniformly from `env.action_space`;
  the model-driven (epsilon-greedy) policy is kept in the code but switched
  off. No training happens here: the function only collects data.

  Args:
    env: Gym environment with image observations and a discrete action space.
    models: one value model per action; only consulted by the disabled
      model-driven policy.
    n_games: number of episodes to play.
    epochs: unused; kept for interface compatibility.
    epsilon_decrease: amount subtracted from epsilon after every game
      (epsilon is clamped at 0).
    backsight: number of most recent preprocessed frames that make up a state.
    discount_factor: per-step discount applied to future rewards.

  Returns:
    A tuple `(states, values)`. `states` is a float16 array holding one
    stack of `backsight` preprocessed frames per step. `values` has one row
    per step and one column per action; the column of the action taken
    holds the discounted sum of the rewards received from that step to the
    end of the game, every other column is 0.

  Raises:
    ValueError: if `backsight` is smaller than 1.
  """
  if backsight < 1:
    raise ValueError('backsight must be at least 1')

  # Data is collected with a purely random policy. Flip this switch to let
  # the models choose the action whenever random.random() exceeds epsilon.
  use_model_policy = False

  epsilon = 1 # probability of exploring with a random action
  state_history = []
  value_history = []
  obs = _reset_env(env) # start from the real first frame of the game

  for game_number in range(n_games):
    states = []
    actions = []
    rewards = []
    # The window starts out filled with copies of the first frame.
    memory = [_preprocess_frame(obs)] * backsight
    frames = 0

    while True:
      if use_model_policy and random.random() > epsilon:
        action = np.argmax([model.predict(np.array([memory], dtype='float16'))[0][0] for model in models])
      else:
        action = env.action_space.sample()

      step_result = env.step(action)
      if len(step_result) == 5:
        # Newer Gym API: separate terminated / truncated flags.
        obs, reward, terminated, truncated, _info = step_result
        done = terminated or truncated
      else:
        obs, reward, done, _info = step_result

      # Slide the window by building a new list, then store an independent
      # snapshot so later steps cannot overwrite states already recorded.
      memory = memory[1:] + [_preprocess_frame(obs)]
      states.append(np.array(memory, dtype='float16'))
      actions.append(action)
      rewards.append(reward)
      frames += 1

      if done:
        print('Game', game_number, 'lasted', frames, 'frames')
        obs = _reset_env(env) # first frame of the next game
        break

    # Credit every step with the discounted sum of its own and all later
    # rewards, computed in one backward pass over the episode.
    values = [[0.0] * env.action_space.n for _ in actions]
    discounted_return = 0.0
    for step in reversed(range(len(actions))):
      discounted_return = rewards[step] + discount_factor * discounted_return
      values[step][actions[step]] = discounted_return

    state_history.extend(states)
    value_history.extend(values)

    epsilon = max(0, epsilon - epsilon_decrease) # decays towards 0 game by game

  return np.array(state_history, dtype='float16'), np.array(value_history)

In [None]:
!pip install patool

In [None]:
!wget -nc http://www.atarimania.com/roms/Roms.rar

In [None]:
import patoolib
from os import mkdir
from os.path import exists
from shutil import rmtree

# Start from an empty ROM directory so files from a previous run do not linger,
# then unpack the downloaded archive into it.
roms_dir = 'roms'

if exists(roms_dir):
  rmtree(roms_dir)
mkdir(roms_dir)

patoolib.extract_archive("Roms.rar", outdir=roms_dir)

In [None]:
!ale-import-roms roms

In [None]:

# Pong environment (difficulty setting 0) and one value network per discrete action.
env = gym.make("Pong-v4", difficulty=0)
models = [
  cur_model(
    num_Conv2D=0,
    num_filters=8,
    num_kernel=3,
    encode_depth=8,
    num_encoded_units=500,
    units_decrease=62,
    num_lstm_units=64,
    num_frames=8,
  )
  for _ in range(env.action_space.n)
]

In [None]:
# Show the shape of a raw observation sampled from the environment's observation space.
np.array(env.observation_space.sample()).shape

In [None]:
# Play 8 games and keep the recorded states (X) and per-action value targets (Y).
X, Y = reinforcement(env, models, n_games=8, epochs=1, epsilon_decrease=0.01)
# Uncomment to load a previously saved dataset instead of collecting a new one.
#X = np.load('X.npy')
#Y = np.load('Y.npy')

In [None]:
#np.save('X.npy', X)
#np.save('Y.npy', Y)
state_history = X
value_history = Y

# Only the column of the action that was taken is ever written in a value
# row, so the column with the largest magnitude identifies that action
# (an all-zero row falls into action 0).
action_indices = np.argmax(np.abs(value_history), axis=1)
n_actions = len(models)

# Split the samples into one bucket per action, preserving their order.
state_history_per_action = [state_history[action_indices == i] for i in range(n_actions)]
value_history_per_action = [value_history[action_indices == i, i].reshape(-1, 1) for i in range(n_actions)]

print([len(hist) for hist in value_history_per_action])

# One scaler over all targets so every model is trained on the same [0, 1] scale.
scaler = MinMaxScaler()
scaler.fit(np.concatenate(value_history_per_action))

concatenated_state_history = np.concatenate(state_history_per_action)
flat_state_history = concatenated_state_history.reshape(len(concatenated_state_history), -1)
# PCA cannot keep more components than there are samples or features.
transformer = PCA(min(500, *flat_state_history.shape))
print('Fitting transformer')
transformer.fit(flat_state_history)
print('Fit complete')

for i in range(n_actions):
    if len(value_history_per_action[i]) == 0:
        # This action was never sampled: nothing to scale or project.
        print('Skipped', i, '(no samples)')
        continue
    value_history_per_action[i] = scaler.transform(value_history_per_action[i]).flatten()
    action_state_history = state_history_per_action[i]
    state_history_per_action[i] = transformer.transform(action_state_history.reshape(len(action_state_history), -1))
    print('Transformed', i)

for i, model in enumerate(models):
    if len(value_history_per_action[i]) == 0:
        continue # no data for this action, leave its model untrained
    model.fit(np.array(state_history_per_action[i], dtype='float16'), np.array(value_history_per_action[i]), epochs=4)

In [None]:
# Persist every per-action model under model/<action index>.
for action_index, action_model in enumerate(models):
    save_path = 'model/' + str(action_index)
    action_model.save(save_path)

In [None]:
from pickle import dump

# Persist the fitted transformer; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('transformer.pkl', 'wb') as transformer_file:
    dump(transformer, transformer_file)