# t-SNE Visualization Tool

## 1) Collect trajectories

### a) Create basis trajectories:

In [19]:
import numpy as np
import os
import sys
sys.path.append(os.path.abspath('../'))

from environments.gym_parser import parse_gym_environment
from rl_algorithms import rockAgent, paperAgent, scissorsAgent, randomAgent

def RPSenv():
    """Create and return a new ``RockPaperScissors-v0`` gym environment.

    Both imports are local to the function, so ``gym`` is only required
    when an environment is actually built.
    """
    import gym
    # The module name is not referenced below; presumably the import itself
    # registers the 'RockPaperScissors-v0' id with gym -- TODO confirm.
    import gym_rock_paper_scissors

    environment = gym.make('RockPaperScissors-v0')
    return environment

def RPSTask(RPSenv):
    """Return the result of ``parse_gym_environment`` for the given environment.

    Note that the parameter is named like the ``RPSenv`` factory function and
    shadows it inside this body; it is expected to be an environment
    *instance* (e.g. ``RPSenv()``), not the factory itself.
    """
    task = parse_gym_environment(RPSenv)
    return task


In [20]:
from tqdm import tqdm
from multiagent_loops import simultaneous_action_rl_loop


def collect_basis_trajectories_for(env, agents, fixed_opponents, nbr_episodes_matchup):
    """Play every agent against every fixed opponent and gather the episodes.

    Args:
        env: environment handed unchanged to ``simulate``.
        agents: iterable of agents; each must expose a ``name`` attribute.
        fixed_opponents: iterable of opponents; each must expose ``name``.
        nbr_episodes_matchup: number of episodes simulated per matchup.

    Returns:
        A dict of three parallel lists, one entry per collected episode:
        ``'agent'`` (name of the *fixed opponent*), ``'opponent'`` (name of
        the agent taken from ``agents``) and ``'trajectory'``.
    """
    trajs = {'agent': [], 'opponent': [], 'trajectory': []}

    progress_bar = tqdm(fixed_opponents)
    for fixed_opponent in progress_bar:
        for agent in agents:
            # Label the bar before the matchup runs. Doing it after the inner
            # loop showed a stale matchup and failed when `agents` was empty.
            progress_bar.set_description(
                f'Collecting trajectories: {agent.name} against {fixed_opponent.name}.')
            matchup_trajectories = simulate(env, agent, fixed_opponent,
                                            episodes=nbr_episodes_matchup,
                                            training=False)
            for t in matchup_trajectories:
                # NOTE(review): the 'agent' column stores the fixed opponent's
                # name and 'opponent' the learning agent's name. Later cells
                # use 'agent' as the class label, so this looks deliberate --
                # confirm before swapping.
                trajs['agent'].append(fixed_opponent.name)
                trajs['opponent'].append(agent.name)
                trajs['trajectory'].append(t)
    return trajs

def simulate(env, agent, fixed_opponent, episodes, training):
    """Run ``episodes`` episodes of ``agent`` versus ``fixed_opponent``.

    Each episode is produced by ``simultaneous_action_rl_loop.run_episode``
    with the agent vector ``[agent, fixed_opponent]``; the ``training`` flag
    is forwarded unchanged. Returns the list of per-episode results in the
    order they were played.
    """
    if training:
        mode = 'Training'
    else:
        mode = 'Inference'

    players = [agent, fixed_opponent]
    collected = []
    episode_bar = tqdm(range(episodes))
    for _ in episode_bar:
        collected.append(
            simultaneous_action_rl_loop.run_episode(env, players, training=training))
        episode_bar.set_description(f'{mode} {agent.name} against {fixed_opponent.name}')
    return collected


In [117]:
# Basis set: the random agent plays 100 inference episodes against each of the
# three fixed agents (3 matchups x 100 episodes).
trajectories = collect_basis_trajectories_for(RPSenv(), 
                                              [randomAgent],
                                             [rockAgent, paperAgent, scissorsAgent],
                                             nbr_episodes_matchup=100)


  0%|          | 0/3 [00:00<?, ?it/s][A

  0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against RockAgent:   0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against RockAgent:   0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against RockAgent:   0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against RockAgent:   0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against RockAgent:   0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against RockAgent:   0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against RockAgent:   0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against RockAgent:   0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against RockAgent:   0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against RockAgent:   0%|          | 0/100 [00:00<?, ?it/s][A[A

Inference RandomAgent against 

Inference RandomAgent against RockAgent:  64%|██████▍   | 64/100 [00:00<00:00, 172.51it/s][A[A

Inference RandomAgent against RockAgent:  64%|██████▍   | 64/100 [00:00<00:00, 170.88it/s][A[A

Inference RandomAgent against RockAgent:  64%|██████▍   | 64/100 [00:00<00:00, 169.23it/s][A[A

Inference RandomAgent against RockAgent:  64%|██████▍   | 64/100 [00:00<00:00, 167.61it/s][A[A

Inference RandomAgent against RockAgent:  64%|██████▍   | 64/100 [00:00<00:00, 165.99it/s][A[A

Inference RandomAgent against RockAgent:  64%|██████▍   | 64/100 [00:00<00:00, 164.38it/s][A[A

Inference RandomAgent against RockAgent:  64%|██████▍   | 64/100 [00:00<00:00, 163.00it/s][A[A

Inference RandomAgent against RockAgent:  64%|██████▍   | 64/100 [00:00<00:00, 161.52it/s][A[A

Inference RandomAgent against RockAgent:  64%|██████▍   | 64/100 [00:00<00:00, 160.20it/s][A[A

Inference RandomAgent against RockAgent:  64%|██████▍   | 64/100 [00:00<00:00, 158.91it/s][A[A

Inference RandomAgen

Inference RandomAgent against PaperAgent:  47%|████▋     | 47/100 [00:00<00:00, 176.51it/s][A[A

Inference RandomAgent against PaperAgent:  47%|████▋     | 47/100 [00:00<00:00, 173.68it/s][A[A

Inference RandomAgent against PaperAgent:  47%|████▋     | 47/100 [00:00<00:00, 171.11it/s][A[A

Inference RandomAgent against PaperAgent:  47%|████▋     | 47/100 [00:00<00:00, 168.37it/s][A[A

Inference RandomAgent against PaperAgent:  47%|████▋     | 47/100 [00:00<00:00, 164.84it/s][A[A

Inference RandomAgent against PaperAgent:  47%|████▋     | 47/100 [00:00<00:00, 161.89it/s][A[A

Inference RandomAgent against PaperAgent:  47%|████▋     | 47/100 [00:00<00:00, 159.09it/s][A[A

Inference RandomAgent against PaperAgent:  47%|████▋     | 47/100 [00:00<00:00, 156.76it/s][A[A

Inference RandomAgent against PaperAgent:  47%|████▋     | 47/100 [00:00<00:00, 152.31it/s][A[A

Inference RandomAgent against PaperAgent:  69%|██████▉   | 69/100 [00:00<00:00, 221.98it/s][A[A

Inference 

Inference RandomAgent against ScissorsAgent:  19%|█▉        | 19/100 [00:00<00:00, 99.47it/s] [A[A

Inference RandomAgent against ScissorsAgent:  19%|█▉        | 19/100 [00:00<00:00, 97.50it/s][A[A

Inference RandomAgent against ScissorsAgent:  19%|█▉        | 19/100 [00:00<00:00, 95.78it/s][A[A

Inference RandomAgent against ScissorsAgent:  19%|█▉        | 19/100 [00:00<00:00, 94.11it/s][A[A

Inference RandomAgent against ScissorsAgent:  40%|████      | 40/100 [00:00<00:00, 196.35it/s][A[A

Inference RandomAgent against ScissorsAgent:  40%|████      | 40/100 [00:00<00:00, 193.41it/s][A[A

Inference RandomAgent against ScissorsAgent:  40%|████      | 40/100 [00:00<00:00, 190.29it/s][A[A

Inference RandomAgent against ScissorsAgent:  40%|████      | 40/100 [00:00<00:00, 186.17it/s][A[A

Inference RandomAgent against ScissorsAgent:  40%|████      | 40/100 [00:00<00:00, 182.66it/s][A[A

Inference RandomAgent against ScissorsAgent:  40%|████      | 40/100 [00:00<00:00, 17

In [118]:
# Sanity check: show which columns were collected.
trajectories.keys()


dict_keys(['agent', 'opponent', 'trajectory'])

In [119]:
# Sanity check: one trajectory entry is stored per simulated episode.
len(trajectories['trajectory'])


300

### b) Collect trajectories from training:

In [120]:
def all_files_in_directory(directory):
    """Return the joined paths of the regular files directly inside ``directory``.

    Sub-directories are not descended into. The order is whatever
    ``os.listdir`` yields, i.e. arbitrary.
    """
    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
    return [path for path in candidates if os.path.isfile(path)]


def all_folders_in_directory(directory):
    """Return the joined paths of the sub-directories directly inside ``directory``.

    Only real directories are returned: entries that are neither a file nor a
    directory (e.g. broken symlinks) are skipped rather than reported as
    folders. The order follows ``os.listdir`` and is arbitrary.
    """
    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
    return [path for path in candidates if os.path.isdir(path)]

def get_agent_name_from_full_path(filename):
    """Return the last component of ``filename`` with its extension removed.

    Uses ``os.path.basename`` instead of splitting on a literal ``'/'`` so the
    result is also correct where ``os.path.join`` emits another separator.
    Note that ``os.path.splitext`` strips whatever follows the last dot of the
    final component, so a folder name containing a dot is truncated there.
    """
    stem, _extension = os.path.splitext(filename)
    return os.path.basename(stem)

def get_episode_number_from_full_path(filename):
    """Return the file stem of ``filename`` (last component, extension removed).

    Despite the name, the result is a string and not an integer: for
    ``.../trajectories/episode_821.traj`` it is ``'episode_821'``. The return
    type is kept as-is so existing callers keep working.

    ``os.path.basename`` replaces the split on a literal ``'/'`` so the
    function also works where the path separator is not ``'/'``.
    """
    stem, _extension = os.path.splitext(filename)
    return os.path.basename(stem)

def collect_trajectories_from(run_dir, with_policies=False):
    """Load the pickled trajectories stored under ``<run_dir>/menageries``.

    Every sub-folder of the menageries directory is treated as one agent; its
    ``trajectories`` sub-folder is expected to contain one pickle file per
    episode.

    Args:
        run_dir: path of a single experiment run.
        with_policies: when true, every file sitting directly in an agent
            folder is additionally loaded through ``AgentHook.unhook``.

    Returns:
        ``(trajs, policies)``. ``trajs`` is a dict of four parallel lists:
        ``'agent'`` and ``'opponent'`` (both the agent folder's name),
        ``'trajectory'`` (the unpickled object) and ``'episode'`` (the
        trajectory file's stem). ``policies`` is the list of unhooked
        policies, empty unless ``with_policies`` is true.
    """
    # Imported here because the notebook only imports pickle in a later cell.
    # SECURITY: pickle executes arbitrary code on load -- only point this at
    # experiment folders you trust.
    import pickle

    menageries_dir = os.path.join(run_dir, 'menageries')
    trajs = {'agent': [], 'opponent': [], 'trajectory': [], 'episode': []}
    policies = []

    for folder in all_folders_in_directory(menageries_dir):
        agent_name = get_agent_name_from_full_path(folder)

        if with_policies:
            # NOTE(review): AgentHook is not imported in any visible cell;
            # it must be in scope before calling with with_policies=True.
            for policy_file in tqdm(all_files_in_directory(folder)):
                policies.append(AgentHook.unhook(path=policy_file))

        trajectory_folder = os.path.join(folder, 'trajectories')
        for trajectory_file in tqdm(all_files_in_directory(trajectory_folder)):
            # tqdm.write prints the path without breaking the progress bar.
            tqdm.write(trajectory_file)
            with open(trajectory_file, 'rb') as handle:
                traj = pickle.load(handle)
            trajs['agent'].append(agent_name)
            # The same name is stored for both sides of the matchup.
            trajs['opponent'].append(agent_name)
            trajs['trajectory'].append(traj)
            trajs['episode'].append(get_episode_number_from_full_path(trajectory_file))

    return trajs, policies
        

In [121]:
# Root folder of one experiment. This absolute path is machine-specific and
# must be adapted before re-running the notebook elsewhere.
source_dir = "/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/"
# Only the run stored under "run-0" is loaded.
run_dir = os.path.join(source_dir, "run-0")
# Example of a trajectory folder: run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories
# with_policies=False: only trajectories are loaded, `policies` comes back empty.
trajs, policies = collect_trajectories_from( run_dir, with_policies=False)



  0%|          | 0/1000 [00:00<?, ?it/s][A
 26%|██▌       | 258/1000 [00:00<00:00, 2563.01it/s][A


/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories/episode_821.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories/episode_585.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories/episode_701.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories/episode_311.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories/episode_92.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h

 56%|█████▋    | 563/1000 [00:00<00:00, 2803.68it/s][A
 79%|███████▉  | 788/1000 [00:00<00:00, 2616.26it/s][A
100%|██████████| 1000/1000 [00:00<00:00, 2549.14it/s][A

/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories/episode_618.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories/episode_972.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories/episode_686.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories/episode_709.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_mlp/trajectories/episode_696.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_


  0%|          | 0/1000 [00:00<?, ?it/s][A
 30%|███       | 302/1000 [00:00<00:00, 2999.94it/s][A

/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_rnn/trajectories/episode_821.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_rnn/trajectories/episode_585.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_rnn/trajectories/episode_701.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_rnn/trajectories/episode_311.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_rnn/trajectories/episode_92.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h


 64%|██████▎   | 635/1000 [00:00<00:00, 3130.78it/s][A
 94%|█████████▍| 945/1000 [00:00<00:00, 3121.59it/s][A
100%|██████████| 1000/1000 [00:00<00:00, 3126.65it/s][A

/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_rnn/trajectories/episode_957.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_rnn/trajectories/episode_291.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_rnn/trajectories/episode_814.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_rnn/trajectories/episode_738.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_h64_rnn/trajectories/episode_191.traj
/home/kevin/Development/git/Generalized-RL-Self-Play-Framework/experiment/experiment-Naive-TrajTest-CH1e3/run-0/menageries/NaiveSP-ppo_

In [122]:
# Sanity check: number of trajectory files loaded across all agent folders.
len(trajs['episode'])

2000

### Add those to the current basis trajectories:

In [123]:
# Number of rows already present before anything is merged. It has to be
# captured up front: sizing the padding after 'trajectory' has been extended
# would insert too many None entries and misalign the new column.
n_existing = len(trajectories['trajectory'])
for k, values in trajs.items():
    if k not in trajectories:
        # Column unknown to the basis data (e.g. 'episode'): pad the existing
        # rows with None so every column keeps the same length.
        trajectories[k] = [None] * n_existing
    trajectories[k].extend(values)

## 2) Encode trajectories

In [124]:
import copy
# Work on a deep copy so the exploration below cannot modify the collected data.
ts = copy.deepcopy(trajectories['trajectory'])
# Report the nesting sizes, measured on the first trajectory and its first step only.
print(f'Nbr traj: {len(ts)} // Steps per traj: {len(ts[0])} // Elements per steps: {len(ts[0][0])}')

Nbr traj: 2300 // Steps per traj: 10 // Elements per steps: 5


In [125]:
# Element 1 of the first step of the first trajectory (printed below).
a0 = ts[0][0][1]
print(a0)
# Last entry of the first item of element 0 of that same step (printed below).
s0 = ts[0][0][0][0][-1]
print(s0)

# NOTE(review): oh_a0 is not used in any visible cell -- presumably a leftover
# placeholder for a one-hot encoding of a0; confirm and remove.
oh_a0 = [ ]

[0, 0]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [126]:
# Earlier variants that used element 1 of each step (what `a0` reads above):
#actions = [ [step[1] for idx, step in enumerate(t) if idx<3]for t in ts]
#actions = [ [step[1] for idx, step in enumerate(t) if idx<10]for t in ts]

# Current encoding: for steps 1..9 of every trajectory (step 0 is skipped),
# take step[0][0][-1] -- the same element `s0` reads above, so despite its
# name `actions` does not hold step[1].
actions = [ [step[0][0][-1] for idx, step in enumerate(t) if idx<10 and idx>0]for t in ts]

In [127]:
# Convert the nested lists to an ndarray and display its shape.
actions = np.asarray(actions)
actions.shape

(2300, 9, 10)

In [128]:
# Features: an independent copy of the encoded trajectories.
x_actions = copy.deepcopy(actions)
# Labels: the 'agent' column, one entry per trajectory.
y_agents = np.asarray( copy.deepcopy(trajectories['agent']) )
# The leading dimensions of both arrays should match.
print(x_actions.shape, y_agents.shape)

(2300, 9, 10) (2300,)


In [129]:
import pickle

# NOTE: despite the `_dir` suffix these are file paths, not directories.
data_dir = './data'
x_actions_dir = './x_actions'
y_agents_dir = './y_agents'

# Each dump goes through a context manager so the file is flushed and closed
# deterministically instead of whenever the handle gets garbage-collected.
with open(data_dir, 'wb') as data_file:
    pickle.dump(trajectories, data_file)
with open(x_actions_dir, 'wb') as x_actions_file:
    pickle.dump(x_actions, x_actions_file)
with open(y_agents_dir, 'wb') as y_agents_file:
    pickle.dump(y_agents, y_agents_file)

In [130]:
"""
def encode_trajectory(data):
    traj = data['trajectory']
    actions = np.asarray( [ [step[1] for idx, step in enumerate(t) if idx<10]for t in traj] )
    agents = np.asarray( data['agent'])
    return actions, agents
"""

"\ndef encode_trajectory(data):\n    traj = data['trajectory']\n    actions = np.asarray( [ [step[1] for idx, step in enumerate(t) if idx<10]for t in traj] )\n    agents = np.asarray( data['agent'])\n    return actions, agents\n"

## 3) Create t-SNE

In [131]:
from sklearn.manifold import TSNE

In [132]:
# Dimensionality of the embedding space.
n_dims = 2

# Optional joint shuffle of samples and labels (disabled by default).
shuffle = False
if shuffle:
    p = np.random.permutation(len(x_actions))
    x_actions, y_agents = x_actions[p], y_agents[p]

# Flatten every trajectory into a single feature vector.
X_sample_flat = x_actions.reshape(len(x_actions), -1)

# One embedding is computed per perplexity, in this order.
perplexities = [5, 50, 100,200,300,500]

# Settings shared by every run; only the perplexity varies.
tsne_options = dict(
    n_components=n_dims,
    init='pca',
    random_state=17,
    verbose=2,
    learning_rate=300,
    n_iter=1000,
)

embeddings = []
for perplexity in perplexities:
    model = TSNE(perplexity=perplexity, **tsne_options)
    embeddings.append(model.fit_transform(X_sample_flat))

[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 2300 samples in 0.013s...
[t-SNE] Computed neighbors for 2300 samples in 0.898s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2300
[t-SNE] Computed conditional probabilities for sample 2000 / 2300
[t-SNE] Computed conditional probabilities for sample 2300 / 2300
[t-SNE] Mean sigma: 0.531085
[t-SNE] Computed conditional probabilities in 0.034s
[t-SNE] Iteration 50: error = 100.5936203, gradient norm = 0.1747399 (50 iterations in 1.038s)
[t-SNE] Iteration 100: error = 100.4479752, gradient norm = 0.1382473 (50 iterations in 1.856s)
[t-SNE] Iteration 150: error = 100.4284439, gradient norm = 0.1321299 (50 iterations in 1.258s)
[t-SNE] Iteration 200: error = 100.2719879, gradient norm = 0.1315120 (50 iterations in 1.112s)
[t-SNE] Iteration 250: error = 100.2776184, gradient norm = 0.1313664 (50 iterations in 1.364s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 100.277618
[t-SNE] Iteration 300

[t-SNE] Iteration 300: error = 1.9465537, gradient norm = 0.0006126 (50 iterations in 1.574s)
[t-SNE] Iteration 350: error = 1.9349537, gradient norm = 0.0001878 (50 iterations in 1.359s)
[t-SNE] Iteration 400: error = 1.9333839, gradient norm = 0.0000766 (50 iterations in 1.357s)
[t-SNE] Iteration 450: error = 1.9330760, gradient norm = 0.0000316 (50 iterations in 1.409s)
[t-SNE] Iteration 500: error = 1.9324441, gradient norm = 0.0000356 (50 iterations in 1.309s)
[t-SNE] Iteration 550: error = 1.9322882, gradient norm = 0.0000680 (50 iterations in 1.514s)
[t-SNE] Iteration 600: error = 1.9321176, gradient norm = 0.0000182 (50 iterations in 1.441s)
[t-SNE] Iteration 650: error = 1.9320822, gradient norm = 0.0000093 (50 iterations in 1.296s)
[t-SNE] Iteration 700: error = 1.9320524, gradient norm = 0.0000112 (50 iterations in 1.380s)
[t-SNE] Iteration 750: error = 1.9320898, gradient norm = 0.0000106 (50 iterations in 1.281s)
[t-SNE] Iteration 800: error = 1.9320713, gradient norm = 0.

In [133]:
# NOTE: despite the `_dir` suffix this is a file path, not a directory.
embeddings_dir = './embedding'
# The context manager closes (and therefore flushes) the file once the dump is
# done; the bare open() call used before never closed its handle.
with open(embeddings_dir, "wb") as embeddings_file:
    pickle.dump(embeddings, embeddings_file)

## 4) Plot the embeddings

In [140]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from matplotlib.ticker import NullFormatter
import seaborn as sns

# Class labels are the distinct values of the label array.
labels = np.unique(y_agents)
num_classes = len(labels)
y_sample = copy.deepcopy(y_agents)

# plot the 2D data points
labels.sort()
labels = list(reversed(labels))
print(labels)

# The first four labels (reverse-sorted order) are drawn on every figure; each
# remaining label gets its own figure on top of them. A slice is used so that
# fewer than four labels yields no figure instead of an IndexError.
always_on_labels = labels[:4]
agent_labels = list(set(labels) - set(always_on_labels))
per_agent_labels_list = [(agent_label, always_on_labels + [agent_label])
                         for agent_label in agent_labels]
print(agent_labels)

# Loop-invariant styling: one scatter colour per class and the colormap used
# for the density background.
colors = cm.Spectral(np.linspace(0, 1, num_classes))
s = np.linspace(0, 3, 10)[0]
cmap = sns.cubehelix_palette(start=s, light=1, as_cmap=True)

for idx, perplexity in enumerate(perplexities):
    # `idx` selects the embedding matching this perplexity. It must not be
    # reused by an inner loop, otherwise the next agent label would be plotted
    # against the wrong embedding.
    xx = embeddings[idx][:, 0]
    yy = embeddings[idx][:, 1]

    for agent_label, pa_labels_list in per_agent_labels_list:
        plot_dir = f'./plot-t-sne_{agent_label}_per={perplexity}'

        fig = plt.figure()
        ax = fig.add_subplot(111)

        # Draw the background: density of the highlighted agent's points.
        # NOTE(review): recent seaborn releases expect x=/y= keywords and
        # `fill` instead of `shade`; the call is kept as written for the
        # version this notebook was run with -- confirm before upgrading.
        highlighted = y_sample == agent_label
        sns.kdeplot(xx[highlighted], yy[highlighted], cmap=cmap, shade=True, cut=1, ax=ax)

        # Overlay one scatter series per displayed label.
        for color_idx, label in enumerate(pa_labels_list):
            selected = y_sample == label
            ax.scatter(xx[selected], yy[selected], color=colors[color_idx],
                       label=label, s=3, alpha=0.6)

        # Embedding coordinates carry no meaning, so tick labels are hidden.
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
        plt.axis('tight')
        plt.legend(loc='best', scatterpoints=1, fontsize=10)
        plt.savefig(plot_dir+'.pdf', format='pdf', dpi=1000)
        plt.savefig(plot_dir+'.png', format='png', dpi=1000)
        # Close the figure to release its memory before the next one.
        plt.close(fig)


['ScissorsAgent', 'RockAgent', 'PaperAgent', 'NaiveSP-ppo_h64_rnn', 'NaiveSP-ppo_h64_mlp']
['NaiveSP-ppo_h64_mlp']
