In [659]:
import csv
import pandas as pd
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import KBinsDiscretizer

from torch.nn import functional as F
import bisect

# Fix the torch RNG so batch sampling and weight initialisation are repeatable.
torch.manual_seed(1337)

# hyperparameters
batch_size = 16  # number of token windows sampled per batch (see get_batch)
block_size = 32  # tokens per window; also the size of the position-embedding table
max_iters = 5000  # referenced by the evaluation condition of the training loop
epochs = 5  # NOTE(review): not referenced by any visible cell
eval_interval = 100  # evaluate the loss every this many training iterations
learning_rate = 1e-3  # AdamW learning rate


# Layout of one trajectory step: 4 observation slots, then action, reward, reward-to-go.
st_dims = 4
ac_dims = 1
rw_dims = 1
rtg_dims= 1
traj_len = st_dims + ac_dims + rw_dims + rtg_dims
# Per-slot number of legal output classes: the model masks every class id >= pattern[slot].
pattern = [100, 100, 100, 100, 2, 1, 100]

eval_iters = 200  # batches averaged by estimate_loss
n_embd = 64  # embedding width
n_head = 4  # attention heads per block
n_layer = 4  # number of transformer blocks
dropout = 0.0  # dropout probability used in every Dropout layer
out_vocab = 100  # output classes per token; also the number of discretizer bins
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [660]:
# Load the recorded CartPole transitions; the path is relative to the working directory.
data = pd.read_csv("cartpole_play.csv")

In [661]:
# Give the positional CSV columns meaningful names and drop the saved index column.
# Written as one non-mutating chain so the cell can be re-run safely:
# errors='ignore' keeps drop() from raising once 'Unnamed: 0' is already gone.
rename = {'0':'episode', '1':'obs1', '2':'obs2', '3':'obs3', '4':'obs4', '5':'actions', '6':'rewards', '7':'rewardstg'}
data = data.rename(columns=rename).drop(columns='Unnamed: 0', errors='ignore')
data

Unnamed: 0,episode,obs1,obs2,obs3,obs4,actions,rewards,rewardstg
0,0.0,0.048681,0.233331,-0.044224,-0.267353,1.0,1.0,66.0
1,0.0,0.053347,0.038867,-0.049571,0.011060,0.0,1.0,65.0
2,0.0,0.054124,-0.155511,-0.049350,0.287700,0.0,1.0,64.0
3,0.0,0.051014,-0.349895,-0.043596,0.564419,0.0,1.0,63.0
4,0.0,0.044016,-0.154190,-0.032307,0.258327,1.0,1.0,62.0
...,...,...,...,...,...,...,...,...
36762,1099.0,0.145006,0.416619,-0.229412,-1.206810,1.0,1.0,4.0
36763,1099.0,0.145006,0.416619,-0.229412,-1.206810,1.0,1.0,3.0
36764,1099.0,0.145006,0.416619,-0.229412,-1.206810,0.0,1.0,2.0
36765,1099.0,0.145006,0.416619,-0.229412,-1.206810,0.0,1.0,1.0


In [662]:
# Split the frame into the four components of a trajectory step.
observations = data[['obs1', 'obs2', 'obs3', 'obs4']]
actions = data['actions']
# Shift rewards down by one so the value can be used directly as a token id
# (the rows displayed above show a reward of 1.0, which becomes token 0).
rewards = data['rewards']-1
rewardstg = data['rewardstg']

# Discretizer

Here's the deal: we need to discretize the continuous observations into a vocabulary of size ```vocab = 100```. Moreover, the rewards-to-go go up to 150, so we also need to shrink them to a vocabulary of size ```100```. For that, we use sklearn's KBinsDiscretizer, which discretizes an input into bins. However, once we discretize, the bins are identified by the numbers 0 to 99. If we used these directly as tokens, we would run into the problem that token 1 for a state and token 1 for an action are treated the same way by the input layer. We do not want that: we need to differentiate those tokens. The following class both discretizes the data and makes the tokens of each position distinct.

In [663]:
class DataDiscretizer:
    """Turn raw (observation, action, reward, reward-to-go) steps into tokens.

    Two token spaces are produced:

    * "similar" tokens: every slot of a step holds an id in ``[0, vocab)``
      (observations and reward-to-go are binned, actions and rewards are used
      as they are);
    * "different" tokens: the similar token plus ``slot * vocab``, so equal
      ids in different slots never collide in the input embedding.

    Every method works on numpy arrays except ``fit_transform``, which takes
    pandas objects.
    """

    def __init__(self, vocab, traj_len, encode='ordinal', strategy='uniform'):
        """
        Args:
            vocab: number of bins per discretized slot, and the distance
                between consecutive slots in the "different" token space.
            traj_len: number of slots in one step.
            encode: accepted for backward compatibility only; the encoders are
                always ordinal because bin ids are used as token ids.
            strategy: binning strategy forwarded to ``KBinsDiscretizer``.
        """
        self.vocab = vocab
        self.traj_len = traj_len
        self.ob_enc = KBinsDiscretizer(vocab, encode="ordinal", strategy=strategy)
        self.rtg_enc = KBinsDiscretizer(vocab, encode="ordinal", strategy=strategy)
        # Available right away so the offset helpers work before fit_transform.
        self.dif = self._slot_offsets()

    def _slot_offsets(self):
        """Offset added to each slot to move from similar to different tokens."""
        return np.arange(self.traj_len) * self.vocab

    def fit_transform(self, observations, actions, rewards, rewardstg):
        """Fit both encoders on the dataset and tokenise it.

        Args:
            observations: DataFrame with one column per observation dimension.
            actions, rewards, rewardstg: Series with one value per row.

        Returns:
            ``(different_tokens, similar_tokens)``: two flat int arrays with
            ``traj_len`` consecutive entries per input row.
        """
        self.dif = self._slot_offsets()
        ob_tok = self.ob_enc.fit_transform(observations.values)
        rtg_tok = self.rtg_enc.fit_transform(rewardstg.values.reshape(-1, 1))
        # Stack positionally (not by pandas index) so a filtered or re-indexed
        # frame cannot misalign the columns.
        sim_tok = np.column_stack([ob_tok, np.asarray(actions), np.asarray(rewards), rtg_tok])
        dif_tok = sim_tok + self.dif
        self.total_vocab = dif_tok.max()
        return dif_tok.reshape(-1).astype(int), sim_tok.reshape(-1).astype(int)

    def discretize_observation(self, obs):
        """Bin one raw observation vector with the fitted observation encoder."""
        return self.ob_enc.transform(obs.reshape(1, -1)).reshape(-1)

    def discretize_rtg(self, rtg):
        """Bin one raw reward-to-go value with the fitted reward-to-go encoder."""
        return self.rtg_enc.transform(rtg.reshape(1, -1)).reshape(-1)

    def discretize_trajectory(self, traj):
        """Tokenise one raw step laid out as observations, action, reward, reward-to-go.

        Returns:
            ``(similar_tokens, different_tokens)`` -- note that this is the
            opposite order to ``fit_transform``; callers rely on it.
        """
        obs = traj[:st_dims]
        anr = traj[st_dims:-1]  # action and reward are passed through unchanged
        rtg = traj[-1:]
        sim_obs_tok = self.discretize_observation(obs)
        sim_rtg_tok = self.discretize_rtg(rtg)
        sim_tok = np.concatenate([sim_obs_tok, anr, sim_rtg_tok])
        dif_tok = sim_tok + self.dif
        return sim_tok.astype(int), dif_tok.astype(int)

    def similar_to_different(self, tok, mod):
        """Add the offset of slot ``mod`` (taken modulo ``traj_len``) to ``tok``."""
        mod = mod%self.traj_len
        return tok+self.dif[mod]

    def different_to_similar(self, tok, mod):
        """Remove the offset of slot ``mod`` (taken modulo ``traj_len``) from ``tok``."""
        mod = mod%self.traj_len
        return tok-self.dif[mod]

    def get_raw(self, tok, mod):
        """Map a "different" token of slot ``mod`` back to a raw value.

        Actions and rewards are returned as their token id; reward-to-go is
        decoded through the fitted encoder.

        Raises:
            NotImplementedError: for observation slots, which have no decoder.
        """
        mod = mod%self.traj_len
        # Out-of-place subtraction: never modify an array owned by the caller.
        tok = tok - self.dif[mod]
        if mod < st_dims:
            raise NotImplementedError("decoding observation tokens is not supported")
        elif mod < st_dims + ac_dims + rw_dims:
            return tok
        else:
            return self.rtg_enc.inverse_transform(np.asarray(tok).reshape(1, -1)).reshape(-1)[0]

    def get_vocab_size(self):
        """Largest "different" token id seen by ``fit_transform`` (add 1 for a table size)."""
        return self.total_vocab
        

In [668]:
# Fit the discretizer on the whole dataset and tokenise it.
# `dif` holds the slot-offset tokens, `sim` the per-slot ids before the offset is added.
dd = DataDiscretizer(out_vocab, traj_len)
dif, sim = dd.fit_transform(observations, actions, rewards, rewardstg)
# get_vocab_size() returns the largest token id seen, hence the +1 for the embedding table size.
inp_vocab = int(dd.get_vocab_size())+1

In [669]:
data

Unnamed: 0,episode,obs1,obs2,obs3,obs4,actions,rewards,rewardstg
0,0.0,0.048681,0.233331,-0.044224,-0.267353,1.0,1.0,66.0
1,0.0,0.053347,0.038867,-0.049571,0.011060,0.0,1.0,65.0
2,0.0,0.054124,-0.155511,-0.049350,0.287700,0.0,1.0,64.0
3,0.0,0.051014,-0.349895,-0.043596,0.564419,0.0,1.0,63.0
4,0.0,0.044016,-0.154190,-0.032307,0.258327,1.0,1.0,62.0
...,...,...,...,...,...,...,...,...
36762,1099.0,0.145006,0.416619,-0.229412,-1.206810,1.0,1.0,4.0
36763,1099.0,0.145006,0.416619,-0.229412,-1.206810,1.0,1.0,3.0
36764,1099.0,0.145006,0.416619,-0.229412,-1.206810,0.0,1.0,2.0
36765,1099.0,0.145006,0.416619,-0.229412,-1.206810,0.0,1.0,1.0


In [670]:
# Convert both token streams to long tensors: `dif` is indexed for model inputs, `sim` for targets.
dif = torch.tensor(dif, dtype=torch.long)
sim = torch.tensor(sim, dtype=torch.long)

# Transformer architecture

In [671]:
def get_batch():
    """Sample a random batch of training windows from the token streams.

    The meaning of a token depends on its slot inside a step: slots 0-3 are
    the state, 4 the action, 5 the reward and 6 the reward-to-go.  The slot
    of the first token of each window is therefore returned as well.

    Returns:
        inputs:  (batch_size, block_size) windows of `dif`, moved to `device`.
        targets: the same windows of `sim`, shifted one token ahead, moved to `device`.
        slots:   (batch_size,) start offset of each window modulo `traj_len`.
    """
    starts = torch.randint(len(dif) - block_size, (batch_size,))
    inputs = torch.stack([dif[s:s + block_size] for s in starts]).to(device)
    targets = torch.stack([sim[s + 1:s + block_size + 1] for s in starts]).to(device)
    return inputs, targets, starts % traj_len

@torch.no_grad()
def estimate_loss():
    """Return the mean loss of `tt` over `eval_iters` freshly sampled batches.

    The model is switched to eval mode for the measurement and back to train
    mode before returning.
    """
    tt.eval()
    batch_losses = torch.zeros(eval_iters)
    for step in range(eval_iters):
        inputs, targets, slots = get_batch()
        _, batch_loss = tt(inputs, slots, targets=targets)
        batch_losses[step] = batch_loss.item()
    tt.train()
    return batch_losses.mean()

In [672]:
class Head(nn.Module):
    """A single causal self-attention head."""

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding space into this head's space.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix used as the causal mask.  It is a registered
        # buffer, so its shape is part of saved checkpoints.
        causal = torch.tril(torch.ones(3*block_size, 3*block_size))
        self.register_buffer('tril', causal)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over `x` of shape (B, T, n_embd) and return (B, T, head_size)."""
        _, seq_len, width = x.shape
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)
        # NOTE(review): the scores are scaled by the input width, not by the
        # head size.  Kept as is because trained weights depend on this scale.
        scores = queries @ keys.transpose(-2, -1) * width**-0.5   # (B, T, T)
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))
        # weighted sum of the values: (B, T, T) @ (B, T, head_size)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then concatenated and projected."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to `x`, join their outputs on the last axis and project."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to four times the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the MLP; the output has the same shape as the input."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each applied to a
    layer-normalised input and added back through a residual connection."""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding width, n_head: number of attention heads
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))


The only difference between the following transformer and the vanilla transformer is that it has a `generate_mask` function, which masks out illegal actions or rewards.

In [673]:
class TrajectoryTransformer(nn.Module):
    """Decoder-only transformer over tokenised trajectories.

    Inputs are slot-offset token ids (embedding table of size `inp_vocab`);
    outputs are logits over `out_vocab` per-slot classes.  Classes that are
    not legal for the slot being predicted are set to -inf (see
    `generate_mask`).
    """

    def __init__(self):
        super().__init__()
        self.inp_embedding = nn.Embedding(inp_vocab, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, out_vocab)

    def forward(self, sequence, mods, targets=None):
        """Compute masked next-token logits and, optionally, the loss.

        Args:
            sequence: (B, T) long tensor of input token ids, T <= block_size.
            mods: for each sequence, the slot index of its first token
                (tensor or list with B entries).
            targets: optional (B, T) long tensor of per-slot class ids.

        Returns:
            ``(preds, loss)``.  Without targets, `preds` is (B, T, out_vocab)
            and `loss` is None; with targets, `preds` is flattened to
            (B*T, out_vocab) and `loss` is the cross-entropy.
        """
        vocab_embedding = self.inp_embedding(sequence) #(B,T,C)
        B, T, C = vocab_embedding.shape
        # Build positions on the input's device rather than on a global one.
        positions = torch.arange(T, device=sequence.device)
        x = vocab_embedding + self.position_embedding_table(positions) #(B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x)

        preds = self.lm_head(x) # (B,T,out_vocab)
        # Make every class that is illegal for the predicted slot impossible.
        # The mask must live on the same device as the logits.
        mask = self.generate_mask(mods, T, device=preds.device)
        preds = preds.masked_fill(mask, float('-inf'))

        if targets is None:
            loss = None
        else:
            B, T, C = preds.shape
            preds = preds.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(preds, targets)

        return preds, loss

    def generate_mask(self, mods, T, device=None):
        """Boolean mask of illegal classes, True where a logit must be removed.

        Position ``i`` of a sequence whose first token sits in slot ``mod``
        predicts the token of slot ``(i + mod + 1) % traj_len``; every class id
        ``>= pattern[slot]`` is illegal there.

        Args:
            mods: first-token slot of every sequence (tensor or list, B entries).
            T: sequence length.
            device: device for the mask; defaults to torch's default device.

        Returns:
            Bool tensor of shape (B, T, out_vocab).
        """
        class_ids = torch.arange(out_vocab, device=device)
        device = class_ids.device
        slot_limits = torch.as_tensor(pattern, device=device)                       # (traj_len,)
        first_slot = torch.as_tensor(mods, dtype=torch.long, device=device).reshape(-1, 1)  # (B, 1)
        target_slot = (first_slot + torch.arange(T, device=device) + 1) % traj_len  # (B, T)
        return class_ids >= slot_limits[target_slot].unsqueeze(-1)                  # (B, T, out_vocab)

In [674]:
# Instantiate the model, move it to the selected device and create its AdamW optimizer.
tt = TrajectoryTransformer()
tt.to(device)
optimizer = torch.optim.AdamW(tt.parameters(), lr=learning_rate)

In [675]:
# Smoke test: one forward pass with targets on a random batch.
# tt =TrajectoryTransformer()
x, y, mod = get_batch()
preds, _ = tt(x, mod, y)

In [681]:
# NOTE(review): both loops currently run a single iteration; widen the ranges
# (e.g. to `epochs` / `max_iters`) to actually train.
for _ in range(1):
    for iteration in range(1):
        # every once in a while evaluate the loss on freshly sampled training batches
        # (the last-iteration check must use `iteration`, not the builtin `iter`)
        if iteration % eval_interval == 0 or iteration == max_iters - 1:
            losses = estimate_loss()
            print(f"step {iteration}: train loss {losses:.4f}")

        # sample a batch of data
        x, y, mods = get_batch()

        # evaluate the loss and take one optimizer step
        logits, loss = tt(x, mods,targets=y)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

step 0: train loss 0.5497


In [679]:
# Restore previously trained weights.
# NOTE(review): `state_dict` is not defined above this cell; in the visible notebook it is
# only created further down by `torch.load(save_path)`, so a fresh top-to-bottom run fails
# here -- confirm the intended cell order.
tt.load_state_dict(state_dict)

<All keys matched successfully>

I won't remove the prints because they are extremely useful for debugging.

In [655]:
import torch
import numpy as np

def beam_search(model, context, st_width=3, ac_width=2):
    """Extend `context` by one full step (`traj_len` tokens) with beam search.

    The context must be a token trajectory that starts at the beginning of a
    step (slot 0) and contains whole steps only.

    Args:
        model: callable ``model(inp, mods=...)`` returning ``(logits, loss)``.
        context: 1-D integer array of slot-offset tokens.
        st_width: candidates kept per beam, and beams kept, for observation slots.
        ac_width: candidates expanded per beam for action slots.

    Returns:
        ``(contexts, log_probs, rewards)``, all ordered by ascending predicted
        reward as computed by `get_reward_idxs`.
    """
    window_by_index = [st_width] * st_dims + [ac_width] * ac_dims + \
                                        [1]*rw_dims + [1]*rtg_dims
    top_context = np.array([context])
    top_prob = np.array([0])

    for i in range(traj_len):
        sqr_context = []
        sqr_probs = []
        k = window_by_index[i%traj_len]

        for beam, beam_logp in zip(top_context, top_prob):
            # Only the last `block_size` tokens are fed to the model, so the
            # slot of the first fed token depends on how many were cropped.
            first_fed = max(len(beam) - block_size, 0)
            mod = first_fed % traj_len
            inp = torch.tensor(beam[-block_size:], device=device).unsqueeze(0)
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                logits, _ = model(inp, mods=torch.tensor(mod).reshape(1, 1))
            logp = torch.log_softmax(logits[0, -1, :], dim=0).detach().cpu().numpy()

            # k most likely classes for the next token (unordered)
            best = np.argpartition(logp, -k)[-k:]
            sqr_probs.extend(logp[best] + beam_logp)

            next_tokens = [dd.similar_to_different(tk, i) for tk in best]
            sqr_context.extend([np.append(beam, tok) for tok in next_tokens])

        # Observation slots are pruned to the k best beams; for the remaining
        # slots every candidate is kept, sorted by log-probability.
        order = np.argsort(sqr_probs)
        if i%traj_len < st_dims:
            order = order[-k:]

        top_context = np.array(sqr_context)[order]
        top_prob = np.array(sqr_probs)[order]

    rw_idx, rewards = get_reward_idxs(top_context)

    return top_context[rw_idx], top_prob[rw_idx], rewards

def extract_action(trajectory):
    """Return the action token of every step in a flat token trajectory.

    The trajectory must start at slot 0; the action is the token right after
    the `st_dims` observation tokens of each `traj_len`-long step.
    """
    return trajectory[st_dims::traj_len]

def get_reward_idxs(trajectories):
    """Rank trajectories by decoded reward plus reward-to-go of their last step.

    Returns:
        ``(order, scores)``: the indices that sort the trajectories by
        ascending score, and the scores in that same ascending order.
    """
    scores = []
    for traj in trajectories:
        last_reward = dd.get_raw(traj[-2], 5)  # slot 5 holds the reward
        last_rtg = dd.get_raw(traj[-1], 6)     # slot 6 holds the reward-to-go
        scores.append(last_reward + last_rtg)
    return np.argsort(scores), np.sort(scores)
    
    

In [656]:
def meta_beam_search():
    """Unfinished stub: its intended behaviour is not specified in this notebook."""
    raise NotImplementedError("meta_beam_search has not been implemented yet")

In [646]:
# Run one beam-search expansion from `context`.
# NOTE(review): this rebinds the notebook-level name `rewards`, which the data-preparation
# cells also use for the reward column; re-running the tokenisation after this cell would
# pick up the beam-search rewards instead.
top_beams_context,  top_beams_prob, rewards= beam_search(tt, context)

0 (1, 7) (1,)
1 (3, 8) (3,)
2 (3, 9) (3,)
3 (3, 10) (3,)
4 (3, 11) (3,)
5 (6, 12) (6,)
6 (6, 13) (6,)


# Inference

In [652]:
def warm_up_step(desired_reward=None):
    """Take one fixed action (0) in `env` and return that step as context tokens.

    Args:
        desired_reward: reward-to-go to condition on.  When omitted, a
            notebook-level `desired_reward` is used if one exists (the
            original behaviour); otherwise 120, the target used by `testing`.

    Returns:
        The second value returned by `dd.discretize_trajectory` for the step,
        i.e. its slot-offset tokens.
    """
    if desired_reward is None:
        # Fall back gracefully instead of raising NameError on a fresh kernel.
        desired_reward = globals().get('desired_reward', 120)

    action = 0
    state, reward, _terminated, _truncated, _info = env.step(action)
    to_encode = concat_traj(state, action, reward, desired_reward)
    _, context = dd.discretize_trajectory(to_encode)

    return context

def concat_traj(state, action, reward, desired_reward):
    """Flatten one step into a 1-D array: state, action, reward, reward-to-go."""
    pieces = [state.reshape(-1)]
    pieces += [np.array(value).reshape(-1) for value in (action, reward, desired_reward)]
    return np.concatenate(pieces)

In [657]:
import gymnasium as gym

# Cap episodes at 150 steps through gym.make's `max_episode_steps` argument
# instead of assigning the private `_max_episode_steps` attribute on the returned wrapper.
env = gym.make('CartPole-v1', max_episode_steps=150)

def testing(model):
    total_reward = 0.0
    desired_reward = 120

    _= env.reset(seed=None)
    terminated = False
    time = 0
    context = warm_up_step()
 
    while not terminated and time <= 500:
        context_lar, probs, rews = beam_search(tt, context)
        predicted_traj = context_lar[-1][-traj_len:]
        
        action_tok = predicted_traj[4]
        action = dd.different_to_similar(action_tok, 4)
        
        state, reward, terminated, _, _ = env.step(action)
        total_reward += reward
        desired_reward -= reward
        
        to_encode = concat_traj(state, action, reward, desired_reward)
        _, context = dd.discretize_trajectory(to_encode)
        
#         print('+++++', context_lar)
#         print('-----', context, '\n')
        
        time += 1

    print("total_reward = {}".format(total_reward))
    env.close()

In [658]:
# Evaluate the agent over 20 episodes; each call prints the total reward of its episode.
for i in range(20):
    testing(tt)

total_reward = 12.0
total_reward = 26.0
total_reward = 22.0
total_reward = 62.0
total_reward = 99.0
total_reward = 24.0
total_reward = 43.0
total_reward = 117.0
total_reward = 38.0
total_reward = 24.0
total_reward = 47.0
total_reward = 30.0
total_reward = 34.0
total_reward = 45.0
total_reward = 71.0
total_reward = 49.0
total_reward = 39.0
total_reward = 24.0
total_reward = 16.0
total_reward = 43.0


In [628]:
rt = TrajectoryTransformer()

# Experimenting maybe we need it later

In [None]:
import gymnasium as gym

In [None]:
def to_tokens(traj):
    """Tokenise one raw 7-value step: 4 observations, action, reward, reward-to-go.

    NOTE(review): `kbd_observations`, `kbd_rewardstg` and `prevs` are not
    defined in the visible part of the notebook -- presumably fitted encoders
    and per-slot offsets from an earlier version; confirm before use.
    """
    obs_bins = kbd_observations.transform(np.array(traj[0:4]).reshape(1, -1)).reshape(-1)
    rtg_bins = kbd_rewardstg.transform(np.array(traj[6]).reshape(1, -1)).reshape(-1)
    pieces = [
        obs_bins,
        np.array(traj[4]).reshape(-1),  # action, passed through unchanged
        np.array(traj[5]).reshape(-1),  # reward, passed through unchanged
        rtg_bins,
    ]
    tokenized = np.concatenate(pieces)
    tokenized += prevs
    return tokenized.astype(int)

In [None]:
example = np.array([0.001888, 0.033522, -0.041096, -0.040241, 1.0, 1.0, 21.0])
to_tokens(example)

In [None]:
prevs

In [None]:
# testing() takes only the model; it builds the episode context itself via warm_up_step().
testing(tt)

In [None]:
test = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]

test[1:3:5]

In [None]:
import numpy as np

input_array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])

result = input_array[np.arange(0, len(input_array), 7)].reshape(-1, 4)

In [None]:
mod = 0
st_mask = [(x+mod)%7 < 4 for x in range(11)]
ac_mask = [(x+mod)%7 == 4 for x in range(33)]
rw_mask = [(x+mod)%7 == 5 for x in range(33)]
rtg_mask= [(x+mod)%7 == 6 for x in range(33)]

test = np.array(range(11))
print(test[st_mask])

In [None]:
ll = nn.Linear(5, 2)
inp = torch.tensor(np.random.rand(11, 5), dtype=torch.float)
out = ll(inp)
print(out.shape)
print(out)
out[st_mask] = torch.zeros(st_mask.shape)

In [None]:
mask

In [None]:
import numpy as np

mods = np.array([0, 1, 2])  # Replace with your desired tensor of mods

length = 14  # Length of the masks

st_mask = torch.stack([(torch.arange(length) + mod) % 7 < 4 for mod in mods])
ac_mask = torch.stack([(torch.arange(length) + mod) % 7 == 4 for mod in mods])
rw_mask = torch.stack([(torch.arange(length) + mod) % 7 == 5 for mod in mods])
rtg_mask = torch.stack([(torch.arange(length) + mod) % 7 == 6 for mod in mods])

test = torch.tensor(np.random.rand(3, 14, 5), dtype=torch.float)
# print(test)
print(test.shape, st_mask.shape)
# print(test[rtg_mask].reshape(3, -1, 5))
# print(test[rtg_mask].shape)

test[rtg_mask] = torch.zeros((6, 5), dtype=torch.float)

In [None]:
print(test)

In [None]:
out[st_mask] = torch.zeros(st_mask.shape)

In [106]:
# Load a saved checkpoint into the model.
# NOTE(review): `save_path` is not defined in the visible part of the notebook -- confirm where it is set.
state_dict = torch.load(save_path)
tt.load_state_dict(state_dict)

<All keys matched successfully>

In [108]:
context = dif.view(-1, traj_len)[100].unsqueeze(0)

In [113]:
context - dd.dif


tensor([[48, 34, 64, 66,  1,  0,  6]])

In [114]:
preds, _ = tt(context, [0])
print(preds.shape)
toks = torch.argmax(preds, dim=2)

torch.Size([1, 7, 100])


In [115]:
toks

tensor([[41, 67, 68,  1,  0,  4, 47]])

In [455]:
dd = DataDiscretizer(100, 7)
dif, sim = dd.fit_transform(observations, actions, rewards, rewardstg)

In [456]:
t = dd.get_raw(642, 6)
print(t)

63.324999999999996


In [492]:
context = dif.reshape(-1,traj_len)[5].detach().cpu().numpy()

In [493]:
context

array([ 56, 149, 245, 350, 401, 500, 640])

In [495]:
print(top_beams_context, top_beams_prob)

[[ 56 149 245 350 401 500 640  56 145 245 355 401 500 640]
 [ 56 149 245 350 401 500 640  56 145 245 354 401 500 640]
 [ 56 149 245 350 401 500 640  56 153 245 345 400 500 640]
 [ 56 149 245 350 401 500 640  56 145 245 354 400 500 640]
 [ 56 149 245 350 401 500 640  56 145 245 355 400 500 640]
 [ 56 149 245 350 401 500 640  56 153 245 345 401 500 640]] [-16.75159   -14.50677    -8.491723   -2.0663614  -2.05692    -1.7910172]


In [496]:
rewards

[60.345, 60.345, 60.345, 60.345, 60.345, 60.345]

In [515]:
# Set the notebook-level target return that warm_up_step() picks up, reset the
# environment and build the first context.
desired_reward = 120
env.reset()
warm_up_step()

[-3.06963874e-03 -2.14497119e-01  1.95431262e-02  2.52352864e-01
  0.00000000e+00  1.00000000e+00  1.20000000e+02]
[-0.00306964 -0.21449712  0.01954313  0.25235286] [0. 1.] [120.]


array([ 53, 144, 254, 354, 400, 501, 680])