Load data

In [1]:
import pandas as pd
import json

df = pd.read_csv("../../results/data/lamarl_data/PPrgb_15.csv", index_col=0)
df = df.fillna('')
df["lang"] = df["lang"].apply(lambda x: x.split(" "))
df["obs"] = df["obs"].apply(json.loads)
df

Unnamed: 0,obs,lang
0,"[0.7857142857142857, 0.21428571428571427, 0.0,...",[]
1,"[0.5, 0.9285714285714286, 0.0, 0.0, 0.0, 0.0, ...",[]
2,"[0.2857142857142857, 0.6428571428571429, 0.0, ...",[]
3,"[0.14285714285714285, 0.8571428571428571, 0.0,...",[]
4,"[0.0, 0.7857142857142857, 0.0, 0.0, 0.0, 0.0, ...",[]
...,...,...
3999995,"[0.5714285714285714, 0.6428571428571429, 0.0, ...",[]
3999996,"[0.5714285714285714, 0.5, 0.0, 0.0, 0.0, 0.0, ...",[]
3999997,"[0.5714285714285714, 0.9285714285714286, 0.0, ...","[Prey, East]"
3999998,"[0.7142857142857143, 0.07142857142857142, 0.0,...",[]


In [2]:
df["obs"][0]

[0.7857142857142857,
 0.21428571428571427,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

Make model

In [11]:
import torch
import numpy as np
from torch import nn

from src.algo.language.lm import GRUEncoder, GRUDecoder, OneHotEncoder
from src.algo.nn_modules.mlp import MLPNetwork
    

class LanguageEncoder(nn.Module):

    """ 
    Class to manage and train the language modules: the Language Encoder, the 
    Observation Encoder and the Decoder. 
    """

    def __init__(self, input_dim, context_dim, hidden_dim, embed_dim, policy_layer_N, 
                 lr, vocab, max_message_len, diff=False, device="cuda:0"):
        super(LanguageEncoder, self).__init__()
        self.device = device

        self.word_encoder = OneHotEncoder(vocab, max_message_len)

        self.lang_encoder = GRUEncoder(
            context_dim, 
            hidden_dim, 
            embed_dim, 
            self.word_encoder,
            device=device)

        self.obs_encoder = MLPNetwork(
            input_dim, context_dim, hidden_dim, policy_layer_N)

        self.clip_loss = nn.CrossEntropyLoss()
        # self.captioning_loss = nn.NLLLoss()

        self.optim = torch.optim.Adam( 
            self.parameters(),
            # list(self.lang_encoder.parameters()) +
            # list(self.obs_encoder.parameters()) +
            # list(self.decoder.parameters()),
            lr=lr)

    def prep_rollout(self, device):
        self.device = device
        self.eval()
        self.to(self.device)
        self.lang_encoder.device = self.device

    def prep_training(self, device):
        self.device = device
        self.train()
        self.to(self.device)
        self.lang_encoder.device = self.device
    
    def store(self, obs, sent):
        self.buffer.store(obs, sent)

    def encode_sentences(self, sentence_batch):
        """ 
        Encode a batch of sentences. 
        :param sentence_batch (list(list(int))): Batch of enoded sentences.

        :return context_batch (torch.Tensor): Batch of context vectors, 
            dim=(batch_size, context_dim).
        """
        context_batch = self.lang_encoder(sentence_batch).squeeze(0)
        return context_batch

    def get_save_dict(self):
        save_dict = {
            "lang_encoder": self.lang_encoder.state_dict(),
            "obs_encoder": self.obs_encoder.state_dict()}
        return save_dict

    def load_params(self, save_dict):
        self.lang_encoder.load_state_dict(save_dict["lang_encoder"])
        self.obs_encoder.load_state_dict(save_dict["obs_encoder"])

In [14]:
obs_dim = 77
context_dim = 2
hidden_dim = 64
embed_dim = 4
n_layer = 2
lr = 0.007
vocab = ["Prey", "Center", "North", "South", "East", "West",
                        "Gem", "Yellow", "Green", "Purple"]
max_mess_len = 6

ll = LanguageEncoder(obs_dim, context_dim, hidden_dim, embed_dim, n_layer, lr, vocab, max_mess_len)

In [5]:
df["lang"] = ll.word_encoder.encode_batch(list(df["lang"]))
df

Unnamed: 0,obs,lang
0,"[0.7857142857142857, 0.21428571428571427, 0.0,...",[1]
1,"[0.5, 0.9285714285714286, 0.0, 0.0, 0.0, 0.0, ...",[1]
2,"[0.2857142857142857, 0.6428571428571429, 0.0, ...",[1]
3,"[0.14285714285714285, 0.8571428571428571, 0.0,...",[1]
4,"[0.0, 0.7857142857142857, 0.0, 0.0, 0.0, 0.0, ...",[1]
...,...,...
3999995,"[0.5714285714285714, 0.6428571428571429, 0.0, ...",[1]
3999996,"[0.5714285714285714, 0.5, 0.0, 0.0, 0.0, 0.0, ...",[1]
3999997,"[0.5714285714285714, 0.9285714285714286, 0.0, ...","[2, 6, 1]"
3999998,"[0.7142857142857143, 0.07142857142857142, 0.0,...",[1]


Train

In [22]:
def clip(ll, enc_obs, enc_lang, temp, device):
    # Compute similarity
    norm_enc_obs = enc_obs / enc_obs.norm(
        dim=1, keepdim=True)
    norm_enc_lang = enc_lang / enc_lang.norm(
        dim=1, keepdim=True)
    sim = norm_enc_obs @ norm_enc_lang.t() * temp
    mean_sim = sim.diag().mean()

    print(enc_obs)
    print(norm_enc_obs)
    print(sim.diag())
    print(mean_sim)

    # Compute CLIP loss
    labels = torch.arange(enc_obs.shape[0]).to(device)
    loss_o = ll.clip_loss(sim, labels)
    loss_l = ll.clip_loss(sim.t(), labels)
    clip_loss = (loss_o + loss_l) / 2

    return clip_loss, mean_sim.item()

def train_lang_encoder(data, ll, n_epochs=50000, batch_size=512, temp=0.07, eval_every=1000,
                       device="cuda:0"):
    train_data = data.iloc[:int(len(df) * 0.9)]
    eval_data = data.iloc[int(len(df) * 0.9):]

    clip_train_losses = []
    clip_eval_losses = []
    eval_sims = []

    ll.prep_training(device)

    for e_i in range(n_epochs):
        sample = train_data.sample(n=batch_size)
        
        obs_batch = torch.Tensor(np.array(list(sample["obs"]))).to(device)
        lang_batch = list(sample["lang"])
    
        enc_obs = ll.obs_encoder(obs_batch)
        enc_lang = ll.encode_sentences(lang_batch)

        clip_loss, mean_sim = clip(ll, enc_obs, enc_lang, temp, device)
        print(clip_loss, mean_sim)
        return
    
train_lang_encoder(df, ll)

tensor([[ 1.0000, -1.0000],
        [ 1.0000, -1.0000],
        [-1.0000,  1.0000],
        ...,
        [-0.9856,  0.9856],
        [-1.0000,  1.0000],
        [ 1.0000, -1.0000]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward0>)
tensor([[ 0.7071, -0.7071],
        [ 0.7071, -0.7071],
        [-0.7071,  0.7071],
        ...,
        [-0.7071,  0.7071],
        [-0.7071,  0.7071],
        [ 0.7071, -0.7071]], device='cuda:0', grad_fn=<DivBackward0>)
tensor([-0.0700, -0.0700,  0.0700,  0.0700,  0.0700,  0.0700,  0.0700,  0.0700,
         0.0700,  0.0700,  0.0700,  0.0700, -0.0700, -0.0700,  0.0700, -0.0700,
         0.0700,  0.0700, -0.0700,  0.0700,  0.0700, -0.0700,  0.0700, -0.0700,
         0.0700,  0.0700,  0.0700,  0.0700, -0.0700,  0.0700,  0.0700, -0.0700,
         0.0700, -0.0700,  0.0700,  0.0700,  0.0700, -0.0700,  0.0700, -0.0700,
         0.0700,  0.0700,  0.0700,  0.0700,  0.0700,  0.0700, -0.0700,  0.0700,
         0.0700,  0.0700,  0.0700,  0.0700,  0.0700, -0