# Representational similarity analysis



### NOTE: This is a work in progress; actual encoding comparisons will be added here once the models are trained.

The approach and code are taken from: https://arxiv.org/pdf/1905.06401

Kriegeskorte et al. (2008) present RSA as a variant of pattern-information analysis, to be applied for understanding neural activation patterns in human brains, for example syntactic computations (Tyler et al., 2013) or sensory cortical processing (Yamins and DiCarlo, 2016). The core idea is to find connections between data from neuroimaging, behavioral experiments and computational modeling by correlating representations of stimuli in each of these representation spaces via their pairwise (dis)similarities. RSA has also been used for measuring similarities between neural-network representation spaces.

Basic RSA measures correlation between similarities in two different representations globally, i.e. how close they are in their totality. In contrast, diagnostic models answer a more specific question: to what extent a particular type of information can be extracted from a given representation. For example, while for a particular neural encoding of sentences it may be possible to predict the length of the sentence with high accuracy, the RSA between this representation and the strings represented only by their length may be relatively small in magnitude, since the neural representation may be encoding many other aspects of the input in addition to its length.


The scores according to RSA in some cases show a different picture. This is expected, as RSA answers a substantially different question than the other two approaches (the diagnostic model and RSA_regress): it looks at how the whole representations match in their similarity structure, whereas both the diagnostic model and RSA_regress focus on the part of the representation that encodes the target information most strongly.

In [7]:
def get_dummy_encodings(
    rows_test: int = 20, rows_ref: int = 10, cols: int = 300
) -> tuple[dict, dict]:
    """Build two independent sets of random sentence encodings.

    Each set is a dict holding a ``"test"`` tensor of shape
    ``(rows_test, cols)`` and a ``"ref"`` tensor of shape
    ``(rows_ref, cols)``, both drawn with ``torch.randn``.

    rows_test: number of sentences in the test
    rows_ref: number of sentences in the reference
    cols: size of sentence embedding
    """

    def sample() -> dict:
        # "test" is drawn before "ref" so the RNG is consumed in a fixed order.
        drawn = {}
        for split, n_rows in (("test", rows_test), ("ref", rows_ref)):
            drawn[split] = torch.randn(n_rows, cols)
        return drawn

    first = sample()
    second = sample()
    return first, second

In [8]:
# Smoke test: run rsa_report on two independently sampled sets of random
# encodings produced by get_dummy_encodings (default sizes).
enc1, enc2 = get_dummy_encodings()
rsa_report(enc1, enc2)

  x, y = torch.tensor(x), torch.tensor(y)


{'rsa': -0.04330206289887428,
 'rsa_regress': {'mse': {'mean': 0.0033659918466582895,
   'std': 0.0007196518294062319,
   'alpha': 10},
  'r2': {'mean': -41.69127594072556, 'std': 45.69369499187717, 'alpha': 10},
  'pearson_r': {'mean': -0.03999999791383733,
   'std': 0.2497999276668854,
   'alpha': 0.01}}}

In [5]:
# Code adapted from: https://github.com/gchrupala/correlating-neural-and-symbolic-representations-of-language/blob/master/rsa/report.py


def run_rsa():
    """Compute RSA reports for every stored EWT encoding and dump them to JSON.

    Loads ``data/out/ewt_embed.pt`` (calling ``S.ewt_embed()`` first when the
    file is missing), runs ``RSA_report(data_tk, encodings)`` on the
    bag-of-words, BERT (12 layers), BERT-24 (24 layers) and Infersent
    encodings, and writes the nested result, keyed by ``alpha``, to
    ``report/RSA_natural.json``.

    NOTE(review): ``alpha``, ``data_tk``, ``S`` and ``RSA_report`` are not
    defined inside this function; they are assumed to exist at module level.
    Confirm before running (the notebook elsewhere calls a lowercase
    ``rsa_report``).
    """
    # Loading of the sentence/tree data is currently disabled:
    #   try:
    #       data_sent = json.load(open("data/out/ewt.json"))
    #   except FileNotFoundError:
    #       S.ewt_json()
    #       data_sent = json.load(open("data/out/ewt.json"))
    embed_path = "data/out/ewt_embed.pt"
    try:
        data = torch.load(embed_path)
    except FileNotFoundError:
        # Embeddings have not been computed yet: build them, then retry.
        S.ewt_embed()
        data = torch.load(embed_path)

    report = dict(
        bow=RSA_report(
            data_tk, dict(test=data["bow"]["test"], ref=data["bow"]["ref"])
        ),
        bert=dict(random={}, trained={}),
        bert24=dict(random={}, trained={}),
        infersent=dict(random={}, trained={}),
    )
    result = {alpha: report}

    # (key in ``data``, number of layers to report on)
    bert_models = (("bert", 12), ("bert24", 24))

    for mode in ["random", "trained"]:
        for step in ["first", "last"]:
            for model, n_layers in bert_models:
                per_layer = report[model][mode][step] = {}
                for layer in range(n_layers):
                    logging.info(
                        "Computing RSA/RSA_regress scores for {} {} {}".format(
                            mode, step, layer
                        )
                    )
                    data_enc = dict(
                        test=data[model]["test"][mode][layer][step],
                        ref=data[model]["ref"][mode][layer][step],
                    )
                    per_layer[layer] = RSA_report(data_tk, data_enc)

        report["infersent"][mode] = RSA_report(
            data_tk,
            dict(
                test=data["infersent"]["test"][mode], ref=data["infersent"]["ref"][mode]
            ),
        )

    # Context manager guarantees the report is flushed and the handle closed.
    with open("report/RSA_natural.json", "w") as out:
        json.dump(result, out, indent=2)

In [6]:
# Code taken from: https://github.com/gchrupala/correlating-neural-and-symbolic-representations-of-language/blob/master/rsa/synsem.py


def ewt_embed():
    """Compute BoW, BERT and Infersent embeddings for the EWT data and save to file.

    Reads ``data/out/ewt.json``, which must provide ``"test"`` and ``"ref"``
    lists of records with a ``"sent"`` field, and writes a nested dict to
    ``data/out/ewt_embed.pt``:

    * ``emb["bow"][split]``: float tensor from a ``CountVectorizer`` fitted on
      the sentences of both splits (whitespace tokenization);
    * ``emb["infersent"][split][mode]``: output of ``Pre.encode_infersent``;
    * ``emb[model][split][mode]["pooled"]`` and
      ``emb[model][split][mode][layer][pooling]`` for ``model`` in
      ``"bert"`` / ``"bert24"`` and ``pooling`` in ``"summed"`` / ``"first"`` /
      ``"last"``.

    ``split`` is ``"test"`` or ``"ref"``; ``mode`` is ``"random"``
    (``trained=False``) or ``"trained"`` (``trained=True``).
    """
    import rsa.pretrained as Pre
    from sklearn.feature_extraction.text import CountVectorizer

    def container():
        return dict(
            test=dict(random=dict(), trained=dict()),
            ref=dict(random=dict(), trained=dict()),
        )

    def store_bert(target, rep):
        """Fill *target* with the pooled output and per-layer poolings of *rep*.

        *rep* is a non-empty list of ``(layers, pooled)`` pairs as produced by
        ``Pre.encode_bert``.
        """
        target["pooled"] = torch.cat([pooled for _, pooled in rep])
        n_layers = len(rep[0][0])
        for i in range(n_layers):
            # Each layers[i] is indexed as a 3-D tensor; dim 1 is presumably
            # the token axis (TODO confirm against Pre.encode_bert).
            target[i] = {
                "summed": torch.cat(
                    [layers[i].sum(dim=1) for layers, _ in rep], dim=0
                ),
                "first": torch.cat(
                    [layers[i][:, 0, :] for layers, _ in rep], dim=0
                ),
                "last": torch.cat(
                    [layers[i][:, -1, :] for layers, _ in rep], dim=0
                ),
            }

    # Context manager so the input file handle is closed after parsing.
    with open("data/out/ewt.json") as f:
        data = json.load(f)
    emb = dict(bow={}, bert=container(), bert24=container(), infersent=container())

    sentences = {
        split: [datum["sent"] for datum in data[split]] for split in ("test", "ref")
    }

    # BOW
    v = CountVectorizer(tokenizer=lambda x: x.split())
    v.fit(sentences["ref"] + sentences["test"])
    for split in ("test", "ref"):
        emb["bow"][split] = torch.tensor(
            v.transform(sentences[split]).toarray(), dtype=torch.float
        )

    for split in ["test", "ref"]:
        sent = sentences[split]
        for mode in ["random", "trained"]:
            trained = mode == "trained"
            # Materialize the encoder output: it is iterated once per layer.
            rep24 = list(Pre.encode_bert(sent, trained=trained, large=True))
            rep = list(Pre.encode_bert(sent, trained=trained))
            emb["infersent"][split][mode] = Pre.encode_infersent(
                sent, trained=trained
            )

            store_bert(emb["bert24"][split][mode], rep24)
            store_bert(emb["bert"][split][mode], rep)
    torch.save(emb, "data/out/ewt_embed.pt")