In [1]:
import sys
sys.path.append("/home/gerald/Documents/CPD/repository/LifelongInformationRetrieval") 

# Question Clustering
The objective of the current notebook is to cluster the queries of the datasets, such same topic query remains in a same cluster. The proposed methods remains simple, it use $K$-Means algorithm on vectorised sentences. To get sentence vector we use the **CLS** token of *BERT* classification model.

In [2]:
from lire.data_tools.dataset import MSMarco
import pandas as pd

data_folder   = "/media/gerald/00B1B02B44A76AB2/CPD/data"
train_queries = MSMarco.MSMarcoPassageRankingDataset.load_queries("train", data_folder)
dev_queries   = MSMarco.MSMarcoPassageRankingDataset.load_queries("dev", data_folder)
eval_queries  = MSMarco.MSMarcoPassageRankingDataset.load_queries("eval", data_folder)
queries_set   = pd.concat([train_queries, dev_queries, eval_queries])

In [3]:
query_example = queries_set.iloc[15][1]
print('Example of query: "', query_example, '"')

Example of query: " number of times congress voted to repeal aca "


## I. Compare Language Model for clustering

In [13]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [14]:
inputs = tokenizer(query_example, return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(inputs.input_ids.squeeze().tolist()))
sequence_output, all_layer_output = model(**inputs, output_hidden_states=True)

['<s>', 'san', 'itiz', 'er', 'Ġtemperature', '</s>']


In [15]:
query_a = 'What is a transformer model in deep learning ?'
query_b = 'Why transformers models are so efficient in deep learning ?'
query_c = 'What is the coronavirus ?'
query_d = 'The pandemic of the coronavirus take place in late 2019'

We want $sim(q_a, q_b)>sim(q_a, q_c) + \epsilon_1 $ and that $sim(q_a, q_c) + \epsilon_2 <sim(q_c, q_d)$, with larger possible $\epsilon_1, \epsilon_2$ values. 

In [16]:
inputs = tokenizer(query_a, return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True)
query_a_embed_cls = outputs[1][-1][0][0]

inputs = tokenizer(query_b, return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True)
query_b_embed_cls = outputs[1][-1][0][0]

inputs = tokenizer(query_c, return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True)
query_c_embed_cls = outputs[1][-1][0][0]

inputs = tokenizer(query_d, return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True)
query_d_embed_cls = outputs[1][-1][0][0]


### Comparing using scalar product

In [17]:
print(torch.einsum("i, i -> ",query_a_embed_cls, query_b_embed_cls))

tensor(123.7903, grad_fn=<ViewBackward>)


In [18]:
print(torch.einsum("i, i -> ",query_a_embed_cls, query_c_embed_cls))

tensor(124.4663, grad_fn=<ViewBackward>)


In [19]:
print(torch.einsum("i, i -> ",query_b_embed_cls, query_c_embed_cls))

tensor(122.7851, grad_fn=<ViewBackward>)


In [20]:
print(torch.einsum("i, i -> ",query_b_embed_cls, query_d_embed_cls))

tensor(119.3706, grad_fn=<ViewBackward>)


In [21]:
print(torch.einsum("i, i -> ",query_c_embed_cls, query_d_embed_cls))

tensor(120.1931, grad_fn=<ViewBackward>)


### Comparing using l2 distance

In [22]:
print(((query_a_embed_cls - query_b_embed_cls)**2).sum())

tensor(0.1632, grad_fn=<SumBackward0>)


In [23]:
print(((query_a_embed_cls - query_c_embed_cls)**2).sum())

tensor(0.2444, grad_fn=<SumBackward0>)


In [24]:
print(((query_b_embed_cls - query_c_embed_cls)**2).sum())

tensor(0.2862, grad_fn=<SumBackward0>)


In [25]:
print(((query_b_embed_cls - query_d_embed_cls)**2).sum())

tensor(0.7310, grad_fn=<SumBackward0>)


In [26]:
print(((query_c_embed_cls - query_d_embed_cls)**2).sum())

tensor(0.5191, grad_fn=<SumBackward0>)


## Using sentence-bert

In [13]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('stsb-roberta-large')

In [14]:
#Sentences are encoded by calling model.encode()
query_a_embed = model.encode(query_a)
query_b_embed = model.encode(query_b)
query_c_embed = model.encode(query_c)
query_d_embed = model.encode(query_d)

NameError: name 'query_a' is not defined

In [None]:
cos_sim = util.pytorch_cos_sim(query_a_embed, query_b_embed)
print("cos(a, b) = ", cos_sim.item())
cos_sim = util.pytorch_cos_sim(query_a_embed, query_c_embed)
print("cos(a, c) = ", cos_sim.item())
cos_sim = util.pytorch_cos_sim(query_b_embed, query_c_embed)
print("cos(b, x) = ", cos_sim.item())
cos_sim = util.pytorch_cos_sim(query_b_embed, query_d_embed)
print("cos(b, d) = ", cos_sim.item())
cos_sim = util.pytorch_cos_sim(query_c_embed, query_d_embed)
print("cos(c, d) = ", cos_sim.item())

## Using sentence-distilbert

This model is used by default in the query clustering example shown in sentence-transformer [repository](https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/clustering/fast_clustering.py)
We use it to create the dataset **MSMarcoPassageRankingTopicQueryDataset**, however remarks that it is easy to create an other dataset based on topics deriving the class with only a different *configuration_path* attribute. 

In [30]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')

In [31]:
query_a_embed = model.encode(query_a)
query_b_embed = model.encode(query_b)
query_c_embed = model.encode(query_c)
query_d_embed = model.encode(query_d)

In [33]:
cos_sim = util.pytorch_cos_sim(query_a_embed, query_b_embed)
print("cos(a, b) = ", cos_sim.item())
cos_sim = util.pytorch_cos_sim(query_a_embed, query_c_embed)
print("cos(a, c) = ", cos_sim.item())
cos_sim = util.pytorch_cos_sim(query_b_embed, query_c_embed)
print("cos(b, c) = ", cos_sim.item())
cos_sim = util.pytorch_cos_sim(query_b_embed, query_d_embed)
print("cos(b, d) = ", cos_sim.item())
cos_sim = util.pytorch_cos_sim(query_c_embed, query_d_embed)
print("cos(c, d) = ", cos_sim.item())

cos(a, b) =  0.718645453453064
cos(a, c) =  0.3546810448169708
cos(b, c) =  0.32943886518478394
cos(b, d) =  0.45006805658340454
cos(c, d) =  0.4905753433704376


## II.Clustering using sentence bert 

In [15]:
import torch
import numpy as np
import tqdm
import os
import csv
import pickle
import time

from sentence_transformers import SentenceTransformer, util
embedding_cache_path = "/local/gerald/CPD/data/query_embeddings.pkl"

In [23]:
# According to https://www.sbert.net/examples/applications/clustering/README.html (fast clustering)

import numpy as np

def community_detection(embeddings, threshold=0.75, min_community_size=10, init_max_size=1000):
    """
    Function for Fast Community Detection
    Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold).
    Returns only communities that are larger than min_community_size. The communities are returned
    in decreasing order. The first element in each list is the central point in the community.
    """

    # Compute cosine similarity scores
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # Minimum size for a community
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

    # Filter for rows >= min_threshold
    extracted_communities = []
    for i in range(len(top_k_values)):
        if top_k_values[i][-1] >= threshold:
            new_cluster = []

            # Only check top k most similar entries
            top_val_large, top_idx_large = cos_scores[i].topk(k=init_max_size, largest=True)
            top_idx_large = top_idx_large.tolist()
            top_val_large = top_val_large.tolist()

            if top_val_large[-1] < threshold:
                for idx, val in zip(top_idx_large, top_val_large):
                    if val < threshold:
                        break

                    new_cluster.append(idx)
            else:
                # Iterate over all entries (slow)
                for idx, val in enumerate(cos_scores[i].tolist()):
                    if val >= threshold:
                        new_cluster.append(idx)

            extracted_communities.append(new_cluster)

    # Largest cluster first
    extracted_communities = sorted(extracted_communities, key=lambda x: len(x), reverse=True)

    # Step 2) Remove overlapping communities
    unique_communities = []
    extracted_ids = set()

    for community in extracted_communities:
        add_cluster = True
        for idx in community:
            if idx in extracted_ids:
                add_cluster = False
                break

        if add_cluster:
            unique_communities.append(community)
            for idx in community:
                extracted_ids.add(idx)

    return unique_communities

def completing_cluster(cluster_center, data):
    clusters = [[] for x in cluster_center]
    data = torch.Tensor(data)
    cluster_center = torch.Tensor(cluster_center)
    for i, d in zip(tqdm.notebook.trange(len(data)),data):
        cos_sim = util.pytorch_cos_sim(cluster_center, d)
        clusters[cos_sim.argmax()].append(i)
    return clusters

In [24]:


# defining the model with embedding will be used for clustering
model = SentenceTransformer('stsb-roberta-large')
# model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')

embedding_cache_path = "/local/gerald/CPD/data/query_embeddings2.pkl"



corpus_sentences = queries_set[1].tolist() 



if not os.path.exists(embedding_cache_path):
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_numpy=True)
    print("Store file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({'sentences': corpus_sentences, 'embeddings': corpus_embeddings}, fOut)

print("Load pre-computed embeddings from disc")
with open(embedding_cache_path, "rb") as fIn:
    cache_data = pickle.load(fIn)
corpus_sentences = cache_data['sentences']
corpus_embeddings = cache_data['embeddings']
corpus_sentences_sub = corpus_sentences[:50000]
corpus_embeddings_sub = corpus_embeddings[:50000]
clusters = community_detection(corpus_embeddings_sub, threshold=0.8,
                               min_community_size=2, init_max_size=1000)
cluster_sentences = [[corpus_sentences_sub[s] for s in cluster] 
                     for cluster in clusters]
print("Number of clusters discovered ", len(cluster_sentences))
cluster_center = np.vstack([corpus_embeddings[cluster[0]] for cluster in clusters])
cluster_completed = completing_cluster(cluster_center, corpus_embeddings) 
torch.save(cluster_completed,"/local/gerald/CPD/data/cluster_completed2.pth")

Load pre-computed embeddings from disc
Number of clusters discovered  2727


  0%|          | 0/1010916 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [33]:
torch.save(cluster_center, "/local/gerald/CPD/data/cluster_center2.pth")

In [22]:
with open(embedding_cache_path, "rb") as fIn:
    cache_data = pickle.load(fIn)
corpus_sentences = cache_data['sentences']
corpus_embeddings = cache_data['embeddings']
clusters_final = torch.load("/local/gerald/CPD/data/cluster_completed2.pth")
cluster_sentences = [[corpus_sentences[s] for s in cluster] for cluster in clusters_final]

In [30]:
cluster_sentences[3540]

['what county is hollywood fl',
 "how far is harrah's casino from the french quarter",
 'what sea is canary islands in',
 'what county is denham springs louisiana in',
 'when did allies land on beaches in normandy france on d day',
 'how many soldiers died storming the beaches of normandy',
 'what are faverolle chickens',
 'where does ellie louise live',
 'when were the new orleans pelicans formed',
 'where have laura ingalls wilder lived',
 'what is the hallux of the foot',
 'seafield inpatient number',
 'what breed of canary is a pure white one',
 'demographics of holly springs nc population',
 'wells fargo telephone number',
 'what county is newhall',
 'how far is the td gardens from hyatt regency',
 'is tulane a private university',
 'what county is crawfordville fl',
 'what county is north port fl in',
 'what county is hendersonville tn in',
 'when did us storm normandy beach',
 'what county is holiday, fl',
 'navy federal cliffdale branch phone number',
 'did finn ever throw ball

## III.Cleaning the dataset

In [None]:
data_folder    = "/local/gerald/CPD/data"
embedding_path = "/local/gerald/CPD/data/query_embeddings2.pkl"
cluster_path   = "/local/gerald/CPD/data/cluster_completed2.pth"
cluster_center   = "/local/gerald/CPD/data/cluster_center2.pth"

train_queries = MSMarco.MSMarcoPassageRankingDataset.load_queries("train", data_folder)
dev_queries   = MSMarco.MSMarcoPassageRankingDataset.load_queries("dev", data_folder)
eval_queries  = MSMarco.MSMarcoPassageRankingDataset.load_queries("eval", data_folder)
train_qrels   = MSMarco.MSMarcoPassageRankingDataset.load_qrels("train", data_folder)
dev_qrels     = MSMarco.MSMarcoPassageRankingDataset.load_qrels("dev", data_folder)
qrels_set     = Qrels.merge_qrels(train_qrels, dev_qrels)

queries_set   = pd.concat([train_queries, dev_queries, eval_queries])
query_example = queries_set.iloc[11][1]
print('Example of query: "', query_example, '0"')

queries_sentences  = queries_set[1].tolist() 
queries_ids        = queries_set.index.values.tolist()
queries_inv_ids    = {v:i for i, v in enumerate(queries_ids)}
queries_ids_train  = len(train_queries)
queries_ids_dev    = len(train_queries) + len(dev_queries)
queries_ids_eval   = len(train_queries) + len(dev_queries) + len(eval_queries)
clusters    = torch.load(cluster_path)


### Retrieve the split and looking for empty/too small clusters

In [None]:
train_set, dev_set, eval_set = [], [], [] 
for cluster in clusters:
    train_set.append([])
    dev_set.append([])
    eval_set.append([])

    for query in cluster:
        if(str(queries_ids[query]) in qrels_set):
            if(query < queries_ids_train):
                train_set[-1].append(queries_ids[query])
            elif(query < queries_ids_dev):
                dev_set[-1].append(queries_ids[query])
            elif(query < queries_ids_eval):
                eval_set[-1].append(queries_ids[query])
            else:
                raise Exception("Irrelevant query id")

too_small_dev_cluster = [ i for i, cluster in enumerate(dev_set) if(len(cluster) <= 2)]
too_small_eval_cluster = [ i for i, cluster in enumerate(eval_set) if(len(cluster) <= 2)]

print("Number of dev sets that contains less than 2 queries : ", len(too_small_dev_cluster))
print("Number of eval sets that contains less than 2 queries : ", len(too_small_eval_cluster))

### Merge  of too small cluster and loading needed data

In [None]:
with open(embedding_path, "rb") as fIn:
    cache_data = pickle.load(fIn)
corpus_sentences = cache_data['sentences']
corpus_embeddings = cache_data['embeddings']

clusters_to_merge = too_small_dev_cluster
cluster_center = torch.load("/local/gerald/CPD/data/cluster_center2.pth")

clusters_center = torch.load("/local/gerald/CPD/data/cluster_center2.pth")
cluster_center_merged  = []
cluster_center_reindex = []
new_clusters = []

# reindexing
for i, cluster_center in enumerate(clusters_center):
    if i not in clusters_to_merge:
        cluster_center_merged.append(clusters_center[i])
        cluster_center_reindex.append(i)
        new_clusters.append(clusters[i])

# much faster using pytorch
cluster_center_merged  = torch.Tensor(cluster_center_merged)
corpus_embeddings = torch.Tensor(corpus_embeddings)

# merging
for i, cluster_index in zip(tqdm.notebook.trange(len(clusters_to_merge)), clusters_to_merge):
    cluster = clusters[cluster_index]
    for query in cluster:
        cos_sim = util.pytorch_cos_sim(cluster_center_merged, corpus_embeddings[query])
        new_clusters[cos_sim.argmax()].append(query)



### Retrieve agains the splits and save the clusters indexes

In [None]:

train_set, dev_set, eval_set = [], [], [] 
for cluster in new_clusters:
    train_set.append([])
    dev_set.append([])
    eval_set.append([])
    
    for query in cluster:
        if(str(queries_ids[query]) in qrels_set):
            if(query < queries_ids_train):
                train_set[-1].append(queries_ids[query])
            elif(query < queries_ids_dev):
                dev_set[-1].append(queries_ids[query])
            elif(query < queries_ids_eval):
                eval_set[-1].append(queries_ids[query])
            else:
                raise Exception("Irrelevant query id")
import json
with open('/web/gerald/public_html/lire_data/msmarco_topics_cleaned_train_stsb-roberta-large.json', 'w') as outfile:
    json.dump(train_set, outfile, indent = 4)
with open('/web/gerald/public_html/lire_data/msmarco_topics_cleaned_dev_stsb-roberta-large.json', 'w') as outfile:
    json.dump(dev_set, outfile, indent = 4)
with open('/web/gerald/public_html/lire_data/msmarco_topics_cleaned_eval_stsb-roberta-large.json', 'w') as outfile:
    json.dump(eval_set, outfile, indent = 4)

### Process Top1000 by topics

In [1]:
import sys
sys.path.append("/home/gerald/Documents/CPD/repository/LifelongInformationRetrieval") 
import h5py

import os
import json
import pandas as pd

from lire.data_tools.dataset import MSMarco
from lire.data_tools import data_large

In [2]:

with open('/web/gerald/public_html/lire_data/msmarco_topics_cleaned_train_stsb-roberta-large.json', 'r') as infile:
    train_set = json.load(infile)
with open('/web/gerald/public_html/lire_data/msmarco_topics_cleaned_dev_stsb-roberta-large.json', 'r') as infile:
    dev_set   = json.load(infile)
with open('/web/gerald/public_html/lire_data/msmarco_topics_cleaned_eval_stsb-roberta-large.json', 'r') as infile:
    eval_set  = json.load(infile)

data_folder   = "/media/gerald/00B1B02B44A76AB2/CPD/data"
train_queries = MSMarco.MSMarcoPassageRankingDataset.load_queries("train", data_folder)
dev_queries   = MSMarco.MSMarcoPassageRankingDataset.load_queries("dev", data_folder)
eval_queries  = MSMarco.MSMarcoPassageRankingDataset.load_queries("eval", data_folder)
queries_set   = pd.concat([train_queries, dev_queries, eval_queries])

In [3]:
queries_set_unique = set(queries_set.index.unique().tolist())
data_folder = os.path.join(data_folder, "MSMarcoPassageRankingDataset")
dev_top_1000_path = os.path.join(data_folder, "dev-top-1000.data")
train_top_1000_path = os.path.join(data_folder, "train-id-top-1000.data")
train_triplet_path = os.path.join(data_folder,'train_positive_negative.data')


In [4]:
dev_inverse_cluster = {qid:cid for cid, c in enumerate(dev_set) for qid in c}
train_inverse_cluster = {qid:cid for cid, c in enumerate(train_set) for qid in c}

In [5]:
hdf5_filepath = os.path.join(data_folder, "data.hdf5")
if(os.path.exists(hdf5_filepath)):
    os.remove(hdf5_filepath)

root = h5py.File(hdf5_filepath,'a')

dev = root.create_group("dev")
dev_1000 = dev.create_group("dev-top-1000")
train = root.create_group("train")
train_1000 = train.create_group("train-top-1000")
train_triple = train.create_group("qid-pid-nid-cluster")

In [6]:
data_large.HDF5DatasetManager.batched_csv_to_hdf5(hdf5_filepath, dev_1000.name,
                                                  dev_top_1000_path, col_name=["qid", "did"],
                                                  group_by_key="qid", col_type=int, batch_size=int(1e7)
                                                 )
print("Training data")
data_large.HDF5DatasetManager.batched_csv_to_hdf5(hdf5_filepath, train_1000.name,
                                                  train_top_1000_path, col_name=["qid", "did"],
                                                  group_by_key="qid", col_type=int, batch_size=int(1e7)
                                                 )

Preprocessing and counting nb lines, can be long
Nb lines processed: 6668967

100%|██████████| 6980/6980 [00:00<00:00, 8001.23it/s]
100%|██████████| 1/1 [00:39<00:00, 39.40s/it]


Training data
Preprocessing and counting nb lines, can be long
Nb lines processed: 90000000

KeyboardInterrupt: 

In [None]:
train_triple = train.create_group("qid-pid-nid-cluster")
data_large.HDF5DatasetManager.batched_csv_to_hdf5(hdf5_filepath, train_triple.name,
                                                  train_triplet_path, col_name=["qid", "pid", "nid"],
                                                  group_by_key="qid", group_by_func=lambda x : train_inverse_cluster[x],
                                                  col_type=int, batch_size=int(1e7)
                                                 )
