In [5]:
import ir_datasets
import pandas as pd

In [2]:
dataset = ir_datasets.load('msmarco-passage/train/judged')

In [7]:
print("Loading qrels")
qrels_collection =\
    pd.DataFrame.from_records(dataset.qrels_iter(),
                                columns=['query_id', 'doc_id', 'relevance','iteration'],
                                index='query_id')
print("Loading documents")
documents_collection =\
    pd.DataFrame.from_records(dataset.docs_iter(),
                                columns=['doc_id', 'text'],
                                index='doc_id')
print("Loading queries")
queries_collection =\
    pd.DataFrame.from_records(dataset.queries_iter(),
                                columns=['query_id', 'text'],
                                index='query_id')

Loading qrels
Loading documents
Loading queries


/people/gerald/Documents/repositories/continual_learning_of_long_topic


In [41]:
from lire.dataset import generate_dataset

In [47]:
subsample = text_queries[: 10000]

In [48]:
''' Clustering query to build tasks.
The following script create a train/val/test set with(n, 20, 40) queries respectivelly
'''

import pandas as pd
import numpy as np
import argparse
import pickle
import os
import torch
import tqdm
import random
import inspect
from sentence_transformers import SentenceTransformer, util



class ContinualGenerator():
    ''' Continual generator that split corpus based on topics.

        Parameters:

            query_set : [(query_id, query_txt)]
                The set of queries information (query_id and the text 
                associated to the query)
            pre_computed_embedding_path : str(path) Optional
                The path to the queries embedding if not given
                embeddings will be computed instead (can take time).
                If no file is available at the current path but the 
                parameter is given the embedding will be saved at the 
                location

    '''
    def __init__(self, query_set, pre_computed_embedding_path=None):
        self.query_set = query_set
        self.pre_computed_embedding_path = pre_computed_embedding_path


    def _get_embeddings(self, queries_content): 
        if self.pre_computed_embedding_path is None:
            model = SentenceTransformer('stsb-roberta-large')
            corpus_embeddings = model.encode(queries_content, show_progress_bar=True, convert_to_numpy=True)

        elif not os.path.exists(self.pre_computed_embedding_path):
            os.makedirs(os.path.dirname(self.pre_computed_embedding_path), exist_ok=True)
            model = SentenceTransformer('stsb-roberta-large')
            corpus_embeddings = model.encode(queries_content, show_progress_bar=True, convert_to_numpy=True)

            with open(self.pre_computed_embedding_path, "wb") as fOut:
                pickle.dump({'sentences': queries_content , 'embeddings': corpus_embeddings}, fOut)

        else:
            with open(self.pre_computed_embedding_path, "rb") as fOut:
                corpus_embeddings = pickle.load(fOut)['embeddings']

        return corpus_embeddings
    
    @staticmethod
    def community_filtering(cos_scores, alpha):
        return torch.arange(len(cos_scores))[cos_scores >= alpha]

    @staticmethod
    def community_detection_clustering(embeddings, ss_embeddings_estimation=50000,
                                       alpha=0.75, beta=0.55, mcs=2000):
        rp = torch.randperm(len(embeddings))
        sse = rp[:ss_embeddings_estimation]
        ss_embeddings = embeddings[sse]
        emcs = round((ss_embeddings_estimation/len(embeddings)) * mcs)
        print('Computing the cos score on ', len(ss_embeddings), ' embeddings')
        cos_scores = util.pytorch_cos_sim(ss_embeddings, ss_embeddings)

        print('Retrieve the ', emcs, ' closest neigbhors for each embeddings (max similarity)')
        top_k_values, top_k_indexes = cos_scores.topk(k=emcs, largest=True)

        print('Create the communities')
        communities = [(rp[top_k_indexes[i][0]], rp[ContinualGenerator.community_filtering(cos_scores[i], alpha)]) 
                    for i in range(len(top_k_values))  
                    if(top_k_values[i][-1] >= alpha)]
        print('Filtering communities to avoid overlapping (at the time there is ', len(communities),' communities)')
        sorted_communities = sorted(communities, key=lambda x: len(x[1]), reverse=True)

        unique_communities = []
        extracted_ids = set()

        for centroid, community in sorted_communities:
            add_cluster = True
            for idx in community:
                if idx in extracted_ids:
                    add_cluster = False
                    break

            if add_cluster:
                unique_communities.append((centroid, community))
                for idx in community:
                    extracted_ids.add(idx)
        centroids = [torch.Tensor(embeddings[centroid]) for centroid, _ in unique_communities]

        print('creating real clusters according to number of examples by clusters')
        print(len(embeddings))
        print(len(centroids))
        cluster_center = torch.stack(centroids)
        complete_communities = [[] for i in range(len(cluster_center))]
        for i, d in enumerate(embeddings):
            cos_sim = util.pytorch_cos_sim(cluster_center, d).flatten()
            max_value, max_index = cos_sim.max(-1)
            if max_value >= beta:
                complete_communities[max_index].append(i)
        
    
        return centroids, complete_communities




    def generate(self, t1=0.75, t2=0.5, mcs=2000, estimation_set_size=50000):
        ''' Generate groups of queries.
            
            Parameters:
                t1: float
                    the first threshold
                t2: float
                    the second threshold (all dataset)
                msc: int
                    minimum number of groups representent
                estimation_set_size: int 
                    the number of elements to perform clustering
                    according to t1
        '''
        queries_ids = [k for (k, v) in self.query_set]
        queries_content = [v for (k, v) in self.query_set]
        embeddings = self._get_embeddings(queries_content)
        centroids, communities =\
            ContinualGenerator.community_detection_clustering(embeddings, estimation_set_size, t1, t2, mcs)
        fcom = [[queries_ids[query_index] for j, query_index in enumerate(community)]
                 for i, community in enumerate(communities)]
        return fcom, centroids



In [49]:
generator = ContinualGenerator(subsample)

In [72]:
clusters, embeddings = generator.generate(t1=0.5, t2=0.5, mcs=10, estimation_set_size=500)


# three generation with parameters
# small 0.7 0.5 (mcs must be higher)
# medium 0.75 0.5
# large 0.75 0.55 (mcs must be lower)

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Computing the cos score on  500  embeddings
Retrieve the  5  closest neigbhors for each embeddings (max similarity)
Create the communities
Filtering communities to avoid overlapping (at the time there is  59  communities)
creating real clusters according to number of examples by clusters
1000
59


In [69]:
len(clusters)

53

In [71]:
[queries_collection.loc[i] for  i in clusters[0]]

[text    what is an example of paracrine signaling
 Name: 714345, dtype: object,
 text    can depression cause insomnia
 Name: 66378, dtype: object,
 text    what is substance p
 Name: 800666, dtype: object,
 text    how many paracetamol is an overdose
 Name: 291598, dtype: object]