In [16]:
from collections import Counter
from itertools import chain
import pandas as pd

import re
clean = re.compile('<.*?>')

import pandas as pd
import time
import requests

class ChatNoir:
    

    def __init__(self, queries: pd.DataFrame, size ):
        self.url = 'https://www.chatnoir.eu/api/v1/_search'
        self.all_queries = queries
        self.size = size
    
    def get_response(self):
        df = pd.DataFrame()
        df['topic'] = self.all_queries['topic']
        df['query'] = self.all_queries['query']
        df['tag'] = self.all_queries['tag']

        resp = []

        for i in range(0, len(self.all_queries.index)):
            query = self.all_queries.iloc[i]['query']

            request_data = {
            "apikey": "67fac2d9-0f98-4c19-aab0-18c848bfa130",
            "query": query,
            "size": self.size,
            "index": ["cw12"],
            }
            
            def chatnoir_req():
                try:
                    response_=requests.post(self.url, data=request_data)
                    response_.raise_for_status()
                    return response_.json()
                except requests.exceptions.HTTPError:
                    time.sleep(1)
                    return chatnoir_req()   
             
            result = chatnoir_req()
            filter_result = []
            for doc in result['results']:
                answer = {
                    'trec_id': doc['trec_id'], 
                    'uuid': doc['uuid'], 
                    'title': doc['title'], 
                    'snippet': doc['snippet'],
                    'target_hostname': doc['target_hostname'],
                    'score': doc['score']
                    }
                filter_result.append(answer)
            
            resp.append({'results': filter_result})
        
        df['response'] = resp
        
        return df

class Merge:

    def __init__(self, topics: [str], resp_df: pd.DataFrame, weights: dict, method: str):

        self.resp_df = resp_df
        self.weights = weights
        self.method = method
        self.original_topics = topics
       
    

    def merging(self):
        merged_topics = {}
        for topic in self.original_topics:
          splitdf = self.resp_df[self.resp_df['topic']==topic].reset_index(drop=True)
          res_tags_topic = {}
          for i in range(0, len(splitdf.index)):
            tag = splitdf.iloc[i]['tag']
            res = splitdf.iloc[i]['response']
            res_tags_topic[tag] = res

          merged_resp = self.merging_resp_topic(res_tags_topic)
          merged_topics[topic] = merged_resp

        
        df = self.to_df(merged_topics)
        return df

    def to_df(self, merged_topics):
        final_merged_df = pd.DataFrame()

        topics_col = []
        trecids_col = []
        uuids_col = []
        titles_col = []
        snippets_col = []
        hostnames_col=[]
        scores_col=[]
        updatedscores_col = []


        for topic, res in merged_topics.items():
          n = len(res['results'])
          topics_col.append(n*[topic])
          for doc in res['results']:
            trecids_col.append(doc['trec_id'])
            uuids_col.append(doc['uuid'])
            titles_col.append(re.sub(clean,'',doc['title']))
            snippets_col.append(re.sub(clean, '', doc['snippet']))
            hostnames_col.append(doc['target_hostname'])
            scores_col.append(doc['score'])
            updatedscores_col.append(doc['updated_score'])


        topics_col = list(chain.from_iterable(topics_col))
        
        final_merged_df = pd.DataFrame()
        final_merged_df['topic'] = topics_col
        final_merged_df['trec_id'] = trecids_col
        final_merged_df['uuid'] = uuids_col
        final_merged_df['title'] = titles_col
        final_merged_df['snippet'] = snippets_col
        final_merged_df['target_hostname'] = hostnames_col
        final_merged_df['score'] = scores_col
        final_merged_df['updated_score'] = updatedscores_col

        return final_merged_df

    def merging_resp_topic(self, res_tags):
        #resp['results'] = [doc1, doc2, ...]
        updated_resp_tags=[]
        for tag, resp in res_tags.items():
            updated_resp = self.update_scores_by_tags(tag, resp['results']) #(tag, updated_resp)
            updated_resp_tags.append(updated_resp)
        
        sorted_updated_resp_tags = list(chain.from_iterable([updated_resp[1] for updated_resp in updated_resp_tags]))
        #sort list by update_score: list of docs 
        sorted_updated_resp_tags = sorted(sorted_updated_resp_tags, key=lambda doc: doc['updated_score'], reverse=True)
        #find docs with same trec-id = trec-id appear multiple times
        trec_ids = [doc['trec_id'] for doc in sorted_updated_resp_tags]
        multiple_ids = [trec_id for trec_id, count in dict(Counter(trec_ids)).items() if count!=1]
        
        merged_resp=[]
        if multiple_ids!=[]:
            merged_resp = self.mergen_for_multiple_ids(sorted_updated_resp_tags, multiple_ids)
        else:
            merged_resp = sorted_updated_resp_tags
        return {'results':merged_resp}

    def update_scores_by_tags(self, tag, resp):
        updated_resp = []
        
        if "sensevec" in tag:
            weight = self.weights['sensevec']
        elif "embedded" in tag: #because embedded_1, embedded_2, ...
            weight = self.weights['embedded']
        else:
            weight = self.weights[tag]

        for doc in resp:
                doc['updated_score'] =  doc['score']*weight #update score with weight
                updated_resp.append(doc)

        return (tag, updated_resp)

    def mergen_for_multiple_ids(self, sorted_updated_resp_tags, multiple_ids):
        merged = []
        if self.method=="max":
            
            max_docs = []
            for trec_id in multiple_ids:
                
                docs_id = [doc for doc in sorted_updated_resp_tags if doc['trec_id']==trec_id] #get docs with same id
                #find doc with max-update-score
                
                max_score = max([doc['updated_score'] for doc in sorted_updated_resp_tags if doc['trec_id']==trec_id])
                
                max_doc = [doc for doc in docs_id if doc['updated_score']==max_score][0]
                
                max_docs.append(max_doc)
                
            
            #add results with unique trec_id
            for doc in sorted_updated_resp_tags:
                if doc['trec_id'] not in multiple_ids:
                    merged.append(doc)
            #add result with largest score from max_docs
            for max_doc in max_docs:
                merged.append(max_doc)
            #here sorting by score, not update-score, because update-score is only used for merging
            
            merged = sorted(merged, key=lambda doc: doc['score'], reverse=True)

        else: #self.method=="mean"
            avg_docs = []
            for trec_id in multiple_ids:
                from statistics import mean
                #get information, first doc index 0 and then update score:
                avg_doc = [doc for doc in sorted_updated_resp_tags if doc['trec_id']==trec_id][0] #already sorted, index[0] means the largest score
                avg_doc['updated_score'] = mean([doc['updated_score'] for doc in sorted_updated_resp_tags if doc['trec_id']==trec_id])
                avg_docs.append(avg_doc)
            
            for doc in sorted_updated_resp_tags:
                if doc['trec_id'] not in multiple_ids:
                    merged.append(doc)
            #add result with avg score from same_trec_ids_max_value
            for max_doc in avg_docs:
                merged.append(max_doc)
            merged = sorted(merged, key=lambda doc: doc['score'], reverse=True)

        return merged
if __name__ == "__main__":
    topics = ['What is the difference between sex and love?',
              'Which is the highest mountain in the world?',
               'Which is better, a laptop or a desktop?'
               ]
    
    df = pd.DataFrame()
    df['topic'] = [
        'What is the difference between sex and love?',
        'What is the difference between sex and love?',
        'Which is the highest mountain in the world?',  
        'Which is the highest mountain in the world?',
        'Which is better, a laptop or a desktop?',
        'Which is better, a laptop or a desktop?'
    ]
    df['query'] = [
        'What is the difference between sex and love?',
        'difference between sex and love',
        'Which is the highest mountain in the world?',
        'highest mountain world',
        'Which is better, a laptop or a desktop?',
        'better laptop or desktop'
    ]
    df['tag'] = [
        'original',
        'annotation',
        'original',
        'annotation',
        'original',
        'annotation'
    ]
    pd.set_option('display.max_columns', 10)
    
    chatnoir = ChatNoir(df, size=5)
    print(10*"=" + "RESULTS FROM CHATNOIR" + 10*"=")
    cn_resp_df = chatnoir.get_response()
    print(cn_resp_df)
    
    
    #RESULTS AFTER CHATNOIR
    
    #MERGING RESULTS
    print("MERGING RESPONSE AND SAVE TO DATAFRAME")
    weights = {
      'original':2,
      'annotation': 1.75,
      'sensevec': 1.5,
      'embedded': 1.5,
      'syns': 1,
      'preprocessing': 1
    }
    merged_resp = Merge(topics, cn_resp_df, weights, method='max').merging()
    print(merged_resp)

                                          topic  \
0  What is the difference between sex and love?   
1  What is the difference between sex and love?   
2   Which is the highest mountain in the world?   
3   Which is the highest mountain in the world?   
4       Which is better, a laptop or a desktop?   
5       Which is better, a laptop or a desktop?   

                                          query         tag  \
0  What is the difference between sex and love?    original   
1               difference between sex and love  annotation   
2   Which is the highest mountain in the world?    original   
3                        highest mountain world  annotation   
4       Which is better, a laptop or a desktop?    original   
5                      better laptop or desktop  annotation   

                                            response  
0  {'results': [{'trec_id': 'clueweb12-1214wb-88-...  
1  {'results': [{'trec_id': 'clueweb12-1811wb-62-...  
2  {'results': [{'trec_id': 'cluewe

In [15]:
cn_resp_df['response'][5]

{'results': [{'score': 1986.9352,
   'snippet': 'The benefits of having your own computer <em>or</em> <em>laptop</em> are many. <em>Desktops</em> come in a wide range of configurations and sizes. <em>Laptops</em> (including notebooks) have great mobility. Two other portable variations are netbooks and smartbooks. ',
   'target_hostname': 'www.barton.cc.ks.us',
   'title': '<em>Laptop</em> <em>or</em> <em>Desktop</em>?',
   'trec_id': 'clueweb12-1516wb-62-16691',
   'updated_score': 3477.1366,
   'uuid': 'bc13e394-95b9-5929-a790-e2b1b3ce7867'},
  {'score': 1985.8129,
   'snippet': 'The benefits of having your own computer <em>or</em> <em>laptop</em> are many. <em>Desktops</em> come in a wide range of configurations and sizes. <em>Laptops</em> (including notebooks) have great mobility. Two other portable variations are netbooks and smartbooks. ',
   'target_hostname': 'www.bartonccc.edu',
   'title': '<em>Laptop</em> <em>or</em> <em>Desktop</em>?',
   'trec_id': 'clueweb12-0401wb-86-2626