In [1]:
import pandas as pd
publication_data = pd.read_csv('publication.csv', header=None)

In [2]:
publications_id_document_dict = dict()
for i in range(0, len(publication_data)):
    pub_info = publication_data.iloc[i]
    s_id = pub_info[0]
    document = str(pub_info[1]) + " " + str(pub_info[2]) + " " + str(pub_info[3])
    publications_id_document_dict[s_id] = document

In [3]:
import jsonlines
publication_candidates = dict()
with jsonlines.open('candidate_document_headQ_recommendation_top100.jsonl') as reader:
    for item in reader:
        s_id = item["s_id"]
        candidate_docs_dict = item["candidate_docs"]
        publication_candidates[s_id] = list(candidate_docs_dict.keys())

In [4]:
candidate_data = pd.read_csv('dataset.csv', header=None)

In [5]:
candidate_id_document_dict = dict()
for i in range(0, len(candidate_data)):
    candidate_info = candidate_data.iloc[i]
    c_id = candidate_info[0]
    document = str(candidate_info[1]) + " " + str(candidate_info[2]) + " " + str(candidate_info[3])
    candidate_id_document_dict[c_id] = document

In [6]:
def intersection(lst1, lst2):
    intersect = set([value for value in lst1 if value in lst2])
    return intersect

In [7]:
import math
def query_likelihood_retrieval(q, d, C_model, alpha_d, lambda_, mu):
    word_list = list(intersection(q,d))
    scores = [0,0,0]
    # calculate c(w|d)
    count_of_w_in_d = dict()
    for word in d:
        if word in word_list:
            if word not in count_of_w_in_d:
                count_of_w_in_d[word]=0
            count_of_w_in_d[word]+=1
        #calculate sum of log(P(w|c)) for every w in d
        scores[0] += math.log(C_model[word])
        
    # calculate non-smoothing_query_likelihood
    for word in word_list:
        scores[0] += math.log((count_of_w_in_d[word]/len(word_list))/(alpha_d*C_model[word]))
    scores[0] += len(d) * math.log(alpha_d)
    
    # calculate c(w|q)
    count_of_w_in_q = dict()
    for word in q:
        if word in word_list:
            if word not in count_of_w_in_q:
                count_of_w_in_q[word]=0
            count_of_w_in_q[word]+=1
    # JM smoothing
    for word in word_list:
        scores[1] += count_of_w_in_q[word] + math.log(1 + (1-lambda_)/lambda_*count_of_w_in_d[word]/(len(d)*C_model[word]))
        
    # Dir smoothing
    for word in word_list:
        scores[2] += count_of_w_in_q[word] + math.log(1 + count_of_w_in_d[word]/(mu*C_model[word]))
    scores[2] += len(q) * math.log(mu/(mu+len(d)))
    return scores

In [8]:
candidate_document_ranking = dict()
for publication in publication_candidates.keys():
    candidates = publication_candidates[publication]
    # build the background model: A dict that contains P(w|C)
    collection = publications_id_document_dict[publication].split()
    for candidate in candidates:
        if candidate in candidate_id_document_dict:
            collection += candidate_id_document_dict[candidate].split()
    back_ground_model = dict()
    for word in collection:
        if word not in back_ground_model:
            back_ground_model[word] = 0
        back_ground_model[word] +=1
    length = len(collection)
    for word in back_ground_model.keys():
        back_ground_model[word] /= length
    # Calculate the query likelihood probability
    document_id_score_dict = dict()
    document_id_jmscore_dict = dict()
    document_id_dirscore_dict = dict()
    for candidate in candidates:
        if candidate in candidate_id_document_dict:
            q = publications_id_document_dict[publication].split()
            d = candidate_id_document_dict[candidate].split()
            alpha_d = 0.1
            lambda_ = 0.5
            mu = 0.5
            scores = query_likelihood_retrieval(q,d,back_ground_model,alpha_d, lambda_, mu)
            document_id_score_dict[candidate] = scores[0]
            document_id_jmscore_dict[candidate] = scores[1]
            document_id_dirscore_dict[candidate] = scores[2]
    # Get the top 10 rank
    rank_non_smoothing = sorted(document_id_score_dict, key=document_id_score_dict.get)[-3:]
    rank_jm_smoothing = sorted(document_id_jmscore_dict, key=document_id_jmscore_dict.get)[-3:]
    rank_dir_smoothing = sorted(document_id_dirscore_dict, key=document_id_dirscore_dict.get)[-3:]
    rank_non_smoothing.reverse()
    rank_jm_smoothing.reverse()
    rank_dir_smoothing.reverse()
    #write to the ranking dictionary
    candidate_document_ranking[publication] = dict()
    candidate_document_ranking[publication]['normal'] = rank_non_smoothing
    candidate_document_ranking[publication]['normal_score'] = [document_id_score_dict[d] for d in rank_non_smoothing]
    candidate_document_ranking[publication]['jm'] = rank_jm_smoothing
    candidate_document_ranking[publication]['jm_score'] = [document_id_jmscore_dict[d] for d in rank_jm_smoothing]
    candidate_document_ranking[publication]['dir'] = rank_dir_smoothing
    candidate_document_ranking[publication]['dir_score'] = [document_id_dirscore_dict[d] for d in rank_dir_smoothing]

In [9]:
print(candidate_document_ranking['gesis-ssoar-62031'])

{'normal': ['datasearch-httpwww-da-ra-deoaip--oaioai-da-ra-de621835', 'datasearch-httpwww-da-ra-deoaip--oaioai-da-ra-de619110', 'ZA7574'], 'normal_score': [-43.70560709101471, -48.27494507223969, -63.916460561092165], 'jm': ['ZA6702', 'ZA7573', 'ZA6597'], 'jm_score': [86.62126814656934, 83.37498113198336, 79.18381749171365], 'dir': ['datasearch-httpwww-da-ra-deoaip--oaioai-da-ra-de621835', 'datasearch-httpwww-da-ra-deoaip--oaioai-da-ra-de619110', 'ZA7574'], 'dir_score': [-318.20031215613153, -351.1137517044994, -394.2475009517014]}
