In [1]:
import os
import pandas as pd
import numpy as np

In [4]:
# Load the tab-separated PubMed search dataset.
df = pd.read_csv('/data/pubmed-data.tsv', sep='\t')

# One sub-frame per query term, selected with explicit boolean masks.
is_covid = df['query_term'] == 'covid-19'
is_cancer = df['query_term'] == 'cancer'
covid_query = df[is_covid]
cancer_query = df[is_cancer]

In [11]:
def calc_score1_metric(df, results_per_page=10):
    '''
    Function to calculate the average index of each article (where it appeared in the
    search results).

    The index of a single appearance is calculated from the page number and the position
    on the page: (page_num - 1) * results_per_page + position. For example, with 10 results
    per page, if Article A is on Page 1, Position 1 (aka the first result), then its index
    is 1. If Article B is on Page 2, Position 1, then its index is 11.

    :param df: pandas dataframe that contains the searches of a specific query. It must have
               a 'PMID' column (comma-separated string of PMIDs; an entry's place in the
               string is used as its position on the page) and a 'page_num' column.
               For example, if we want to calculate indexes for articles that resulted from
               the "covid-19" query, we would filter our original dataset for "covid-19" and
               apply this function to this filtered dataframe.
    :param results_per_page: number of results on a full page, used to turn
               (page, position) into an overall index. Defaults to 10.
    :return: pmid_score1_info, dictionary with the key being the PMID (string) and the value
             being the average index over every appearance of that PMID.

    '''
    # pmid -> [number of appearances, sum of indexes]
    running_totals = {}
    for pmid_field, page_num in zip(df['PMID'], df['page_num']):
        # Offset contributed by all pages before this one; constant for the whole row.
        page_offset = (page_num - 1) * results_per_page
        # enumerate gives each entry its own position, so a PMID repeated within one row
        # is no longer assigned the position of its first occurrence (as list.index did).
        for location_on_page, pmid in enumerate(pmid_field.split(','), start=1):
            pmid_index = page_offset + location_on_page
            if pmid in running_totals:
                running_totals[pmid][0] += 1
                running_totals[pmid][1] += pmid_index
            else:
                running_totals[pmid] = [1, pmid_index]

    pmid_score1_info = {
        pmid: index_sum / count
        for pmid, (count, index_sum) in running_totals.items()
    }
    return pmid_score1_info

## Example of how to use it

In [13]:
# Score the covid-19 searches: the result is a dict mapping each PMID to its average index.
calc_score1_metric(covid_query)

{'33725432': 1.5,
 '33301246': 1.6923076923076923,
 '33492523': 5.666666666666667,
 '32584423': 3.3333333333333335,
 '33704352': 5.75,
 '33951374': 5.823529411764706,
 '33323690': 4.1,
 '34037666': 8.75,
 '34215210': 9.5,
 '33232588': 12.0,
 '33522478': 3.4788732394366195,
 '33308664': 3.435810810810811,
 '33189872': 1.9263157894736842,
 '33666147': 4.782747603833866,
 '33139420': 3.6859205776173285,
 '32659413': 8.342857142857143,
 '32383182': 7.8545454545454545,
 '33126180': 10.954545454545455,
 '33322035': 10.80188679245283,
 '33301459': 8.977695167286246,
 '35289521': 1.9230769230769231,
 '35289510': 2.923076923076923,
 '35289509': 3.923076923076923,
 '35289493': 18.5,
 '35289488': 5.923076923076923,
 '35289486': 6.923076923076923,
 '35289434': 22.214285714285715,
 '35289365': 9.846153846153847,
 '35289339': 10.833333333333334,
 '35289338': 11.833333333333334,
 '35288778': 76.25,
 '35287944': 103.33333333333333,
 '35287332': 122.0,
 '35287313': 79.0,
 '35287290': 127.33333333333333