In [76]:
import pandas as pd
import numpy as np
from pathlib import Path
from ast import literal_eval

In [45]:
# Columns used together as the group-by key for the per-group statistics below
group_keys = [
    "query",
    "search_type",
    "page",
]

In [46]:
# Read the PMID data CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/out/pmid_data.csv')

In [47]:
# Restrict the frame to the grouping columns plus the raw reference strings
df = df[[*group_keys, 'references']]
df

Unnamed: 0,query,search_type,page,references
0,Moyamoya,relevance,1,"[11390315, 10724101, 6823678, 11197712, 102350..."
1,Moyamoya,relevance,1,"[22020027, 1282678, 5554209, 9409399, 23445954..."
2,Moyamoya,relevance,1,"[22020027, 18066556, 17138018, 11390315, 23445..."
3,Moyamoya,relevance,1,"[7221855, 16788009, 20495424, 23909250, 161318..."
4,Moyamoya,relevance,1,"[6823678, 26756907, 18625642, 9409432, 1106308..."
...,...,...,...,...
7201,literacy skills,pubdate_desc,2,"[27436235, 16338915, 28367762, 30146073, 98035..."
7202,literacy skills,pubdate_desc,2,[]
7203,literacy skills,pubdate_desc,2,[]
7204,literacy skills,pubdate_desc,2,[]


In [49]:
# Drop missing values, drop those without any references.
# `references` holds the string form of a list, so an empty list is the
# two-character string '[]'; anything longer has at least one entry.
df = df.dropna(subset=['references'])
# .copy() makes the filtered frame independent of its parent so that adding
# columns to it later does not raise SettingWithCopyWarning.
df = df[df['references'].str.len() > len('[]')].copy()

In [55]:
# Row counts per (page, search_type) after filtering, to confirm every
# combination still has enough rows to compute statistics on
df.groupby(by=['page', 'search_type']).size()

page  search_type 
1     pubdate_desc     970
      relevance       1567
2     pubdate_desc    1053
      relevance       1553
dtype: int64

In [63]:
# Convert references from their string form to a set of ids.
# literal_eval parses the stored list literal; wrapping it in set() drops
# duplicates and enables the set operations used for the overlap computation.
# assign() returns a new frame instead of writing a column into a filtered
# frame in place, which avoids pandas' SettingWithCopyWarning.
df = df.assign(refs=df['references'].apply(lambda x: set(literal_eval(x))))
df.refs

0       {11037190, 15528455, 10795533, 15362573, 16645...
1       {22870528, 15528455, 27180559, 11914260, 55542...
2       {11037190, 15362573, 16302616, 24201757, 33515...
3       {23909250, 22871684, 22149381, 26645256, 24605...
4       {22870528, 18787200, 24595588, 18463369, 18232...
                              ...                        
7196    {28706433, 19691970, 27617354, 26346443, 97294...
7197                       {23869633, 30811884, 29034727}
7198    {19481221, 19608198, 9450374, 25462034, 984911...
7199    {23343756, 20423439, 18184597, 25993372, 21288...
7201    {30851841, 9803527, 12584072, 9282700, 2836776...
Name: refs, Length: 5143, dtype: object

In [77]:
# For each combination of group_keys, compute the mean set overlap of references
def mean_pairwise_jaccard(ref_sets):
    """Return the mean Jaccard index over every unordered pair of sets.

    The Jaccard index of two sets is ``|A & B| / |A | B|``.

    Parameters
    ----------
    ref_sets : iterable of set
        The sets to compare pairwise.

    Returns
    -------
    float
        Mean of the pairwise Jaccard indices, or ``nan`` when there is no
        pair to score (fewer than two sets, or every pair has an empty union).
    """
    # Materialise once so each set is fetched a single time rather than being
    # looked up again on every inner-loop iteration.
    ref_sets = list(ref_sets)
    scores = []
    for i, first in enumerate(ref_sets):
        for second in ref_sets[i + 1:]:
            shared = len(first & second)
            union_size = len(first) + len(second) - shared
            # Two empty sets have an undefined Jaccard index; skip the pair
            # instead of dividing by zero.
            if union_size:
                scores.append(shared / union_size)
    # Return nan explicitly so groups with no scorable pair do not trigger
    # numpy's "mean of empty slice" RuntimeWarning.
    return float(np.mean(scores)) if scores else float("nan")


data = []
for keys, dx in df.groupby(group_keys):
    row = dict(zip(group_keys, keys))
    # NOTE: despite the column name, this value is a mean *overlap*:
    # higher means the reference sets in the group are more alike.
    row['reference_diversity'] = mean_pairwise_jaccard(dx['refs'])
    data.append(row)

In [82]:
# One row per group; discard groups whose score is missing (NaN)
g = pd.DataFrame(data)
g = g.dropna(axis=0, how='any')
g

Unnamed: 0,query,search_type,page,reference_diversity
0,AMPK signalling pathway,pubdate_desc,1,0.000689
1,AMPK signalling pathway,pubdate_desc,2,0.001389
2,AMPK signalling pathway,relevance,1,0.028141
3,AMPK signalling pathway,relevance,2,0.010498
4,Acalabrutinib,pubdate_desc,1,0.032724
...,...,...,...,...
700,public transportation,relevance,1,0.003203
701,public transportation,relevance,2,0.000794
703,structural racism,pubdate_desc,2,0.002149
704,structural racism,relevance,1,0.017225


In [85]:
# Sanity check: average the per-group score within each (search_type, page)
# to see whether the values differ (looks like there might be something!)
g.groupby(['search_type', 'page'])['reference_diversity'].mean()

search_type   page
pubdate_desc  1       0.006810
              2       0.006980
relevance     1       0.025754
              2       0.017880
Name: reference_diversity, dtype: float64

In [44]:
# Save to feature set
# Write the per-group table `g` built above. The previous version referenced
# `apt_dx`, which is never defined in this notebook and fails with NameError
# on a fresh kernel.
# NOTE(review): the old target "RCR.csv" looked like a leftover from another
# notebook; the file name below is chosen to match the `reference_diversity`
# column -- confirm it against whatever consumes ../data/features/.
save_dest = Path("../data/features/")
# Create the destination directory if needed so the write cannot fail on a clean checkout
save_dest.mkdir(parents=True, exist_ok=True)
g.to_csv(save_dest / "reference_diversity.csv")