# Clean references

- For each document, we want to extract the journal cited in each reference together with the publishing year of the cited article. If we are dealing with Web of Science data, we match the journal name against the WOS master journal list present in the folder 'Parper/Data'. This allows us to add the subject category of the journal. 

- First, import the modules and the MongoDB configuration

In [1]:
import os 
# Move the working directory two levels up so that the relative references
# used below ("mongo_config.yaml", the local `package` import) resolve from
# there -- presumably the repository root; confirm against the notebook's
# location.
os.chdir('../../')

import yaml
import numpy as np
from joblib import Parallel, delayed
import multiprocessing
# Number of joblib workers (passed as n_jobs) for the parallel runs below.
num_cores = 50

In [5]:
from package import Reference_cleaner
import tqdm

- Create a function to update MongoDB yearly. This function extracts the journal and the year of publication for each reference in a document.

- ```IS_WOS``` selects whether we are dealing with Web of Science documents or Pubmed Knowledge Graph documents.

In [3]:
def clean_ref_update_mongo(IS_WOS, skip_n, batch_size=100000):
    """Clean the references of one slice of documents and store the result.

    Reads the ``PC_BETA`` section of ``mongo_config.yaml``, opens the
    configured collection through ``Reference_cleaner`` and, for every
    document of the slice that has a reference field, writes the dictionary
    returned by ``get_item_year_cat(<references>, 'reference')`` back onto
    that document with a ``$set`` update.

    Parameters
    ----------
    IS_WOS : bool
        When true, the Web of Science collection is used: the raw cited
        references and the WoS journal list are loaded first and the
        reference field is ``'CR'``. Otherwise the ``pkg`` collection is used
        and the reference field is ``'a14_referencelist'``.
    skip_n : int
        Number of matching documents to skip before this slice starts.
    batch_size : int, optional
        Maximum number of documents processed by this call (default 100000,
        the slice size used by the parallel driver).

    Returns
    -------
    None
        The function only has side effects on the collection.
    """
    with open("mongo_config.yaml", "r") as infile:
        pars = yaml.safe_load(infile)['PC_BETA']

    if IS_WOS:
        data = Reference_cleaner(pars['client_name'],
                                 pars['db_name'],
                                 pars['wos']['collection_name'],
                                 IS_WOS)

        # Load the raw cited-reference files into the collection.
        PATH = '/home/peltouz/Documents/GitHub/New-novelty-indicator-using-graph-theory-framework/Data sample/raxdata/'
        data.wos_cr2mongo(PATH)

        # Load the WoS journal list used to match journal names.
        PATH = '/home/peltouz/Documents/GitHub/New-novelty-indicator-using-graph-theory-framework/Data/Wos_j_list/'
        data.get_wos_J_list(PATH)

        cr_var = 'CR'
        pmid_var = pars['wos']['pmid']

    else:
        data = Reference_cleaner(pars['client_name'],
                                 pars['db_name'],
                                 pars['pkg']['collection_name'],
                                 IS_WOS)

        cr_var = 'a14_referencelist'
        pmid_var = pars['pkg']['pmid']

    # Without an explicit sort the order of the returned documents is not
    # guaranteed, so the skip/limit windows given to parallel workers could
    # overlap or leave gaps (all the more so while documents are being
    # updated). Sorting on the unique, indexed `_id` makes the windows a
    # stable partition.
    # NOTE(review): assumes `data.collection` is a pymongo Collection.
    docs = (
        data.collection
        .find({cr_var: {'$exists': True}})
        .sort('_id', 1)
        .skip(skip_n)
        .limit(batch_size)
    )

    for doc in docs:
        j_dict = data.get_item_year_cat(doc[cr_var], 'reference')
        query = {pmid_var: doc[pmid_var]}
        newvalues = {"$set": j_dict}
        data.collection.update_one(query, newvalues)


- Clean up the references in parallel

In [7]:
# Start offsets of the successive 100000-document slices, one per task.
# (5288086 is presumably the number of documents to cover -- confirm.)
skip_n = range(0, 5288086, 100000)
# One flag per slice; False sends every task down the non-WoS branch of
# clean_ref_update_mongo.
IS_WOS = np.repeat(False, len(skip_n))

# Fan the slices out over `num_cores` joblib workers.
Parallel(n_jobs=num_cores)(
    delayed(clean_ref_update_mongo)(wos_flag, offset)
    for wos_flag, offset in zip(IS_WOS, skip_n)
)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

- Plot the distribution of the share of references captured in a document (Only relevant for WoS)

In [None]:
# seaborn is not imported by any earlier cell, so bring it into scope here.
import seaborn as sns

# `data` only exists as a local variable inside the update functions, so
# reconnect to the WoS collection here (same arguments as the WoS branch of
# clean_ref_update_mongo).
with open("mongo_config.yaml", "r") as infile:
    pars = yaml.safe_load(infile)['PC_BETA']
data = Reference_cleaner(pars['client_name'],
                         pars['db_name'],
                         pars['wos']['collection_name'],
                         True)

# Only the plotted field is fetched from each matching document.
share_ref = [
    doc['share_ref_captured']
    for doc in tqdm.tqdm(
        data.collection.find({"share_ref_captured": {"$exists": True}},
                             {"share_ref_captured": 1})
    )
]

# displot is a figure-level function that creates its own figure, so the
# former plt.figure() call (which only left an empty figure behind) is gone.
fig = sns.displot(share_ref, kde=True).set(title='Share of References in WOS Journal Citation Reports').fig
fig.savefig('/home/peltouz/Documents/GitHub/New-novelty-indicator-using-graph-theory-framework/Figures/'+'wos_share_ref_captured.png')

# Clean Keywords

- MeSH terms are already cleaned; we just need to restructure the variable so that it fits the structure used in the section above. Here we will integrate the first year of appearance of the MeSH term and the first two elements of the TreeNumber (the top two levels of the MeSH terms classification hierarchy, following Uddin and Khan, 2016)

In [6]:
def update_keyword_mongo(skip_n, batch_size=100000):
    """Restructure the MeSH headings of one slice of documents and store the result.

    Reads the ``PC_BETA`` section of ``mongo_config.yaml``, opens the ``pkg``
    collection through ``Reference_cleaner`` and, for every document of the
    slice that has an ``a06_meshheadinglist`` field, writes the dictionary
    returned by ``get_item_year_cat(<headings>, 'keyword')`` back onto that
    document with a ``$set`` update.

    Processing is best effort: a document that raises is skipped, but the
    failure is logged instead of being silently discarded.

    Parameters
    ----------
    skip_n : int
        Number of matching documents to skip before this slice starts.
    batch_size : int, optional
        Maximum number of documents processed by this call (default 100000,
        the slice size used by the parallel driver).

    Returns
    -------
    None
        The function only has side effects on the collection.
    """
    import logging

    logger = logging.getLogger(__name__)

    with open("mongo_config.yaml", "r") as infile:
        pars = yaml.safe_load(infile)['PC_BETA']

    data = Reference_cleaner(pars['client_name'],
                             pars['db_name'],
                             pars['pkg']['collection_name'],
                             False)

    k_var = 'a06_meshheadinglist'
    pmid_var = pars['pkg']['pmid']

    # Without an explicit sort the order of the returned documents is not
    # guaranteed, so the skip/limit windows given to parallel workers could
    # overlap or leave gaps. Sorting on the unique, indexed `_id` makes the
    # windows a stable partition.
    # NOTE(review): assumes `data.collection` is a pymongo Collection.
    docs = (
        data.collection
        .find({k_var: {'$exists': True}})
        .sort('_id', 1)
        .skip(skip_n)
        .limit(batch_size)
    )

    failures = 0
    for doc in docs:
        try:
            j_dict = data.get_item_year_cat(doc[k_var], 'keyword')
            query = {pmid_var: doc[pmid_var]}
            newvalues = {"$set": j_dict}
            data.collection.update_one(query, newvalues)
        except Exception:
            # Keep going as before, but leave a trace. `except Exception`
            # (unlike a bare except) lets KeyboardInterrupt/SystemExit through.
            failures += 1
            logger.debug("update_keyword_mongo: skipped document %r",
                         doc.get('_id'), exc_info=True)

    if failures:
        logger.warning(
            "update_keyword_mongo: %d document(s) skipped in slice starting at %d",
            failures, skip_n)

In [7]:
# Start offsets of the successive 100000-document slices, one per task.
# (25044330 is presumably the number of documents to cover -- confirm.)
skip_n = range(0, 25044330, 100000)

# Fan the slices out over `num_cores` joblib workers.
Parallel(n_jobs=num_cores)(
    delayed(update_keyword_mongo)(offset) for offset in skip_n
)


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,