In [None]:
import numpy as np
import pandas as pd
from public_resources import *

In [None]:
# Query string passed to every search below; also used as the prefix of the output CSV filenames.
search_term = 'multiome'

# Search public resources

In [None]:
# Figshare searches may return too many results; it is unclear how to do
# phrase searching (e.g. "Paired-seq").
# Run figshare_query for the search term and display what it returns.
fshare_df = figshare_query(search_term)
fshare_df

In [None]:
# Run biostudies_query for the search term and display what it returns.
bs_df = biostudies_query(search_term)
bs_df

In [None]:
# Run geo_query for the search term and display what it returns.
geo_df = geo_query(search_term)
geo_df

In [None]:
# Run zenodo_query for the search term and display what it returns.
z_df = zenodo_query(search_term)
z_df

# Merge

In [None]:
# Manual accession -> DOI overrides; consulted when DOIs are resolved further down.
add_dois = {}

# Stack the results of all four searches into one frame.
merged_df = pd.concat([bs_df, geo_df, fshare_df, z_df])

# Report override keys that match no accession in the merged results.
# The accession set is built once rather than once per override key.
known_accs = set(merged_df['accession'].unique())
unfound_accs = [acc for acc in add_dois if acc not in known_accs]
if unfound_accs:
    print('accession not in dataframe')
    print(unfound_accs)

In [None]:
# TESTING PURPOSES
# NOTE(review): this cell rebinds add_dois and merged_df, so when the notebook is
# run top to bottom it replaces the result of the previous cell with a merge that
# omits fshare_df. Skip or remove it for a full run.
add_dois = {}

merged_df = pd.concat([bs_df, geo_df, z_df])

# Report override keys that match no accession in the merged results.
# The accession set is built once rather than once per override key.
known_accs = set(merged_df['accession'].unique())
unfound_accs = [acc for acc in add_dois if acc not in known_accs]
if unfound_accs:
    print('accession not in dataframe')
    print(unfound_accs)

In [None]:
# Apply the manual accession -> DOI overrides; rows without an override keep their DOI.
# Built positionally, so no row-wise apply is needed.
merged_df['doi'] = [
    add_dois.get(acc, doi)
    for acc, doi in zip(merged_df['accession'], merged_df['doi'])
]

# Resolve a DOI from the PMID for rows that still lack one.
# Only rows missing a DOI are filled, so an existing DOI (including a manual
# override from add_dois) is never overwritten by the PMID lookup.
needs_doi = merged_df['doi'].isna() & merged_df['pmid'].notna()
for pmid in merged_df.loc[needs_doi, 'pmid'].unique():
    if pmid:
        merged_df.loc[needs_doi & (merged_df['pmid'] == pmid), 'doi'] = pmid_to_doi(pmid)
merged_df = merged_df.drop(columns='pmid')

# Records that still have no DOI are set aside here and re-attached after the
# per-DOI aggregation.
has_doi = merged_df['doi'].notna() & (merged_df['doi'] != '')
# rename() returns a new frame; renaming the slice in place can trigger
# SettingWithCopyWarning.
merged_df_nodoi = merged_df[~has_doi].rename(columns={'title': 'Title'})
merged_df = merged_df[has_doi]

# Look up publication metadata once per distinct DOI (nulls were filtered out above).
pub_info = [doi_checker(doi) for doi in merged_df['doi'].unique() if doi]

pub_df = pd.DataFrame(pub_info)
# Prefer the 'updated_doi' reported by doi_checker, falling back to the 'DOI' column.
pub_df['doi'] = pub_df['updated_doi'].where(pub_df['updated_doi'].notna(), pub_df['DOI'])
pub_df = pub_df[pub_df['invalid DOI'] != 'yes'].drop(columns=['updated_doi', 'invalid DOI'])

# Left merge keeps every record. The record's own DOI becomes 'doi_x' and the
# resolved DOI from pub_df takes the 'doi' name.
merged_df = merged_df.merge(pub_df, left_on='doi', right_on='DOI', how='left', suffixes=('_x', None))
# Rows with no match in pub_df have a null 'doi'; they fall back to the record's own title.
merged_df['Title'] = merged_df['Title'].where(merged_df['doi'].notna(), merged_df['title'])
merged_df = merged_df.drop(columns=['DOI', 'doi_x'])
# The aggregation iterates over each 'file formats' value; '' iterates as empty where NaN would fail.
merged_df = merged_df.fillna({'file formats': ''})


def join_responsibly(x):
    """Join the distinct non-null values of ``x`` into one comma-separated string.

    Null entries (as judged by ``pd.isnull``) are skipped. Values are sorted so the
    result is the same on every run; joining a bare ``set`` gave an arbitrary order.

    Args:
        x: iterable of strings, possibly containing nulls.

    Returns:
        str: the distinct values joined with ','; '' if there are none.
    """
    return ','.join(sorted({e for e in x if not pd.isnull(e)}))

# Collapse to one row per DOI. groupby() skips rows whose 'doi' is NaN by default.
# NOTE(review): records left without a resolved 'doi' by the publication merge
# are therefore not written to the output. Confirm this is intended.
merged_df = merged_df.groupby('doi').agg({
    # join_responsibly skips nulls, so a missing value no longer raises TypeError in str.join
    'accession': join_responsibly,
    # each 'file formats' value is iterated and its elements pooled; sorted for stable output
    'file formats': lambda x: ','.join(sorted({e for r in x for e in r})),
    'developmental stage': join_responsibly,
    'organism part': join_responsibly,
    'disease': join_responsibly,
    'organism': join_responsibly,
    'Title': join_responsibly,
    'Journal': join_responsibly,
    # values are stringified before joining; sorted for stable output
    'Year': lambda x: ','.join(sorted({str(e) for e in x})),
    'First authors': join_responsibly,
}).reset_index()

# Re-attach the records that have no DOI, then write everything out.
merged_df = pd.concat([merged_df, merged_df_nodoi])

merged_df.to_csv(f'{search_term}_records.csv', index=False)

# Identify publications

In [None]:
# Search for publications matching the term and keep those whose DOI is not
# already present in merged_df.
dois = publication_search(search_term)

# The DOIs already present do not change during the loop, so collect them once.
known_dois = set(merged_df['doi'].unique())
new_pub_info = [doi_checker(doi) for doi in dois if doi and doi not in known_dois]
new_pub_df = pd.DataFrame(new_pub_info)

# With no new publications the frame has no columns, so the column operations
# are skipped instead of raising KeyError.
if not new_pub_df.empty:
    # NOTE(review): this cell expects 'input_doi' and 'doi' keys from doi_checker,
    # while the records cell reads 'DOI' and 'updated_doi'. Rows flagged in
    # 'invalid DOI' are also not filtered out here. Confirm against public_resources.
    new_pub_df = new_pub_df.drop(columns=['input_doi', 'invalid DOI'])
    # Filter again on the 'doi' column of the results, which may differ from the DOI searched.
    new_pub_df = new_pub_df[~new_pub_df['doi'].isin(known_dois)]

all_pubs = pd.concat([merged_df, new_pub_df])

all_pubs.to_csv(f'{search_term}_publications.csv', index=False)
all_pubs

In [None]:
# TODO: add preprint lookup to public_resources.py
# bioRxiv API: https://api.biorxiv.org/