In [None]:
import csv
import os
import time
from datetime import datetime

import pandas as pd
import requests

from cellxgene_mods import CxG_API
from pub_check import *


# Configure the API client before any collection lookups are made.
CxG_API.config()

# Collections returned by the default get_collections() call, plus flat lists
# of their dataset and collection ids.
pub_collections = CxG_API.get_collections()
pub_dataset_ids = [d['dataset_id'] for c in pub_collections for d in c['datasets']]
pub_collection_ids = [c['collection_id'] for c in pub_collections]

# Same lookups for collections requested with visibility='PRIVATE'.
priv_collections = CxG_API.get_collections(visibility='PRIVATE')
priv_dataset_ids = [d['dataset_id'] for c in priv_collections for d in c['datasets']]
priv_collection_ids = [c['collection_id'] for c in priv_collections]

# Private collections that carry no truthy 'revision_of' value
# (presumably: brand-new collections rather than revisions of public ones).
nonrev_priv_collections = [c for c in priv_collections if not c.get('revision_of')]

# dataset_id -> collection_id across both lists; when an id occurs in both,
# the private entry wins because it is iterated last.
ds_coll = {d['dataset_id']:c['collection_id'] for c in pub_collections + priv_collections for d in c['datasets']}

# Directory that reports are read from / written to. Override it by setting the
# CXG_WRK_DIR environment variable; the default keeps the original location.
# os.path.join(..., '') guarantees the trailing separator that the
# f'{wrk_dir}filename' usages elsewhere rely on.
wrk_dir = os.path.join(os.environ.get('CXG_WRK_DIR', '/Users/jason/Downloads/'), '')

# DOI review
**check for preprints that have been published**

In [None]:
# Run doi_checker on every Collection whose publisher metadata names a
# preprint server as the journal, then show the rows flagged 'invalid DOI'.
pub_info = []
checked_dois = set()
for c in pub_collections + priv_collections:
    # publisher_metadata may be missing/empty and may lack a 'journal' entry,
    # so read it defensively instead of indexing straight into it.
    journal = (c.get('publisher_metadata') or {}).get('journal')
    if journal not in ['bioRxiv', 'medRxiv']:
        continue
    doi = c['doi']
    # The combined public + private list can contain the same DOI more than
    # once; look each DOI up only once to avoid redundant calls and rows.
    if doi in checked_dois:
        continue
    checked_dois.add(doi)
    pub_info.append(doi_checker(doi))
pub_df = pd.DataFrame(pub_info)
# The 'invalid DOI' column only exists if at least one result reported it.
if 'invalid DOI' in pub_df.columns:
    display(pub_df[pub_df['invalid DOI'].notna()])

**check for Collections that might now have DOIs**

In [None]:
# Only Collections without a DOI are worth searching for.
collections_without_doi = [coll for coll in pub_collections if not coll.get('doi')]

for coll in collections_without_doi:
    coll_id = coll['collection_id']
    found = pubtator_search(coll_id)
    if found:
        print(coll_id, ','.join(found))
    # Pause between consecutive searches.
    time.sleep(0.5)

# long-term private Collections
**pull private Collections older than a specified cut-off**

In [None]:
# Write a CSV of non-revision private Collections whose creation date is more
# than `year_cutoff` years in the past, oldest first.
year_cutoff = 1.5

day_cutoff = 365.25 * year_cutoff
today = datetime.today()

sorted_collections = sorted(nonrev_priv_collections, key=lambda c: c['created_at'])

# `with` closes the file even if a row fails to write; newline='' is what the
# csv module requires so the writer alone controls line endings.
with open(f'{wrk_dir}long_private_collections.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([
        'collection',
        'name',
        'doi',
        'contact name',
        'contact email',
        'created at',
        'number of datasets'
    ])

    for collection in sorted_collections:
        # Only the date part (before the 'T') of created_at is used for the age.
        created = datetime.strptime(collection['created_at'].split('T')[0], '%Y-%m-%d')
        gap = (today - created).days

        if gap > day_cutoff:
            writer.writerow([
                collection['collection_url'],
                collection['name'],
                # Other cells treat 'doi' as optional, so do not assume the key
                # exists; a missing value is written as an empty field.
                collection.get('doi'),
                collection['contact_name'],
                collection['contact_email'],
                collection['created_at'],
                len(collection['datasets'])
            ])

# private URLs made public
**look for private Collection URLs in PubMed**

In [None]:
# Collect one record per private collection id for which pubtator_search
# returns anything; the results are comma-joined under the 'source' key.
pubtator_res = []
for collection_id in priv_collection_ids:
    hits = pubtator_search(collection_id)
    if hits:
        pubtator_res.append(dict(collection_id=collection_id, source=','.join(hits)))
    # Pause between consecutive searches.
    time.sleep(0.5)

**filter AirTable report for possible private URL sharing**

In [None]:
# Reduce the AirTable export to visits on private Collection / Dataset URLs
# from non-ignored sources, merge in the pubtator search results, and write
# one summary row per Collection.
at_export = 'Sources of private collections_datasets-Grid view.csv'
df = pd.read_csv(f'{wrk_dir}{at_export}')

# The report covers the time period found in the first row of the export.
month = df.loc[0, 'time period']
ignore_sources = ['lattice.atlassian.net','lattice-data.org','Direct / None']
uuid_pattern = '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'

# na=False makes rows with a missing url drop out explicitly instead of
# relying on how NaN propagates through the boolean mask.
df = df[
    (df['time period'] == month) &
    ~df['source'].isin(ignore_sources) &
    df['url'].str.contains(uuid_pattern, na=False)
].drop(columns=['dataset name','time period'])

# .copy() so the new columns are added to independent frames rather than to
# slices of df (avoids SettingWithCopyWarning).
act = df[df['url'].str.startswith('/collections/')].copy()
# '/collections/<id>/...' -> '<id>'
act['collection_id'] = act['url'].str.split('/').str[2]
act = act[act['collection_id'].isin(priv_collection_ids)]

ds_df = df[df['url'].str.startswith('/e/')].copy()
# '/e/<id>.<ext>/...' -> '<id>'
ds_df['dataset_id'] = ds_df['url'].str.split('/').str[2].str.split('.').str[0]
ds_df = ds_df[ds_df['dataset_id'].isin(priv_dataset_ids)].copy()
ds_df['collection_id'] = ds_df['dataset_id'].map(ds_coll)


def _join_unique(values):
    """Comma-join the distinct, non-empty, non-null values in sorted order.

    Dropping empty/null entries before joining prevents stray commas
    (e.g. 'a,,b'), and sorting makes the report reproducible between runs.
    """
    return ','.join(sorted({str(v) for v in values if pd.notna(v) and v != ''}))


act = pd.concat([act, ds_df, pd.DataFrame(pubtator_res)]).fillna({'visitors':0, 'dataset_id': ''})
act = act.groupby('collection_id').agg({
    'source': _join_unique,
    'dataset_id': _join_unique,
    'visitors': 'sum'
}).sort_values('visitors', ascending=False).reset_index()
act['collection_url'] = 'https://cellxgene.cziscience.com/collections/' + act['collection_id'].astype(str)
act = act[['collection_url','source','visitors','dataset_id']]

act.to_csv(f'{wrk_dir}private_URL_{month}.csv', index=False)

# is_primary_data evaluation

# in progress DOIs
**generate a report of DOIs currently in private Collections**

In [None]:
# Dump the distinct DOIs of non-revision private Collections, one per line,
# into a text file stamped with today's date.
date = today.strftime('%Y_%m_%d')
priv_dois = {c['doi'] for c in nonrev_priv_collections if c.get('doi')}
filename = f'{wrk_dir}cxg_private_dois_{date}.txt'
with open(filename, 'w') as file:
    file.writelines(f'{doi}\n' for doi in priv_dois)