In [9]:
import pandas as pd
import json
import random

# Load the source CSV into a DataFrame
raw_data = 'asreview_dataset_relevant_Psychedelic Study.csv'
df = pd.read_csv(filepath_or_buffer=raw_data)



Clean the data and look up the PubMed URL for each DOI

In [18]:
import requests
def get_url(doi: str, timeout: float = 10.0) -> str:
    """Look up the PubMed page URL for a DOI via the NCBI E-utilities esearch API.

    The lookup is best-effort: any failure is printed and reported as ``''`` so
    that applying this function over a whole column never aborts midway.

    Args:
        doi: DOI to search for. Empty, whitespace-only, or non-string values
            (e.g. ``None`` or a float ``NaN``) return ``''`` without making a
            network request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``https://pubmed.ncbi.nlm.nih.gov/<pmid>/`` built from the first PMID in
        the search result, or ``''`` when nothing matched or the lookup failed.
    """
    # Never query PubMed with a missing value (would search for "" or "nan")
    if not isinstance(doi, str):
        return ''
    doi = doi.strip()
    if not doi:
        return ''

    # PubMed API endpoint
    pubmed_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

    # Parameters for the PubMed API request
    params = {
        'db': 'pubmed',
        'term': doi,
        'format': 'json'
    }

    try:
        # Without a timeout a single stalled connection would block forever
        response = requests.get(pubmed_api_url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse JSON response and pull out the list of matching PMIDs
        id_list = response.json()['esearchresult']['idlist']
    except Exception as e:
        # Deliberately broad: network errors, JSON parsing errors and unexpected
        # response shapes are all treated as "no URL found" for this DOI.
        print(f"Error occurred: {e}")
        return ''

    if not id_list:
        return ''

    # Construct the PubMed URL from the first (best) match
    return f'https://pubmed.ncbi.nlm.nih.gov/{id_list[0]}/'


In [26]:
from tqdm import tqdm

# Rename 'abstract' to 'text' and replace NaN with empty strings so the DOI
# lookup below receives '' rather than NaN for missing DOIs.
df = df.rename(columns={'abstract': 'text'}).fillna('')

# Draw a reproducible *random* subsample of up to 150 rows (not the first rows).
# The size is clamped because DataFrame.sample raises ValueError when asked for
# more rows than exist (without replacement).
sample_size = min(150, len(df))
df_subsample = df.sample(sample_size, random_state=1)

tqdm.pandas()  # registers Series.progress_apply (apply with a progress bar)
# One HTTP request per row, so this is the slow step of the notebook
df_subsample['pubmed_url'] = df_subsample['doi'].progress_apply(get_url)
# Single-space placeholder for the 'html' column
df_subsample['html'] = " "
# Write to csv
df_subsample.to_csv('asreview_dataset_cleaned_150.csv', index=False)


100%|██████████| 150/150 [01:21<00:00,  1.84it/s]


Sample 100 examples

In [27]:

from datetime import datetime
# Add additional column html
# Create data json
cleaned_data = 'asreview_dataset_cleaned_150.csv'
# Empty CSV cells are read back as NaN, so replace them with empty strings again
df = pd.read_csv(cleaned_data).fillna('')
# Keep only the records for which a PubMed URL was found
df = df[df['pubmed_url'] != '']

relevant_cols = ['record_id',  'keywords', 'title', 'text', 'html', 'doi', 'pubmed_url', 'secondary_title']
data = df[relevant_cols].to_dict(orient='records')
nr_of_records = len(data)

# Sample at most 100 records: random.sample raises ValueError when asked for
# more items than are available, which happens if too few DOIs resolved.
n = min(100, nr_of_records)
# A dedicated seeded generator makes the export reproducible across runs
# without touching the global `random` state.
random_subset = random.Random(1).sample(data, n)

current_date = datetime.now().strftime('%Y%m%d')
# The file name records the number of examples actually written
output_file = f'../prodigy/input/psychdelic_study_{n}_{current_date}.jsonl'
# Write the random subset as JSON Lines (one record per line)
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(d) + '\n' for d in random_subset)