In [1]:
import sys
!{sys.executable} -m pip install ipython python-dotenv requests

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
from IPython.display import Image
from dotenv import load_dotenv
import urllib.parse
import requests
import json
import csv
import os
import pandas as pd

## API Tutorial
This document has two aims:

- (1) To show you the process of registering and using an API application from Web of Science / Clarivate

- (2) Utilize a script written by the author to efficiently retrieve results from multiple pages.

### Create an Account or Log In
The first step is to register or log into https://developer.clarivate.com/ .

![Step 1](./tutorial_screenshots/Screenshot%202025-09-03%20at%203.40.48 PM.png)

### Register the Application Step 1
After you will register for an API application. Please click "Register".

![Step 2](./tutorial_screenshots/Screenshot%202025-09-03%20at%203.41.46 PM.png)

### Register a New Application Step 2
Then click on "Register a new Application" ...

![Step 3](./tutorial_screenshots/Screenshot%202025-09-03%20at%203.43.05 PM.png)

... and fill out the form.

![Step 4](./tutorial_screenshots/Screenshot%202025-09-03%20at%203.43.45 PM.png)

### Approval
Once the registration is approved you can click on the Name link of your application as seen below.

![Step 5](./tutorial_screenshots/Screenshot%202025-09-03%20at%203.44.04 PM.png)

Then click on the name if the API.

![Step 6](./tutorial_screenshots/Screenshot%202025-09-03%20at%203.44.37 PM.png)

This will take you to a window where you can click the "Try It" button and you can then see the special UI for your registered application.

![Step 7](./tutorial_screenshots/Screenshot%202025-09-03%20at%203.45.19 PM.png)

The rest is play around and learn! This interface allows you to use the AP without code. It also shows you all teh advance search options and even allows you to perform a test search! This test search will tell you the curl command and show you the example output.

However, it is not the most efficient way to use the API. Below is a notebook that will help you get more results faster. It is written in python and utilizes requests in place of curl. This helps keep it all in one language! Just remember to not exceed the limit of your subscription!

### Script Step 1
This is where you will place your information such as API key, Database, and search phrase. All the abriviations are seen and explained in your individual APU UI. Please refer there.

In [3]:
# Load environment variables from .env file
from dotenv import load_dotenv, find_dotenv
import os
print("find_dotenv() ->", find_dotenv())   # path dotenv will use when load_dotenv() is called without args
load_dotenv()  
# Show both casings and raw os.environ entry
print("os.environ.get('page_max') =", os.environ.get('page_max'))
print("os.environ.get('PAGE_MAX') =", os.environ.get('PAGE_MAX'))
# Also show if a python variable by that name exists in the kernel
print("globals().get('page_max') =", globals().get('page_max'))
# Use the environment variable, but fall back to a placeholder if it's not set.
# Replace 'YOUR_API_KEY' with your actual key if you are not using a .env file.
API_KEY = os.environ.get('API_KEY', 'YOUR_API_KEY')
database_code = os.environ.get('database_code')
page_limit = int(os.environ.get('page_limit', 50))
page_max = int(os.environ.get('page_max', 10))
sort_field = os.environ.get('sort_field')
ascending = os.environ.get('ascending')
base_url = os.environ.get('base_url')
search_phrase = os.environ.get('search_phrase')

print(search_phrase)
print(page_max)
print(API_KEY)

find_dotenv() -> /Users/gerardnasseruncc/Desktop/All/Github/Project_Repositories/Prisma_Compliance_Resource/APIs/Web_of_Science_API_Resources_and_Tutorials/.env
os.environ.get('page_max') = 10
os.environ.get('PAGE_MAX') = None
globals().get('page_max') = None
(indoor spaces OR indoor environment OR indoor microbiome OR built environment OR office OR home OR apartment OR classroom OR building) AND (plant wall OR green wall OR phytoremediation system OR vertical planting OR indoor plant OR potted plant OR house plant OR ornamental plants OR plant intervention OR soil intervention OR hydroponic) AND (skin microbiome OR skin samples OR soil microbiome OR surface microbiome OR surface samples OR dust OR dust samples OR bioaerosols OR indoor air OR rhizosphere microbiome OR biofilter) AND (16s gene OR 16S rrna OR 16s sequencing OR 16s amplification OR 16s data OR 16s analysis OR 16s rna OR shotgun sequencing OR bacterial characterization OR bacterial composition OR bacterial analysis OR micr

In [4]:
# This is a definition used to outline a repeatable task.
# This function creates an encoded query string for a search phrase.
def create_encoded_query(phrase):
    """Encodes a string for use in a URL."""
    return urllib.parse.quote(phrase)


### Script Step 2
This step will build your url phrase that will be added to the base url. This creates the full request that will then be sent to the API.

In [5]:
# Adjust this line to adjust your search fields.
search_phrase_with_tag = f'TS=({search_phrase})'
print(search_phrase_with_tag)

params = {
    'db': database_code,
    'q': search_phrase_with_tag,
    'limit': page_limit
}

if ascending == 'true':
    params['sortField'] = f'{sort_field} A' # Note: API docs use a space not a + like in the UI.
elif ascending == 'false':
    params['sortField'] = f'{sort_field} D'
else:
    params['sortField'] = sort_field


TS=((indoor spaces OR indoor environment OR indoor microbiome OR built environment OR office OR home OR apartment OR classroom OR building) AND (plant wall OR green wall OR phytoremediation system OR vertical planting OR indoor plant OR potted plant OR house plant OR ornamental plants OR plant intervention OR soil intervention OR hydroponic) AND (skin microbiome OR skin samples OR soil microbiome OR surface microbiome OR surface samples OR dust OR dust samples OR bioaerosols OR indoor air OR rhizosphere microbiome OR biofilter) AND (16s gene OR 16S rrna OR 16s sequencing OR 16s amplification OR 16s data OR 16s analysis OR 16s rna OR shotgun sequencing OR bacterial characterization OR bacterial composition OR bacterial analysis OR microbiome analysis OR microbial diversity OR microbiota OR metagenomics))


In [6]:
headers = {
    'accept': 'application/json',
    'X-ApiKey': API_KEY
}

all_results = []

### Script Step 3
This section does the actual search, retrieving results from pages 1-X, where X is the page limit you defined earlier.

In [9]:
import time

if not base_url:
    print("Error: 'base_url' is not set. Please check your .env file or environment variables.")
else:
    for i in range(1, page_max + 1):
        print(f"Requesting page {i}/{page_max}...")
        # Set the page for the current iteration
        params['page'] = i

        try:
            # Make the API call with a timeout and raise for status to catch HTTP errors
            response = requests.get(base_url, headers=headers, params=params, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"  ... Request error on page {i}: {e}")
            break

        print(response.url)  # Print the full URL for debugging

        try:
            data = response.json()
        except ValueError:
            print(f"  ... Failed to decode JSON on page {i}. Response text:\n{response.text}")
            break

        # The list of documents is in the 'hits' key of the JSON response
        page_hits = data.get('hits', [])
        if page_hits:
            # Add the results from this page to our master list
            all_results.extend(page_hits)
            print(f"  ... Success! Added {len(page_hits)} results.")
        else:
            # Stop if a page returns no results
            print("  ... No more results found. Stopping.")
            break

        # small delay to avoid hitting rate limits
        time.sleep(0.2)

Requesting page 1/10...
  ... Request error on page 1: 400 Client Error: Bad Request for url: https://api.clarivate.com/apis/wos-starter/v1/documents?db=WOk&q=TS%3D%28%28indoor+spaces+OR+indoor+environment+OR+indoor+microbiome+OR+built+environment+OR+office+OR+home+OR+apartment+OR+classroom+OR+building%29+AND+%28plant+wall+OR+green+wall+OR+phytoremediation+system+OR+vertical+planting+OR+indoor+plant+OR+potted+plant+OR+house+plant+OR+ornamental+plants+OR+plant+intervention+OR+soil+intervention+OR+hydroponic%29+AND+%28skin+microbiome+OR+skin+samples+OR+soil+microbiome+OR+surface+microbiome+OR+surface+samples+OR+dust+OR+dust+samples+OR+bioaerosols+OR+indoor+air+OR+rhizosphere+microbiome+OR+biofilter%29+AND+%2816s+gene+OR+16S+rrna+OR+16s+sequencing+OR+16s+amplification+OR+16s+data+OR+16s+analysis+OR+16s+rna+OR+shotgun+sequencing+OR+bacterial+characterization+OR+bacterial+composition+OR+bacterial+analysis+OR+microbiome+analysis+OR+microbial+diversity+OR+microbiota+OR+metagenomics%29%29&limi

### Script Step 4
Finally, the list of JSONs is exported into a more human readable format, a csv. For down stream use.

In [8]:
if all_results:
    publication_list = []

    # Begin Loop
    for record in all_results:
        # Create Empty Dict
        processed_record = {}

        # Get UID
        processed_record['UID'] = record.get('uid', 'N/A')

        # Get Nested Identifier Information
        identifiers_info = record.get('identifiers', {})
        processed_record['DOI'] = identifiers_info.get('doi', 'N/A')

        # Get Title
        processed_record['Title'] = record.get('title', 'N/A').lower()

        # Get the Data That is Nested
        source_info = record.get('source', {}) # Get source, or an empty dict if it's missing
        processed_record['Year'] = source_info.get('publishYear', 'N/A')
        processed_record['Journal'] = source_info.get('sourceTitle', 'N/A')

        # Get Processed Author Data
        names_info = record.get('names', {})
        authors_list = names_info.get('authors', []) # Get authors list, or an empty list
        if authors_list:
            # Create a list of just the author names
            author_names = [author.get('displayName') for author in authors_list]
            # Join them into a single string separated by a semicolon
            processed_record['Authors'] = '; '.join(author_names)
        else:
            processed_record['Authors'] = 'N/A'

        publication_list.append(processed_record)
    publication_dataframe = pd.DataFrame(publication_list)
    publication_dataframe.to_csv('wos_processed_results.csv', index=False)
    print(f"Data written to wos_processed_results.csv")
    print(pd.Timestamp.now())