## Human Biology is frequently being incorrectly categorized by ChatGPT

To address this issue, we will do some clustering/grouping to see if we can develop heuristics to minimize this error

1. Identify all records with topicCategory = human biology
2. Pull all topicCategories for those records
3. Create frequency tables to see how Human biology clusters with other topics

In [13]:
import os
import requests
import json
import pandas as pd
import time
import math

In [None]:
import matplotlib
import seaborn as sns

In [2]:
script_path = os.getcwd()
data_path = os.path.join(script_path,'data')

In [4]:
query_url = "https://api-staging.data.niaid.nih.gov/v1/query?&q=topicCategory.name%3A%22Human+biology%22&fields=_id,topicCategory&fetch_all=True"
r = requests.get(query_url)
cleanr = json.loads(r.text)
hits = cleanr['hits']
#print(len(cleanr['hits']))
df1 = pd.DataFrame(cleanr['hits'])
scroll_id = cleanr['_scroll_id']
total_hits = cleanr['total']
print(total_hits)

5611


In [7]:
%%time
## Scroll to get all the results

i = 0
#k = 3 
k = math.ceil(total_hits/1000)
while i < k:
    #r2 = requests.get(f'https://api.data.niaid.nih.gov/v1/query?scroll_id={scroll_id}')
    r2 = requests.get(f'https://api-staging.data.niaid.nih.gov/v1/query?scroll_id={scroll_id}')
    tmp = json.loads(r2.text)
    scroll_id = tmp['_scroll_id']
    tmpdf = pd.DataFrame(tmp['hits'])
    df1 = pd.concat((df1,tmpdf),ignore_index=True)
    #print(len(df1))
    i = i+1
    time.sleep(0.25)

CPU times: total: 594 ms
Wall time: 25.6 s


In [9]:
print(df1.head(n=2))

print(df1.iloc[0]['topicCategory'][0])

                  _id    _score  \
0  ncbi_sra_srp377201  9.154606   
1  ncbi_sra_srp095966  9.154606   

                                       topicCategory       _ignored  
0  [{'curatedBy': {'name': 'gpt-3.5-turbo', 'url'...            NaN  
1  [{'curatedBy': {'name': 'gpt-3.5-turbo', 'url'...  [all.keyword]  
{'curatedBy': {'name': 'gpt-3.5-turbo', 'url': 'https://openai.com/index/chatgpt'}, 'fromGPT': True, 'identifier': 'topic_3301', 'inDefinedTermSet': 'EDAM', 'name': 'Microbiology', 'url': 'http://edamontology.org/topic_3301'}


In [76]:
def pull_topic_names(topic_array):
    topiclist = []
    for eachtopic in topic_array:
        if eachtopic['name']!= "Human biology":
            topiclist.append(eachtopic['name'])
    topiclist.sort()
    return topiclist

df2 = df1[['_id','topicCategory']].copy()        

In [77]:
df2['topics'] = df1.apply(lambda row: pull_topic_names(row['topicCategory']),axis=1)
df2['term'] = df2['topics']
df2['topic_hash'] = [' | '.join(x) for x in df2['topics']]
df2.drop(columns='topicCategory',inplace=True)
df3 = df2.explode('term')
df3_clean = df3.drop_duplicates(['_id','term'],keep='first')
print(len(df3))
print(df3.head(n=2))

6611
                  _id                             topics               term  \
0  ncbi_sra_srp377201  [Microbiology, Molecular biology]       Microbiology   
0  ncbi_sra_srp377201  [Microbiology, Molecular biology]  Molecular biology   

                         topic_hash  
0  Microbiology | Molecular biology  
0  Microbiology | Molecular biology  


In [78]:
df4 = df3_clean.groupby('topic_hash').size().reset_index(name="records")
df4.sort_values('records',ascending=False,inplace=True)
print(df4)
df4.to_csv(os.path.join(data_path,'topic_hash.tsv'),sep='\t',header=True)

                      topic_hash  records
818  Metagenomics | Microbiology      180
317      Cell biology | Genetics      112
671      Genomics | Microbiology       98
625          Genetics | Genomics       96
636      Genetics | Microbiology       94
..                           ...      ...
619            Genetic variation        1
972        Regenerative medicine        1
256                Biotechnology        1
839            Molecular biology        1
927                 Pharmacology        1

[999 rows x 2 columns]


In [68]:
biodiversity = df2.loc[df2['topics'].astype(str).str.contains('Biodiversity')]
print(biodiversity.tail(n=20))

                  _id                                             topics  \
103   dryad_1rn8pk0pd  [Biodiversity, Ecology, Environmental sciences...   
271       prjna374136                            [Biodiversity, Zoology]   
615   egas00001005511  [Biodiversity, Biomarkers, Comparative genomic...   
738   dryad_zpc866t8b             [Biodiversity, Environmental sciences]   
855         prjeb2785                         [Biodiversity, Immunology]   
2316      prjna662142                      [Biodiversity, Plant biology]   
2527      dryad_63h33             [Biodiversity, Environmental sciences]   
2855  dryad_47d7wm3bt                            [Biodiversity, Ecology]   
2961    dryad_2864hc8                            [Biodiversity, Ecology]   

                                                   term  \
103   [Biodiversity, Ecology, Environmental sciences...   
271                             [Biodiversity, Zoology]   
615   [Biodiversity, Biomarkers, Comparative genomic...   
738

In [61]:
nah = df3.loc[df3['term'].isna()]
print(nah.head(n=20))

                                         _id topics term topic_hash
63                       mendeley_nzshbjg8h2     []  NaN           
72                       mendeley_yvsz3xc7hs     []  NaN           
75                       mendeley_zr6xj4jmxw     []  NaN           
100                      mendeley_rgbh3ttfzn     []  NaN           
149  hubmap_f873b94dc8554ce2d013ecd570f1d8b9     []  NaN           
229  hubmap_dc289471333309925e46ceb9bafafaf4     []  NaN           
233             dataverse_10.7910_dvn_io89oi     []  NaN           
235             dataverse_10.7910_dvn_meudnp     []  NaN           
427                            s-epmc3819378     []  NaN           
553                      mendeley_vgyjc23rnw     []  NaN           
610                      mendeley_csgs546vxy     []  NaN           
612                      mendeley_v7j8ynryj6     []  NaN           
690                            s-epmc2851913     []  NaN           
724             dataverse_10.7910_dvn_pj7uaj    