In [1]:
import json
import logging
import xml.etree.ElementTree as ET
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Location of the processed dataset, resolved relative to the parent of the
# current working directory. The path is assembled from components with
# os.path.join (platform separator, no hand-written '/' strings) and
# normpath then collapses the '..' segment.
filepath = os.path.normpath(
    os.path.join(os.getcwd(), os.pardir, 'data', 'processed', 'data.json')
)
filepath

'/home/hazot/code/trec-clinical-trials-2023/data/processed/data.json'

In [3]:
# Load the processed JSON file at `filepath` into a DataFrame.
df = pd.read_json(filepath)

In [4]:
# Sanity check of the load: (rows, columns) followed by the column labels.
print(df.shape)
print(df.columns)

(451538, 17)
Index(['nct_id', 'link_text', 'url', 'id_info', 'brief_title', 'sponsors',
       'brief_summary', 'detailed_description', 'primary_purpose',
       'intervention', 'eligibility', 'gender', 'minimum_age', 'maximum_age',
       'healthy_volunteers', 'keyword', 'condition_browse'],
      dtype='object')


In [7]:
# Tally each distinct `gender` value. value_counts() skips missing values by
# default, so the counts can sum to less than the number of rows.
df['gender'].value_counts()

gender
All       387617
Female     43441
Male       19609
Name: count, dtype: int64

In [8]:
# Tally each distinct `minimum_age` value. The values are text with mixed units
# (e.g. 'Years', 'Days', 'Hours') plus 'N/A', so they need parsing before any
# numeric comparison.
df['minimum_age'].value_counts()

minimum_age
18 Years     279575
N/A           30526
20 Years      17938
40 Years       9334
21 Years       9238
              ...  
84 Days           1
118 Years         1
73 Hours          1
167 Days          1
23 Days           1
Name: count, Length: 318, dtype: int64

In [9]:
# Tally each distinct `maximum_age` value. As with `minimum_age`, the values are
# text with mixed units (e.g. 'Years', 'Months', 'Days', 'Hours') plus 'N/A'.
df['maximum_age'].value_counts()

maximum_age
N/A           214375
65 Years       29033
80 Years       23198
75 Years       22838
70 Years       18586
               ...  
54 Days            1
263 Months         1
93 Days            1
26 Hours           1
37 Days            1
Name: count, Length: 473, dtype: int64

In [10]:
# Tally each distinct `healthy_volunteers` value. value_counts() skips missing
# values by default, so the counts can sum to less than the number of rows.
df['healthy_volunteers'].value_counts()

healthy_volunteers
No                            322154
Accepts Healthy Volunteers    118138
Name: count, dtype: int64

### More complicated cases

In [16]:
# Inspect the raw `intervention` value of the row with index label 0 to see
# how this nested field is structured.
df['intervention'][0]

[{'intervention_type': 'Biological',
  'intervention_name': 'V930',
  'description': 'V930 Over a 94 week duration, patients will receive a series of 5 injections (2.5 mg/injection), one every other week. Within 2 minutes of each injection of V930, each patient will be given an EP-IM injection consisting of two 60 msec pulses.',
  'arm_group_label': '1'},
 {'intervention_type': 'Biological',
  'intervention_name': 'V932',
  'description': 'V932 Over a 94 week duration, patients will receive a series of 5 injections, 6 patients will initially received intramuscular V932 vaccinations at a low dose (0.5x109 vg/injection),and following a safety assessment, up to an additional 35 patients will be treated with the high dose V932 (0.5x1011 vg/injection).',
  'arm_group_label': '2'}]

In [14]:
# List the keys of the `eligibility` entry for the row with index label 0.
df['eligibility'][0].keys()

dict_keys(['criteria', 'gender', 'minimum_age', 'maximum_age', 'healthy_volunteers'])

In [11]:
# Preview the first ten `keyword` values; entries may be a single string,
# a list of strings, or None.
df['keyword'][:10]

0                  Cancers expressing HER-2 and/or CEA
1                                                 None
2    [dexmedetomidine, midazolam, remifentanil, Ele...
3                                                 None
4    [adenocarcinoma of the colon, stage I colon ca...
5                                                 None
6                                                 None
7                                                 None
8    [ADHD, ADD, Attention Deficit Hyperactivity Di...
9     [wound healing, phototoxicity, Healthy Subjects]
Name: keyword, dtype: object

In [12]:
# Preview the first ten `condition_browse` values; entries are either None or
# a dict holding a 'mesh_term' key.
df['condition_browse'][:10]

0                                                 None
1                                                 None
2                                                 None
3    {'mesh_term': 'Attention Deficit Disorder with...
4                   {'mesh_term': 'Colonic Neoplasms'}
5            {'mesh_term': 'Constriction, Pathologic'}
6                        {'mesh_term': 'Otitis Media'}
7                                                 None
8                                                 None
9                                                 None
Name: condition_browse, dtype: object

### Less complicated cases

In [17]:
# Tally each distinct `primary_purpose` value (missing values are not counted).
df['primary_purpose'].value_counts()

primary_purpose
Treatment                          224409
Prevention                          37354
Other                               17257
Supportive Care                     17069
Basic Science                       16628
Diagnostic                          15376
Health Services Research             8414
Screening                            2988
Device Feasibility                   1075
Educational/Counseling/Training       186
Name: count, dtype: int64

In [18]:
# Count occurrences of each `nct_id`. If the number of distinct ids equals the
# number of rows, every id appears exactly once (i.e. ids are unique).
df['nct_id'].value_counts()

nct_id
NCT00647114    1
NCT04235829    1
NCT04232995    1
NCT04239365    1
NCT04237597    1
              ..
NCT02748460    1
NCT02749500    1
NCT02741791    1
NCT02741024    1
NCT03052816    1
Name: count, Length: 451538, dtype: int64

In [19]:
# Count the rows whose `url` value is truthy (e.g. not None and not an empty
# string). The column's values are iterated directly rather than looked up with
# `df['url'][i]`: that lookup is label-based, so it only works when the index
# is exactly 0..n-1, and it is slow when repeated for every row.
count_url = sum(1 for url in df['url'] if url)
print(count_url)

451538


In [20]:
# Number of keys in the `id_info` dict of the row with index label 2.
len(df['id_info'][2].keys())

2

In [21]:
# Find the `id_info` records with the most and the fewest keys.
# Every record has to be inspected: stopping the scan at the first record that
# does not improve on the running value would only report the extreme of a
# leading prefix of the column, not of the whole column.
id_info_records = df['id_info']

# max()/min() with key=len return the first record having the largest/smallest
# number of keys; `default={}` keeps the cell from raising on an empty column.
widest_record = max(id_info_records, key=len, default={})
max_keys = len(widest_record)
max_string_keys = widest_record.keys()
print(max_keys)
print(max_string_keys)

narrowest_record = min(id_info_records, key=len, default={})
min_keys = len(narrowest_record)
min_string_keys = narrowest_record.keys()
print(min_keys)
print(min_string_keys)

3
dict_keys(['org_study_id', 'secondary_id', 'nct_id'])
2
dict_keys(['org_study_id', 'nct_id'])


In [22]:
# List the keys of the `id_info` dict for the row with index label 0.
df['id_info'][0].keys()

dict_keys(['org_study_id', 'secondary_id', 'nct_id'])

In [23]:
# Count the records whose `id_info` dict holds a truthy 'nct_id' entry
# (a missing key falls back to 0 and is therefore not counted).
# The column's values are iterated directly rather than looked up with
# `df['id_info'][i]`, which is label-based and only valid for a 0..n-1 index.
count_nct_id = sum(1 for info in df['id_info'] if info.get('nct_id', 0))
print(count_nct_id)

451538


In [24]:
# Display the `brief_title` column (pandas truncates the rendered output).
df['brief_title']

0         A Study to Test V930/V932 in Patients With Can...
1         Fed Study of Benazepril HCl and Hydrochlorothi...
2         Effects of Two Different Sedation Regimes on A...
3         Safety, Tolerability and Efficacy Study of ABT...
4         Fluorouracil and Oxaliplatin With or Without P...
                                ...                        
451533    Evaluating 18F-FDG PET/CT With Liver SUVmax-ba...
451534    Imaging of in Vivo Sigma-2 Receptor Expression...
451535    Evaluation of the Efficacy Safety and Tolerabi...
451536          Resistant Starch, Gut Bacteria and Diabetes
451537    Ice T Postoperative Multimodal Pain Regimen in...
Name: brief_title, Length: 451538, dtype: object

In [25]:
# Number of rows with a missing `brief_title`.
df['brief_title'].isna().sum()

0

In [26]:
# Number of rows with a missing `brief_summary`.
df['brief_summary'].isna().sum()

841

In [27]:
# Tally the Python type of every `brief_summary` value, to confirm which kinds
# of objects the column holds besides strings.
df['brief_summary'].apply(type).value_counts()

brief_summary
<class 'str'>         450697
<class 'NoneType'>       841
Name: count, dtype: int64

In [28]:
# Ten most frequent `brief_summary` texts; a count above 1 means the same
# summary text is shared by several rows.
df['brief_summary'].value_counts()[:10]

brief_summary
To evaluate the Sun Protection Factor efficacy on human skin.                                                                                                                                                                                                                                                                        49
Investigators are building an empirical evidence base for real world data through large-scale replication of randomized controlled trials. The investigators' goal is to understand for what types of clinical questions real world data analyses can be conducted with confidence and how to implement such studies.                35
This study aims to evaluate the comparative risk of dementia/Alzheimer's disease onset between patients treated with medications that target specific metabolic pathways and patients treated with alternative medications for the same indication.                                                                                  14
Th

In [29]:
# Collect the index labels of the rows that have no brief summary.
missing_summary = df["brief_summary"].isna()
bad_indexes = df.index[missing_summary.to_numpy()]
print(bad_indexes)
print('len(bad_indexes):', len(bad_indexes))

Index([   854,    936,    977,   2962,   3354,   3359,   4953,   5235,   5307,
         5517,
       ...
       441722, 442341, 443174, 444716, 445616, 447390, 448477, 449224, 449228,
       449655],
      dtype='int64', length=841)
len(bad_indexes): 841


### Check word and token frequency

1. Word frequency in all strings

In [30]:
# Thirty most frequent whitespace-separated words in the first 10,000 summaries
# only (not the whole column). Counting is case-sensitive, so e.g. 'the' and
# 'The' are tallied separately. split(expand=True) yields one column per word
# position and stack() flattens that into one word per entry, dropping the
# empty cells.
df['brief_summary'][:10000].str.split(expand=True).stack().value_counts()[:30]

the          37181
of           35599
and          25735
to           24576
in           20003
a            13574
is           13541
with         12917
will          9869
study         9650
The           8380
for           8205
or            7030
be            7026
patients      6955
this          6022
that          4916
are           4429
This          4273
as            4032
by            3847
treatment     3830
on            3829
may           3696
have          3683
an            2986
at            2926
who           2866
from          2823
blood         2718
Name: count, dtype: int64

In [31]:
# Display the `brief_summary` column (pandas truncates the rendered output).
df['brief_summary']

0         Treatment of patients with cancer types known ...
1         The objective of this study was to investigate...
2         Sedation may be necessary in intensive care to...
3         The purpose of this study is to test if the in...
4         RATIONALE Drugs used in chemotherapy, such as ...
                                ...                        
451533    The purpose of this study is to evaluate wheth...
451534    Pilot study using [18F]ISO-1 PET/CT to image s...
451535    Double blind, randomized multi-center, evaluat...
451536    The aim of the study is to investigate, if res...
451537    The purpose of this randomized controlled tria...
Name: brief_summary, Length: 451538, dtype: object

2. Token frequency in all strings

In [None]:
# Placeholder: no tokenizer is configured yet, so the token-frequency step
# cannot run. TODO: assign the tokenizer to use here.
tokenizer = None