<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Dataset processing
  </div> 
  
<div style="
      font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Clinical trials ICTRP
  </div>


  <div style=" float:left; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  Jean-baptiste AUJOGUE
  </div> 
  
  <div style=" float:right; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  December 2022
  </div>

<a id="TOC"></a>

#### Table of Contents

1. [Clinical trials](#texts) <br>


#### Useful links

- [Clinical Trials ICTRP dataset download](https://www.who.int/clinical-trials-registry-platform)

In [1]:
# Reload imported modules before each cell execution, so edits to local
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import copy
import json
import zipfile
import unicodedata

# data
import numpy as np
import pandas as pd

# viz
# from tqdm import tnrange

#### Custom variables

In [3]:
# Data directory: the `ictrp` folder under the current working directory.
# The raw archive is read from it and the exported files are written to it.
path_to_data = os.path.join(os.getcwd(), 'ictrp')
path_to_data

'C:\\Users\\jb\\Desktop\\NLP\\perso - Transformers for NLP\\datasets\\ictrp'

In [15]:
# Names of the files read from / written to `path_to_data`.
# base_dataset_name  = 'ICTRPFullExport-774632-24-03-2022.zip'
# Raw ICTRP export archive (input).
base_dataset_name  = 'ICTRPFullExport-824884-18-11-2022.zip'
# Stem of the cleaned table, exported as `<name>.tsv`.
final_dataset_name = 'ictrp-dataset'
# Stem of the plain-text corpus, exported as `<name>.txt`.
text_dataset_name  = 'ictrp-corpus'

<a id="texts"></a>

# 1. Clinical trials

[Table of contents](#TOC)

## 1.1 Load corpus

[Table of contents](#TOC)

In [16]:
def strip_accents(s):
    """Return `s` without combining diacritical marks.

    The string is first decomposed with Unicode NFKD normalization (which
    also expands compatibility characters such as ligatures); every code
    point whose Unicode category is 'Mn' (non-spacing mark) is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def clean_text(s):
    """Normalize one raw text field into a single-line string.

    Processing steps, in order:
      1. falsy input (None, '') becomes the empty string;
      2. one pair of enclosing double quotes is removed (values in the
         export appear wrapped in literal `"` characters);
      3. accents are stripped with `strip_accents`;
      4. `<br ...>` tags become line breaks, and each line break followed by
         a bullet-like run of non-word characters becomes ', ';
      5. a leading bullet-like prefix is dropped and all whitespace runs are
         collapsed to a single space.

    Parameters
    ----------
    s : str or None
        Raw field content.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    s = (s if s else '')
    s = s.strip()
    # Unwrap only when the value really is quoted. The previous test (more
    # than one space-separated token) left single-word values quoted and
    # chopped real characters off unquoted multi-word values.
    if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
        s = s[1: -1]
    s = strip_accents(s)
    # Raw strings keep `\s` / `\W` as regex escapes instead of invalid
    # Python string escapes.
    s = re.sub(r'<br[^>]*>', '\n', s).strip()
    # line break + bullet/numbering punctuation -> item separator
    s = re.sub(r'\n(\s)*[\W]+(\s)+', ', ', s)
    # bullet-like prefix at the very start of the text
    s = re.sub(r'(?:^)(\s)*[\W]+(\s)+', ' ', s)
    s = re.sub(r'(\s)+', ' ', s)
    s = s.strip()
    return s



def process_zipfile(zip_file, header = None, names = None):
    """Read the first member of a zip archive as a CSV table.

    Parameters
    ----------
    zip_file : str, path-like or binary file-like object
        Anything accepted by `zipfile.ZipFile`.
    header : int, list of int or None
        Forwarded unchanged to `pandas.read_csv`.
    names : list of str or None
        Forwarded unchanged to `pandas.read_csv`.

    Returns
    -------
    pandas.DataFrame
        The parsed content of the first entry listed in the archive.

    Raises
    ------
    IndexError
        If the archive contains no member.
    """
    # Both context managers guarantee the archive and the member stream are
    # closed, including when parsing fails.
    with zipfile.ZipFile(zip_file, 'r') as archive:
        first_member = archive.namelist()[0]
        with archive.open(first_member, 'r') as f:
            return pd.read_csv(f, header = header, names = names)
    

    
def clean_dataframe(df):
    """Keep the free-text columns of the trials table and clean every cell.

    Missing values are replaced by '' and each cell goes through
    `clean_text`. Headings such as "Inclusion criteria:" or
    "Exclusion Criterion:" are then removed from the two criteria columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the seven columns listed below.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to those columns; the input is not modified.
    """
    columns = [
        'public_title', 'Scientific_title',
        'Inclusion_Criteria', 'Exclusion_Criteria',
        'Primary_Outcome', 'Secondary_Outcomes',
        'results_summary',
    ]
    # Column-wise Series.map is the element-wise equivalent of
    # DataFrame.applymap, which is deprecated in pandas >= 2.1.
    df = df[columns].fillna('').apply(lambda col: col.map(clean_text))

    # `(?:a|on)` matches "criteria" and "criterion". The former `[a|on]` was
    # a single-character class, so "criterion:" was never removed.
    heading = re.compile(r'([Ii]n|[eE]x)clusion [Cc]riteri(?:a|on)(\s)*:')
    for col in ('Inclusion_Criteria', 'Exclusion_Criteria'):
        df[col] = df[col].map(lambda s: heading.sub('', s).strip())
    return df



def remove_weird(df, weird = ('a¢a', '¬a¢', '\x8d', 'a?¥', 'a€¢', 'AA¢AA¥', 'AA¢AA¤', 'a?¢', 'a€tm', '°a¥', '\\', 'a??s')):
    """Drop rows whose inclusion or exclusion criteria contain a "weird" fragment.

    Matching is a case-insensitive substring test against the string form of
    each cell (so missing values are compared as their `str()` form).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have `Inclusion_Criteria` and `Exclusion_Criteria` columns.
    weird : iterable of str
        Fragments marking a text as unusable. The defaults are fragments of
        badly encoded characters, plus the backslash.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` in which neither criteria column contains any fragment.
    """
    # The text is lower-cased before matching, so the patterns must be too;
    # otherwise a pattern containing upper-case letters could never match.
    patterns = [w.lower() for w in weird]

    def is_clean(cell):
        text = str(cell).lower()
        return not any(w in text for w in patterns)

    keep = (
        df.Inclusion_Criteria.apply(is_clean).astype(bool) &
        df.Exclusion_Criteria.apply(is_clean).astype(bool)
    )
    return df[keep]



def get_character_count(texts):
    """Count how often each character occurs across `texts`.

    The texts are joined with single spaces before counting, so those joining
    spaces are included in the count of ' '.

    Returns
    -------
    tuple of numpy.ndarray
        `(characters, counts)`: the sorted distinct characters and, aligned
        with them, their number of occurrences.
    """
    joined = ' '.join(texts)
    characters = list(joined)
    return np.unique(characters, return_counts = True)

In [17]:
# Explicit names for the 63 columns of the ICTRP export. They are passed as
# `names=` when the archive is loaded, since the CSV is then read with
# `header = None`. The commented lines below show how the list was obtained
# from an earlier export.
# df_trials = process_zipfile(os.path.join(path_to_data, 'ICTRPFullExport-774632-24-03-2022.zip'), header = 0)

# df_trials.shape # (774632, 63)

# columns = df_trials.columns.tolist()
columns = [
    'TrialID',
    '(No column name)',
    'SecondaryIDs',
    'public_title',
    'Scientific_title',
    'url',
    'Public_Contact_Firstname',
    'Public_Contact_Lastname',
    'Public_Contact_Address',
    'Public_Contact_Email',
    'Public_Contact_Tel',
    'Public_Contact_Affiliation',
    'Scientific_Contact_Firstname',
    'Scientific_Contact_Lastname',
    'Scientific_Contact_Address',
    'Scientific_Contact_Email',
    'Scientific_Contact_Tel',
    'Scientific_Contact_Affiliation',
    'study_type',
    'study_design',
    'phase',
    'Date_registration',
    'Date_enrollement',
    'Target_size',
    'Recruitment_status',
    'Primary_sponsor',
    'Secondary_sponsors',
    'Source_Support',
    'Countries',
    'Conditions',
    'Interventions',
    'Age_min',
    'Age_max',
    'Gender',
    'Inclusion_Criteria',
    'Exclusion_Criteria',
    'Primary_Outcome',
    'Secondary_Outcomes',
    'Bridging_flag',
    'Bridged_type',
    'Childs',
    'type_enrolment',
    'Retrospective_flag',
    'results_actual_enrollment',
    'results_url_link',
    'results_summary',
    'results_date_posted',
    'results_date_first_publication',
    'results_baseline_char',
    'results_participant_flow',
    'results_adverse_events',
    'results_outcome_measures',
    'results_url_protocol',
    'results_IPD_plan',
    'results_IPD_description',
    'results_date_completed',
    'results_yes_no',
    'Ethics_Status',
    'Ethics_Approval_Date',
    'Ethics_Contact_Name',
    'Ethics_Contact_Address',
    'Ethics_Contact_Phone',
    'Ethics_Contact_Email',
]

In [18]:
# Load the raw export; the CSV is read without a header row, so the column
# names come from `columns`. Then drop the rows whose inclusion/exclusion
# criteria contain one of the default "weird" fragments.
df_trials = process_zipfile(os.path.join(path_to_data, base_dataset_name), names = columns)
df_trials = remove_weird(df_trials)

  df = pd.read_csv(f, header = header, names = names)


In [19]:
# Sanity check: (rows, columns) of the loaded and pre-filtered table.
df_trials.shape

(824502, 63)

In [20]:
# Preview the first rows of the raw table.
df_trials.head()

Unnamed: 0,TrialID,(No column name),SecondaryIDs,public_title,Scientific_title,url,Public_Contact_Firstname,Public_Contact_Lastname,Public_Contact_Address,Public_Contact_Email,...,results_IPD_plan,results_IPD_description,results_date_completed,results_yes_no,Ethics_Status,Ethics_Approval_Date,Ethics_Contact_Name,Ethics_Contact_Address,Ethics_Contact_Phone,Ethics_Contact_Email
0,NCT05537740,"""24 October 2022""","""21820""","""A First-in-human Study to Learn How Safe the ...","""First-in-human Dose-escalation and Expansion ...","""https://clinicaltrials.gov/show/NCT05537740""",""" ""","""Bayer Clinical Trials Contact""",,"""clinical-trials-contact@bayer.com""",...,,,,,,,,,,
1,NCT05544864,"""24 October 2022""","""GE IDE no. BA00219""","""Intracoronary Stenting and Restenosis - Rando...","""Intracoronary Stenting and Restenosis - Rando...","""https://clinicaltrials.gov/show/NCT05544864""",""" ; ""","""Jens Wiebe, MD;Jens Wiebe, MD""",,"""wiebe@dhm.mhn.de;wiebe@dhm.mhn.de""",...,,,,,,,,,,
2,NCT05573529,"""24 October 2022""","""1163/2020""","""EDOF and Multifocal IOL Study""","""Comparison of Visual Outcomes and Patient Sat...","""https://clinicaltrials.gov/show/NCT05573529""",""" ; ""","""Christina Leydolt, MD;Christina Leydolt, MD""",,"""christina.leydolt@meduniwien.ac.at;christina....",...,,,,,,,,,,
3,NCT05536791,"""31 October 2022""","""1160-0307""","""A Study in Europe Based on Medical Records Th...","""Safety of Dabigatran Etexilate (DE) for Treat...","""https://clinicaltrials.gov/show/NCT05536791""",""" ""","""Boehringer Ingelheim""",,"""clintriage.rdg@boehringer-ingelheim.com""",...,,,,,,,,,,
4,NCT05536804,"""31 October 2022""","""I8F-MC-GPIG;2021-005273-47;17217""","""A Study of Tirzepatide (LY3298176) in Partici...","""Tirzepatide Study of Renal Function in People...","""https://clinicaltrials.gov/show/NCT05536804""",""" ; ""","""Call 1-877-CTLILLY (1-877-285-4559) or 1-317-...",,""";ClinicalTrials.gov@lilly.com""",...,,,,,,,,,,


## 1.2 Clean corpus

[Table of contents](#TOC)

In [21]:
# Keep only the free-text columns and normalize every cell, then filter again
# on the cleaned text.
# NOTE(review): this cell overwrites `df_trials`, so re-running it on its own
# applies the cleaning to already-cleaned text; re-run the loading cell first.
df_trials = clean_dataframe(df_trials)
df_trials = remove_weird(df_trials) # a lot of texts are removed because they contain badly encoded latin-1 characters

In [22]:
# Sanity check: (rows, columns) remaining after cleaning and filtering.
df_trials.shape

(814391, 7)

In [23]:
# Preview the first rows of the cleaned table.
df_trials.head()

Unnamed: 0,public_title,Scientific_title,Inclusion_Criteria,Exclusion_Criteria,Primary_Outcome,Secondary_Outcomes,results_summary
0,A First-in-human Study to Learn How Safe the S...,First-in-human Dose-escalation and Expansion S...,", Capable of giving signed informed consent., ...",,Number of participants with treatment-emergent...,Objective response rate (ORR);Fold change in s...,
1,Intracoronary Stenting and Restenosis - Random...,Intracoronary Stenting and Restenosis - Random...,", 1. Patients with ischemic symptoms and/or ev...",,Composite endpoint of major adverse cardiac ev...,Target lesion failure (TLF): a composite of ca...,
2,EDOF and Multifocal IOL Study,Comparison of Visual Outcomes and Patient Sati...,", Bilateral age-related cataract for which pha...",,distant corrected near visual acuity,,
3,A Study in Europe Based on Medical Records Tha...,Safety of Dabigatran Etexilate (DE) for Treatm...,", Written informed consent from parents/care g...",,Incidence of any bleeding events defined as Ma...,Incidence of Adverse Events (AEs);Incidence of...,
4,A Study of Tirzepatide (LY3298176) in Particip...,Tirzepatide Study of Renal Function in People ...,", All participants with or without diabetes:, ...",,Change from Baseline in Kidney Oxygenation in ...,Change from Baseline in Kidney Oxygenation in ...,


In [24]:
# # weird_idx = [
#  86252,
#  86318,
#  86320,
#  86322,
#  86323,
#  86324,
#  86333,
#  86335,
#  86407,
#  86408,
#  86409,
#  86412,
#  86417,
# ...

In [25]:
# Preview the first 100 exclusion criteria that have more than 5 space-separated tokens.
df_trials.Exclusion_Criteria[df_trials.Exclusion_Criteria.apply(lambda s: len(s.split(' ')))>5].tolist()[:100]

['Septicemia 2. Active malignancy or patient on immunosuppressive therapy within last 3 months 2.Uncontrolled hyperglycaemia 3.Clinically important gastrointestinal bleed 4.Pregnancy 5.History of hypersensitivity to steroid preparations',
 'Ophthalmological criteria: 1) Corrected visual acuity &lt; 0.5 2) Refractive Error with a spherical equivalent &gt; +6 or smaller - 6 D 3) Elevated intraocular pressure (higher than 22 mmHg) 4) Relevant anisocoria or pupil deformation 5) History of eye surgery apart from laser trabeculoplasty less than three months previously or extraocular surgery such as strabismus surgery 6) Topical ocular medication influencing IOP (intraocular pressure) or pupil size within 3 months prior to study-start 7) Combined surgery Non-surgical conditions in the eye to be operated 8) Iatrogenic or traumatic or congenital cataract 9) Pupillary abnormalities (irregular) 10) Very dark iris - Iris synechiae 11) Eye movement disorder (Nystagmus) 12) Dacryocystitis and all ot

#### Export to tsv

In [26]:
# Write the cleaned table as a tab-separated file, without the index column.
df_trials.to_csv(os.path.join(path_to_data, '{}.tsv'.format(final_dataset_name)), sep = "\t", index = False)

#### Export to txt

In [27]:
def clean_txt(t):
    """Flatten one text into a single line with tidy punctuation and spacing.

    Parameters
    ----------
    t : str
        Text to clean.

    Returns
    -------
    str
        The text with line breaks turned into ', ', any leading run of
        non-word characters removed, runs of punctuation reduced to their
        first sign, and whitespace collapsed to single spaces.
    """
    # Raw strings are used for all patterns so that `\s`, `\W`, `\.` and
    # `\g<name>` reach the regex engine as written instead of being invalid
    # Python string escapes.
    t = t.strip()

    # replace line breaks with commas
    t = re.sub(r'(\n)+', ', ', t)

    # remove non-alphanumeric prefix
    t = re.sub(r'(?:^)(\s)*[\W]+', ' ', t)

    # shrink consecutive punctuation, keeping only the first sign
    t = re.sub(r'(?P<name>[,;:\.!?])(\s)*[,;:\.!?]+', r'\g<name>', t)

    # shrink whitespace
    t = re.sub(r'(\s)+', ' ', t).strip()
    return t

In [28]:
# Reload the exported table and keep only the two criteria columns.
# `dtype = str` keeps numeric-looking cells as text, so the later `t.split()`
# cannot hit a number. `keep_default_na = False` stops pandas from turning
# literal values such as "None", "N/A" or "NA" into missing values; empty
# cells are then read as ''.
df_trials = pd.read_csv(
    os.path.join(path_to_data, '{}.tsv'.format(final_dataset_name)),
    sep = "\t",
    dtype = str,
    keep_default_na = False,
)
df_trials = df_trials[[
    # 'Scientific_title',
    'Inclusion_Criteria',
    'Exclusion_Criteria',
    # 'Primary_Outcome',
    # 'Secondary_Outcomes',
    # 'results_summary',
]].fillna('')

In [29]:
# For each trial, keep the criteria fields that have more than 5
# whitespace-separated tokens, then flatten the per-row lists into a single
# list of texts cleaned with `clean_txt`.
texts = df_trials.apply(func = lambda row: [t for t in row if len(t.split()) > 5], axis = 1)
texts = [clean_txt(t) for ts in texts for t in ts]

In [30]:
# Number of texts kept for the corpus.
len(texts)

1081670

In [31]:
# Character frequencies are computed on the first 100,000 texts only,
# not on the full corpus.
chars, counts = get_character_count(texts[:100000])

In [32]:
# Characters seen fewer than 40 times in that sample are treated as rare.
chars_to_hide = [char for char, count in zip(chars, counts) if count < 40]
len(chars_to_hide)

25

In [33]:
# Display the rare characters selected above.
chars_to_hide

['!',
 '$',
 '|',
 '\x81',
 '\x9d',
 '¢',
 '¤',
 '¥',
 '¦',
 '©',
 '«',
 '\xad',
 '¶',
 '»',
 'Æ',
 'æ',
 '÷',
 'œ',
 'ƒ',
 'ˆ',
 '‚',
 '„',
 '‡',
 '‰',
 '€']

In [34]:
# We choose to hide (replace with a space) all characters appearing fewer
# than 40 times in the sampled texts.
# `re.escape` keeps characters that are special inside a character class
# (`]`, `\`, `^`, `-`) literal. An empty list is skipped, because `[]` is
# not a valid pattern.
if chars_to_hide:
    rare_chars_pattern = re.compile('[{}]'.format(re.escape(''.join(chars_to_hide))))
    texts = [re.sub('( )+', ' ', rare_chars_pattern.sub(' ', t)) for t in texts]
texts = [clean_txt(t) for t in texts]

In [35]:
# Masking rare characters must not change the number of texts.
len(texts)

1081670

In [36]:
# Spot-check a single cleaned text by position.
texts[257835]

'Patient aged 18 and over, Patient with homonymous lateral hemianopia with brain injury acquired for at least 3, months and at most 5 years after ischemic stroke, cerebral hematoma, Patient able to understand French both orally and in writing, Patient giving free, informed and written consent, Absence of a neurological disease interfering with the passing of tests, Non-, Patients subject to a legal protection measure (safeguarding of justice, guardianship, and trusteeship, protected adults), Pre-existing severe ophthalmological disorders (ophthalmic consultation before, inclusion) pre-existing visual field defects, monocular visual acuity less than 5/10, severe oculomotor disorders, Contraindication of tACS and / or magnetic resonance imaging (pacemaker or implantable, defibrillator, intracranial electrodes or other intracranial implant, cranial vault, anomalies facing stimulation electrodes (plates, prostheses, uncovered skull), Pregnant women and nursing mothers, Recruitment in other

In [37]:
# Write the corpus as a UTF-8 text file, with the texts joined by line breaks.
with open(os.path.join(path_to_data, '{}.txt'.format(text_dataset_name)), 'w', encoding = 'utf-8') as f:
    f.write('\n'.join(texts))

In [38]:
# with open(os.path.join(path_to_data, '{}.txt'.format(text_dataset_name)), 'r', encoding = 'utf-8') as f:
#     texts = f.readlines()

In [39]:
# w = '¥'

# #for w in ['a¢a', '¬a¢', '\x8d', 'a?¥', 'a€¢', 'AA¢AA¥', 'AA¢AA¤', 'a?¢', 'a€tm', '°a¥', '\\', 'a??s']:
# [t.replace(w, 'XXXXX'+ w + 'XXXXX') for t in texts if w in t]

[Table of contents](#TOC)