<a href="https://colab.research.google.com/github/MWFK/NLP-Semantic-Similarity/blob/main/ClinicalTrials/Data%20Engineering/03.%20InclusionCriteria%20N-Grams%20Frequencies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Objectives

Download Lung Cancer clinical trials, and find the most frequent phrases in the InclusionCriteria using N-Grams.

### Libs

In [1]:
import re
import pandas as pd
# pd.set_option('display.max_columns', None)  
# pd.set_option('display.max_colwidth', None)
import requests
from itertools import compress
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import wordnet

from google.colab import files

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


### Data

In [2]:
####### Study #######
# lung cancer 
# NStudiesFound: 10152

####### Study Fields #######
# NCTId, OrgFullName, OfficialTitle, OverallStatus, Phase, Keyword, DetailedDescription, 
# Condition, EligibilityCriteria, HealthyVolunteers, Gender, MinimumAge, StudyPopulation, 
# LocationFacility, LocationCity, LocationCountry

####### New Fields to add #######
# LocationStatus, Phase

####### Deleted Fields #######
# LocationState, LocationZip

####### Range Min_MAX ######
# 1 to 10152

####### Format #######
# CSV

step    = 1000
min_rnk = 1
max_rnk = step

for req in range(11): 
    
    print("Downloading Lung Cancer clinical trials with ranks from ", min_rnk, " to ", max_rnk)
    url = 'https://clinicaltrials.gov/api/query/study_fields?expr=lung+cancer&fields=NCTId%2C+OrgFullName%2C+OfficialTitle%2C+OverallStatus%2C+Phase%2C+Keyword%2C+DetailedDescription%2C+%0D%0ACondition%2C+EligibilityCriteria%2C+HealthyVolunteers%2C+Gender%2C+MinimumAge%2C+StudyPopulation%2C+%0D%0ALocationFacility%2C+LocationStatus%2C+LocationCity%2C+LocationCountry&min_rnk='+str(min_rnk)+'&max_rnk='+str(max_rnk)+'&fmt=csv'
    session = requests.Session()
    retry   = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://' , adapter)
    session.mount('https://', adapter)

    clinicaltrials = session.get(url)
    print('Download Request Status: ', clinicaltrials.status_code)
    
    csv_file = open('/content/'+str(req)+'-batch.csv', 'wb')
    csv_file.write(clinicaltrials.content)
    csv_file.close()
    
    min_rnk = max_rnk + 1
    max_rnk += step

Downloading Lung Cancer clinical trials with ranks from  1  to  1000
Download Request Status:  200
Downloading Lung Cancer clinical trials with ranks from  1001  to  2000
Download Request Status:  200
Downloading Lung Cancer clinical trials with ranks from  2001  to  3000
Download Request Status:  200
Downloading Lung Cancer clinical trials with ranks from  3001  to  4000
Download Request Status:  200
Downloading Lung Cancer clinical trials with ranks from  4001  to  5000
Download Request Status:  200
Downloading Lung Cancer clinical trials with ranks from  5001  to  6000
Download Request Status:  200
Downloading Lung Cancer clinical trials with ranks from  6001  to  7000
Download Request Status:  200
Downloading Lung Cancer clinical trials with ranks from  7001  to  8000
Download Request Status:  200
Downloading Lung Cancer clinical trials with ranks from  8001  to  9000
Download Request Status:  200
Downloading Lung Cancer clinical trials with ranks from  9001  to  10000
Download Req

In [3]:
df = pd.read_csv(r'/content/0-batch.csv', skiprows=10)
for req in range(1, 11): 
    tmp = pd.read_csv('/content/' +str(req)+ '-batch.csv', skiprows=10)
    print('Batch ', req, ': ', tmp.shape)
    df = df.append(tmp, ignore_index=True)

df.to_csv(r'/content/batchs.csv')
print('All Batchs: ',df.shape)

Batch  1 :  (1000, 18)
Batch  2 :  (1000, 18)
Batch  3 :  (1000, 18)
Batch  4 :  (1000, 18)
Batch  5 :  (1000, 18)
Batch  6 :  (1000, 18)
Batch  7 :  (1000, 18)
Batch  8 :  (1000, 18)
Batch  9 :  (1000, 18)
Batch  10 :  (277, 18)
All Batchs:  (10277, 18)


### EligibilityCriteria Processing

In [4]:
# Convert the EligibilityCriteria to string then lowercase
EligibilityCriteria = df['EligibilityCriteria'].astype(str).str.lower().tolist()
EligibilityCriteria[:10]

['inclusion criteria:||age ≥ 18 years at the time of screening.|eastern cooperative oncology group performance status of ≤ 2.|written informed consent obtained from the patient.|histologically and cytologically documented stage 3b-4 lung cancer (according to version 8 of the international association for the study of lung cancer staging system).|patients with stage 1 to 3, who undergo radical therapy with disease free survival (dfs) >12 months.|willingness and ability to comply with scheduled visits and other study procedures.||exclusion criteria:||history of another primary malignancy except for malignancy treated with curative intent with known active disease ≥ 5 years before date of the informed consent.|without signed informed consent.|unwillingness or inability to comply with scheduled visits or other study procedures.|previously diagnosed with vte before signing informed consent.',
 'inclusion criteria:||20 or more pack year smoking history|clinical need for diagnostic bronchosco

### N-Grams Split and Frequencies functions

In [5]:
def ngrams_split(text, ngram):
    n_grams = ngrams(word_tokenize(text), ngram)
    return [ ' '.join(grams) for grams in n_grams]

In [6]:
def ngrams_frequency(text, ngram):
  bigram_fd = nltk.FreqDist(ngrams_split(text, ngram))
  return bigram_fd.most_common()

### InclusionCriteria Processing

In [7]:
# Convert the EligibilityCriteria to string then lowercase
EligibilityCriteria = df['EligibilityCriteria'].astype(str).str.lower().tolist()
EligibilityCriteria[:5]

['inclusion criteria:||age ≥ 18 years at the time of screening.|eastern cooperative oncology group performance status of ≤ 2.|written informed consent obtained from the patient.|histologically and cytologically documented stage 3b-4 lung cancer (according to version 8 of the international association for the study of lung cancer staging system).|patients with stage 1 to 3, who undergo radical therapy with disease free survival (dfs) >12 months.|willingness and ability to comply with scheduled visits and other study procedures.||exclusion criteria:||history of another primary malignancy except for malignancy treated with curative intent with known active disease ≥ 5 years before date of the informed consent.|without signed informed consent.|unwillingness or inability to comply with scheduled visits or other study procedures.|previously diagnosed with vte before signing informed consent.',
 'inclusion criteria:||20 or more pack year smoking history|clinical need for diagnostic bronchosco

In [8]:
# Seperate the InclusionCriteria from the ExclusionCriteria
InclusionCriteria = [txt[21:txt.find('exclusion criteria')-2] for txt in EligibilityCriteria]
InclusionCriteria[:5]

['age ≥ 18 years at the time of screening.|eastern cooperative oncology group performance status of ≤ 2.|written informed consent obtained from the patient.|histologically and cytologically documented stage 3b-4 lung cancer (according to version 8 of the international association for the study of lung cancer staging system).|patients with stage 1 to 3, who undergo radical therapy with disease free survival (dfs) >12 months.|willingness and ability to comply with scheduled visits and other study procedures.',
 '20 or more pack year smoking history|clinical need for diagnostic bronchoscopy or consent to study driven bronchoscopy',
 'aged 40-74 years;|resident in the hexi district of tianjin city for at least 3 years;|having no self-reported history of any malignant tumor.',
 'men or women diagnosed with lung cancer all types and stages confirmed over 12 months of recruitment period by a pathologist||aged at least18 years at diagnosis|patients who provide their informed consent form',
 'i

In [9]:
# Split each Trial InclusionCriteria into list of lists of the InclusionCriteria
InclusionCriteriaList = list(map(lambda txt : txt.split("|"), InclusionCriteria))
InclusionCriteriaList = [list(filter(lambda txt: txt!= '', ltexts)) for ltexts in InclusionCriteriaList] # Delete the empty string generated by successive || (sometimes they seperate with | or ||)
print(len(InclusionCriteriaList))
InclusionCriteriaList[:2]

10277


[['age ≥ 18 years at the time of screening.',
  'eastern cooperative oncology group performance status of ≤ 2.',
  'written informed consent obtained from the patient.',
  'histologically and cytologically documented stage 3b-4 lung cancer (according to version 8 of the international association for the study of lung cancer staging system).',
  'patients with stage 1 to 3, who undergo radical therapy with disease free survival (dfs) >12 months.',
  'willingness and ability to comply with scheduled visits and other study procedures.'],
 ['20 or more pack year smoking history',
  'clinical need for diagnostic bronchoscopy or consent to study driven bronchoscopy']]

In [10]:
### Convert the InclusionCriteria from a list of lists to a list
InclusionCriteriaList_Flat  = [item for sublist in InclusionCriteriaList for item in sublist]
print(len(InclusionCriteriaList_Flat))
InclusionCriteriaList_Flat[:5]

117886


['age ≥ 18 years at the time of screening.',
 'eastern cooperative oncology group performance status of ≤ 2.',
 'written informed consent obtained from the patient.',
 'histologically and cytologically documented stage 3b-4 lung cancer (according to version 8 of the international association for the study of lung cancer staging system).',
 'patients with stage 1 to 3, who undergo radical therapy with disease free survival (dfs) >12 months.']

In [11]:
### Join all texts into one text
AllInclusionCriteria = " ".join(InclusionCriteriaList_Flat)
print(len(AllInclusionCriteria))
print(len(AllInclusionCriteria.split(' ')))

ngrams_frequency(AllInclusionCriteria, 1)[:5]
# => we need to remove the punctuation, stopwords, tag the words then lemmatize them.

12447275
1857722


[(',', 59653), ('of', 59227), ('or', 56111), (')', 49893), ('(', 48976)]

### InclusionCriteria Cleansing

In [12]:
def remove_punctuation(text):
    return "".join([i for i in text if i not in string.punctuation])

In [13]:
def remove_stopwords(text):
    return ' '.join([word for word in word_tokenize(text) if not word in stopwords.words('english')])

'''
Part-of-speech tagging, words, or tokens are assigned part of speech tags, which are typically morphosyntactic subtypes of fundamental syntactic 
categories in the language such as a noun, or verb. By lemmatizing lexemes, inflected forms of a word are grouped together under a common root. 

The tagging and lemmatization of parts of speech are essential to linguistic pre-processing. This website uses morphosyntactic descriptors and 
part-of-speech tagging as acronyms. In the context of the NLTK Lemmatization, the part of speech tags are pre-defined with shortcuts for the 
NLTK WordNetLemmatizer as below.

https://www.holisticseo.digital/python-seo/nltk/lemmatize
'''

In [14]:
# Custom tagger, so the tags can be interpreted by the WordNetLemmatizer()
# Because NLTK nltk.pos_tag() and WordNetLemmatizer() do not use the same naming covention
def nltk_pos_tagger(nltk_tag):
  
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

In [15]:
def tag_lemmatize_sentence(sentence):

    # Tag the Tokenized text
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  

    # Aply the custom tagger, so the tags can be interpreted by the WordNetLemmatizer()
    wordnet_tagged = map(lambda x: (x[0], nltk_pos_tagger(x[1])), nltk_tagged)
    
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatized_sentence.append(word)
        else:        
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

# sentence = "I am voting for better that politician. in this NLTK, Lemmatization example sentences"
# print(sentence) 
# print(tag_lemmatize_sentence(sentence))

In [16]:
# print(sentence)
# print(remove_punctuation(sentence))
# print(remove_stopwords(remove_punctuation(sentence)))
# print(lemmatizer.lemmatize(remove_stopwords(remove_punctuation(sentence))))
# print(tag_lemmatize_sentence(remove_stopwords(remove_punctuation(sentence))))

In [17]:
%%time
# Clean the inclusion Criteria
ProcessedAllInclusionCriteria = tag_lemmatize_sentence(remove_stopwords(remove_punctuation(AllInclusionCriteria)))
print(len(ProcessedAllInclusionCriteria))
print(len(ProcessedAllInclusionCriteria.split(' ')))

9716457
1293791
CPU times: user 4min 51s, sys: 20.1 s, total: 5min 11s
Wall time: 5min 11s


### Execution

In [18]:
'''
from pandas import ExcelWriter
# from pandas.io.parsers import ExcelWriter

Then the save_xls function works as expected:

def save_xls(list_dfs, xls_path):
    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            df.to_excel(writer,'sheet%s' % n)
        writer.save()
'''

%%time
for ngram in range(1,85):
  #data = ngrams_frequency(ProcessedAllInclusionCriteria, ngram)
  #data_df = pd.DataFrame(data, columns=['Phrase', 'Frequency']).head(100)
  #print('For N-Grams=', ngram, 'The most frequent phrases are: \n', data[:10],'\n\n')
  pd.DataFrame(ngrams_frequency(ProcessedAllInclusionCriteria, ngram), columns=['Phrase', 'Frequency']).head(100).to_excel(str(str(ngram) + ' Words per sentence.xlsx'), index=False)
  files.download(str(str(ngram) + ' Words per sentence.xlsx'))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

CPU times: user 14min 43s, sys: 19.3 s, total: 15min 2s
Wall time: 15min 1s
