<a href="https://colab.research.google.com/github/MWFK/NLP-Semantic-Similarity/blob/main/ClinicalTrials/Data%20Engineering/05.%20Stack_Processed_IEC_N_GRAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Objectives

##### Steps
1. Get 1000 Clinical Trials from the Lung Cancer Study
2. Normalize the Eligibility Criteria
3. Clean & process the Eligibility Criteria
4. Separate the Eligibility Criteria into Inclusion and Exclusion, then convert them into a list of lists.
5. Convert the List of Lists into Stacked DataFrame for both Inclusion and Exclusion Criteria.
6. Apply the NGRAM(2-6) on Inclusion and Exclusion Criteria.
7. Export the IC/EC phrases that occur at least 20 times.

P.S.
Lung Cancer Clinical Trial's Data

### Libs

In [100]:
import re
import pandas as pd
from pandas import ExcelWriter
# pd.set_option('display.max_columns', None)  
# pd.set_option('display.max_colwidth', None)
import requests
from itertools import compress
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from string import punctuation

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from google.colab import files

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Data

In [51]:
####### Search Expression #######
# None

####### Study Fields #######
# NCTId,BriefTitle,Condition,Keyword,EligibilityCriteria 	

####### New Fields to add #######
# LocationStatus, Phase

####### Range Min_MAX ######
# 1 to 1000

####### Format #######
# CSV

# Query the ClinicalTrials.gov study_fields endpoint for ranks 1-1000 of "lung cancer" as CSV.
# NOTE(review): confirm this endpoint is still served before relying on a fresh run.
url = 'https://clinicaltrials.gov/api/query/study_fields?expr=lung+cancer&fields=NCTId%2CBriefTitle%2CCondition%2CKeyword%2CEligibilityCriteria+%09&min_rnk=1&max_rnk=1000&fmt=csv'

# Session that retries failed connections (3 attempts, exponential backoff) for both schemes
session = requests.Session()
retry   = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://' , adapter)
session.mount('https://', adapter)

# A timeout keeps the cell from hanging forever on an unresponsive server
clinicaltrials = session.get(url, timeout=60)
print('Download Request Status: ', clinicaltrials.status_code)
# Stop here on an HTTP error instead of saving an error page and failing later in read_csv
clinicaltrials.raise_for_status()

# The context manager closes the file even if the write fails
with open('/content/'+str(1)+'-batch.csv', 'wb') as csv_file:
    csv_file.write(clinicaltrials.content)

# The first 10 lines of the downloaded file are skipped; presumably an API preamble before the CSV header -- TODO confirm
df = pd.read_csv(r'/content/1-batch.csv', skiprows=10)
print(df.shape)
df.head()

Download Request Status:  200
(1000, 6)


Unnamed: 0,Rank,NCTId,BriefTitle,Condition,Keyword,EligibilityCriteria
0,1,NCT03581708,Venous Thromboembolism in Advanced Lung Cancer,Lung Neoplasms|Venous Thromboembolism,lung cancer|Venous Thromboembolism,Inclusion Criteria:||Age ≥ 18 years at the tim...
1,2,NCT01130285,Validation of a Multi-gene Test for Lung Cance...,Lung Cancer,Lung Cancer,Inclusion Criteria:||20 or more pack year smok...
2,3,NCT03992833,Methods of Computed Tomography Screening and M...,Lung Neoplasms|Computed Tomography|Mass Screen...,,Inclusion Criteria:||Aged 40-74 years;|Residen...
3,4,NCT02725892,National Lung Cancer Registry in Men and Women...,Oncology & Epidemiology & Lung Cancer,lung cancer epidemiology algeria registry,Inclusion Criteria:||Men or women diagnosed wi...
4,5,NCT00897650,Protein and RNA Expression Patterns in Predict...,Lung Cancer,lung cancer,Inclusion criteria||Diagnosis of suspected lun...


### Normalizing Eligibility Criteria

In [52]:
# The dictionary does not cover every possible synonym variation

# Sample sentences used to try out the normalization functions below.
# They are lowercased in the test cell before the synonym patterns are applied.
texts     = [### Stage
             'Has Limited Stage SCLC (Stage I-III, by AJCC 8th Edition Cancer Staging).',
             'Anti-cancer Agents in Patients With Advanced Solid Malignancies (PETRA)',
             ### Previous Treatment
             'Olaparib in Newly Diagnosed Treatment naïve Limited-Stage',
             'RR SCLC who progressed or recurred following platinum-based regimen',
             ### Performance Status
             'Has Eastern Cooperative Oncology Group (ECOG) Performance score 0 or 1',
             'ECOG PS - 0/1-2',
              ### Diagnosis
             'Has pathologically (histologically or cytologically) confirmed Small Cell Lung Cancer (SCLC).',
             # The backslash continues the string literal, so the next line's leading spaces are part of the text
             'Subject with histological or cytological confirmation of extensive stage Small Cell Lung Cancer \
             (ES SCLC) or Stage IIIb or IV squamous Non-Small Cell Lung Cancer (sqNSCLC)',
             ]

# Canonical terms written into the text in place of their synonyms.
# Each entry is paired by position with the list at the same index in `synonyms`
# (normalization() zips the two lists), so both must keep the same length and order.
displayed = [### Stage
             'Localised', 
             'Locally Advanced', 
             'Metastatic',
             ### Previous Treatment
             'Newly Diagnosed',
             'Prior Therapy',
             ### Performance Status
             'ECOG 0 or KPS 100',
             'ECOG 1 or KPS 80-90',
             'ECOG 2 or KPS 60-70',
             'ECOG 3 or KPS 40-50',
             'ECOG 4 or KPS 10-30',
             ### Diagnosis
             'Histologically',
             ]

# Regex patterns for each canonical term; the list at index i belongs to displayed[i].
# normalization() applies the lists top to bottom and, within a list, the patterns left to
# right with re.sub (case-sensitive, no flags), each pass rewriting the text for the next.
# NOTE(review): because of that ordering, several patterns cannot match in practice:
#   - r'stage\si' also matches the start of 'stage ii', 'stage iii' and 'stage iv', so the later
#     r'stage\sii', r'stage\siii' and r'stage\siv' never see their text (the test output shows
#     'Localisediib' for 'stage iiib').
#   - patterns repeated across the ECOG rows (e.g. r'0\sor\s1', r'0-2') are consumed by the first row.
#   - r'histologic' matches inside 'histologically', producing 'Histologicallyally' in the outputs.
#   - r'Not\sreceived\sprior\streatment' starts with a capital N, but the inputs are lowercased first.
#   - r'previoustly\streated' is presumably a typo for 'previously' -- confirm.
#   - r'no\sevidence\sof\smetastatic\sdisease' maps to 'Metastatic' -- confirm this is intended.
synonyms  = [### Stage
             [r'limited\sstage', r'ls\s', r'stage\si-iii', r'no\smetastatic\sdisease', r'stage\si', r'stage\sii', r'stage\siii'], 
             [r'stage\sii'],
             [r'advanced', r'secondary', r'stage\siv', r'extensive-stage', r'extensive\sstage', 
              r'\ses\s', r'no\sevidence\sof\smetastatic\sdisease', r'extensive\sdisease'],
             ### Previous Treatment
             [r'de\snovo', r'treatment\snaïve', r'no\sprior\streatment', r'not\spreviously\streated', 
              r'previously\suntreated', r'Not\sreceived\sprior\streatment', 	r'first-line',	r'first\sline'],
             [r'relapse',	r'recurrent',	r'progressed',	r'recurred', r'rr\s', r'refractory',	r'previoustly\streated',	r'after\sprior',	r'treated\swith\sprior',	
              r'prior\stherapy', r'prior\ssystemic\stherapy', r'received\stherapy',	r'progressive',	r'progression',	r'previously\sreceived',	r'treated\swith', 	
              r'received\streatment',	r'prior\streatment', 	r'prior\sline',	r'have\sreceived',	r'progression',	r'has\sreceived'],
             ### Performance Status
             [r'eastern\scooperative\soncology\sgroup', r'ecog\s0', r'kps\s100',   r'0\sor\s1', r'0\sto\s1', r'0-2', r'0/1-2', r'≤\s1'],
             [r'eastern\scooperative\soncology\sgroup', r'ecog\s1', r'kps\s80-90', r'0\sor\s1', r'0\sto\s1', r'0-2', r'0/1-2', r'≤\s1'],
             [r'eastern\scooperative\soncology\sgroup', r'ecog\s2', r'kps\s60-70',                           r'0-2', r'0/1-2'],
             [r'eastern\scooperative\soncology\sgroup', r'ecog\s3', r'kps\s40-50'],
             [r'eastern\scooperative\soncology\sgroup', r'ecog\s4', r'kps\s10-30'],
             ### Diagnosis
             [r'pathologically',	r'cytologically', 	r'histologic',	r'histological',	r'cytological'],
             ]

In [53]:
# Conditioned terms have to be normalized before the general normalization runs
def normalizer_exceptions(texts, displayed, synonym):
  """Replace a literal synonym with its displayed term, except in texts about brain metastases.

  Args:
    texts: iterable of strings to normalize.
    displayed: replacement string.
    synonym: literal substring (not a regex) to replace.

  Returns:
    A new list with one string per input text; texts containing
    'brain metastases' are returned unchanged.
  """
  return [
      text if 'brain metastases' in text else text.replace(synonym, displayed)
      for text in texts
  ]

In [54]:
# Normalize a list of texts for a single displayed term that has several synonyms
def normalizer(texts, displayed, synonyms):
  """Substitute every synonym pattern in every text with the displayed term.

  Args:
    texts: iterable of strings to normalize.
    displayed: replacement string handed to re.sub.
    synonyms: iterable of regex patterns, applied in order; each substitution
      works on the result of the previous one.

  Returns:
    A new list with one normalized string per input text.
  """
  def _substitute_all(text):
    for pattern in synonyms:
      text = re.sub(pattern, displayed, text)
    return text

  return [_substitute_all(text) for text in texts]

In [55]:
# Normalize a list of texts for several displayed terms, each with its own synonyms
def normalization(texts, displayed, synonyms):
  """Run normalizer() once per (displayed term, synonym list) pair, in order.

  Args:
    texts: list of strings to normalize.
    displayed: sequence of replacement terms.
    synonyms: sequence of regex-pattern lists, paired with `displayed` by position.

  Returns:
    The texts after every pair has been applied; each pass works on the
    output of the previous one.
  """
  current_texts = texts
  for term, patterns in zip(displayed, synonyms):
    current_texts = normalizer(current_texts, term, patterns)
  return current_texts

In [56]:
# Try the normalization on the sample sentences (lowercased first, since the patterns are lowercase)
texts = list(map(str.lower, texts))
normalized_texts = normalization(texts, displayed, synonyms)
normalized_texts

['has Localised sclc (Localised, by ajcc 8th edition cancer staging).',
 'anti-cancer agents in patients with Metastatic solid malignancies (petra)',
 'olaparib in newly diagnosed Newly Diagnosed limited-stage',
 'Prior Therapysclc who Prior Therapy or Prior Therapy following platinum-based regimen',
 'has ECOG 0 or KPS 100 (ecog) performance score ECOG 0 or KPS 100',
 'ecog ps - ECOG 0 or KPS 100',
 'has Histologically (Histologicallyally or Histologically) confirmed small cell lung cancer (sclc).',
 'subject with Histologicallyal or Histologically confirmation of Metastatic small cell lung cancer              (es sclc) or Localisediib or iv squamous non-small cell lung cancer (sqnsclc)']

### Normalize the Eligibility Criteria

In [57]:
### Apply Normalization of Eligibility Criteria

# Cast the EligibilityCriteria column to str and lowercase it, then collect it into a plain list
EligibilityCriteria = (
    df['EligibilityCriteria']
    .astype(str)
    .str.lower()
    .tolist()
)
NormalizeEligibilityCriteria = normalization(EligibilityCriteria, displayed, synonyms)
NormalizeEligibilityCriteria[:5]

['inclusion criteria:||age ≥ 18 years at the time of screening.|ECOG 0 or KPS 100 performance status of ≤ 2.|written informed consent obtained from the patient.|Histologicallyally and Histologically documented stage 3b-4 lung cancer (according to version 8 of the international association for the study of lung cancer staging system).|patients with stage 1 to 3, who undergo radical therapy with disease free survival (dfs) >12 months.|willingness and ability to comply with scheduled visits and other study procedures.||exclusion criteria:||history of another primary malignancy except for malignancy Prior Therapy curative intent with known active disease ≥ 5 years before date of the informed consent.|without signed informed consent.|unwillingness or inability to comply with scheduled visits or other study procedures.|previously diagnosed with vte before signing informed consent.',
 'inclusion criteria:||20 or more pack year smoking history|clinical need for diagnostic bronchoscopy or conse

### Clean & Process the Eligibility Criteria

In [58]:
# Punctuation to strip: everything in string.punctuation except '|' and '-'
my_punctuation = punctuation.translate(str.maketrans("", "", "|-"))

# Initialise the lemmatizer once, outside the function, so tag_lemmatize_sentence() reuses it on every call
lemmatizer = WordNetLemmatizer()

def remove_punctuation(text):
    """Return `text` without any character listed in `my_punctuation`."""
    kept_characters = (char for char in text if char not in my_punctuation)
    return "".join(kept_characters)

def remove_stopwords(text):
    """Tokenize `text` and drop English stopwords.

    Args:
        text: string to filter.

    Returns:
        The remaining tokens joined with single spaces.
    """
    # Load the stopword list once per call and keep it in a set: the original
    # re-evaluated stopwords.words('english') and scanned the list for every token.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(word for word in word_tokenize(text) if word not in english_stopwords)

# nltk.pos_tag() and WordNetLemmatizer() do not use the same tag naming convention,
# so this custom tagger translates one into the other
def nltk_pos_tagger(nltk_tag):
    """Map a tag from nltk.pos_tag() to the matching wordnet constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag returns None.
    """
    wordnet_by_first_letter = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Only the first letter decides the mapping; unknown or empty tags fall through to None
    return wordnet_by_first_letter.get(nltk_tag[:1])

def tag_lemmatize_sentence(sentence):
    """Lemmatize each token of `sentence` using its part-of-speech tag.

    Args:
        sentence: string to tokenize, tag and lemmatize.

    Returns:
        The tokens joined with single spaces; a token whose tag has no
        wordnet equivalent is kept unchanged.
    """
    # Tokenize, then tag every token
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    lemmas = []
    for token, treebank_tag in tagged_tokens:
        # Translate the tag so WordNetLemmatizer can interpret it
        wordnet_tag = nltk_pos_tagger(treebank_tag)
        lemmas.append(token if wordnet_tag is None else lemmatizer.lemmatize(token, wordnet_tag))
    return " ".join(lemmas)


In [59]:
%%time
# Clean & process each trial's criteria: strip punctuation, drop stopwords, then lemmatize
ProcessedEligibilityCriteria = [
    tag_lemmatize_sentence(remove_stopwords(remove_punctuation(text)))
    for text in NormalizeEligibilityCriteria
]

CPU times: user 34.8 s, sys: 2.41 s, total: 37.3 s
Wall time: 37.3 s


In [60]:
ProcessedEligibilityCriteria[:10]

['inclusion criteria||age ≥ 18 year time screening|ECOG 0 KPS 100 performance status ≤ 2|written inform consent obtain patient|Histologicallyally Histologically document stage 3b-4 lung cancer accord version 8 international association study lung cancer stag system|patients stage 1 3 undergo radical therapy disease free survival dfs 12 months|willingness ability comply schedule visit study procedures||exclusion criteria||history another primary malignancy except malignancy Prior Therapy curative intent know active disease ≥ 5 year date inform consent|without sign informed consent|unwillingness inability comply schedule visit study procedures|previously diagnose vte signing inform consent',
 'inclusion criteria||20 pack year smoke history|clinical need diagnostic bronchoscopy consent study drive bronchoscopy||exclusion criteria||lung cancer within 3 month date enrollment',
 'inclusion criteria||aged 40-74 years|resident hexi district tianjin city least 3 years|having self-reported histo

### Separate the Eligibility Criteria

In [61]:
# Separate the InclusionCriteria from the ExclusionCriteria:
# skip the first 18 characters (the length of 'inclusion criteria') and stop at the exclusion heading.
# NOTE(review): when 'exclusion criteria' is absent, find() returns -1 and the slice only drops the last character.
InclusionCriteria = [
    criteria[slice(18, criteria.find('exclusion criteria'))]
    for criteria in ProcessedEligibilityCriteria
]
InclusionCriteria[:5]

['||age ≥ 18 year time screening|ECOG 0 KPS 100 performance status ≤ 2|written inform consent obtain patient|Histologicallyally Histologically document stage 3b-4 lung cancer accord version 8 international association study lung cancer stag system|patients stage 1 3 undergo radical therapy disease free survival dfs 12 months|willingness ability comply schedule visit study procedures||',
 '||20 pack year smoke history|clinical need diagnostic bronchoscopy consent study drive bronchoscopy||',
 '||aged 40-74 years|resident hexi district tianjin city least 3 years|having self-reported history malignant tumor||',
 '||men woman diagnose lung cancer type stage confirm 12 month recruitment period pathologist||aged least18 year diagnosis|patients provide inform consent form||',
 '||diagnosis suspect lung cancer lung cancer||']

In [62]:
# Separate the ExclusionCriteria from the InclusionCriteria:
# keep everything after the 18-character 'exclusion criteria' heading.
# NOTE(review): when the heading is absent, find() returns -1, so the slice starts at index 17
# and almost the whole text is treated as exclusion criteria.
ExclusionCriteria = [
    criteria[slice(criteria.find('exclusion criteria') + 18, None)]
    for criteria in ProcessedEligibilityCriteria
]
ExclusionCriteria[:5]

['||history another primary malignancy except malignancy Prior Therapy curative intent know active disease ≥ 5 year date inform consent|without sign informed consent|unwillingness inability comply schedule visit study procedures|previously diagnose vte signing inform consent',
 '||lung cancer within 3 month date enrollment',
 '||pregnant woman exclude',
 '||patients provide inform consent form|patients mental psychological disorder accord treat clinician',
 '||inability undergo therapy']

In [63]:
# Split each trial's InclusionCriteria on '|' into a list of criteria (giving a list of lists).
# Empty strings produced by successive separators ('||') are dropped, since trials use both '|' and '||'.
InclusionCriteriaList = [
    [criterion for criterion in criteria.split("|") if criterion != '']
    for criteria in InclusionCriteria
]
InclusionCriteriaList[:2]

[['age ≥ 18 year time screening',
  'ECOG 0 KPS 100 performance status ≤ 2',
  'written inform consent obtain patient',
  'Histologicallyally Histologically document stage 3b-4 lung cancer accord version 8 international association study lung cancer stag system',
  'patients stage 1 3 undergo radical therapy disease free survival dfs 12 months',
  'willingness ability comply schedule visit study procedures'],
 ['20 pack year smoke history',
  'clinical need diagnostic bronchoscopy consent study drive bronchoscopy']]

In [64]:
# Split each trial's ExclusionCriteria on '|' into a list of criteria (giving a list of lists).
# Empty strings produced by successive separators ('||') are dropped, since trials use both '|' and '||'.
ExclusionCriteriaList = [
    [criterion for criterion in criteria.split("|") if criterion != '']
    for criteria in ExclusionCriteria
]
ExclusionCriteriaList[:2]

[['history another primary malignancy except malignancy Prior Therapy curative intent know active disease ≥ 5 year date inform consent',
  'without sign informed consent',
  'unwillingness inability comply schedule visit study procedures',
  'previously diagnose vte signing inform consent'],
 ['lung cancer within 3 month date enrollment']]

### Convert the List of Lists of IC into Stacked DataFrame of Processed and Raw IC

In [65]:
# The raw IC will be added to the same dataframe as the processed IC

# Cut the raw inclusion section out of the lowercased, otherwise unprocessed text
# (same slicing as for the processed text: skip 18 characters, stop at the exclusion heading)
InclusionCriteriaRaw = [
    criteria[slice(18, criteria.find('exclusion criteria'))]
    for criteria in df['EligibilityCriteria'].astype(str).str.lower().tolist()
]
# Split each trial's raw inclusion text on '|'; drop the empty strings left by '||'
# and the lone ':' left over from the 'inclusion criteria:' heading
InclusionCriteriaRawList = [
    [criterion for criterion in criteria.split("|") if criterion not in ('', ':')]
    for criteria in InclusionCriteriaRaw
]
InclusionCriteriaRawList[:2]

[['age ≥ 18 years at the time of screening.',
  'eastern cooperative oncology group performance status of ≤ 2.',
  'written informed consent obtained from the patient.',
  'histologically and cytologically documented stage 3b-4 lung cancer (according to version 8 of the international association for the study of lung cancer staging system).',
  'patients with stage 1 to 3, who undergo radical therapy with disease free survival (dfs) >12 months.',
  'willingness and ability to comply with scheduled visits and other study procedures.'],
 ['20 or more pack year smoking history',
  'clinical need for diagnostic bronchoscopy or consent to study driven bronchoscopy']]

In [66]:
# Create the DataFrame that has the stacked processed IC

NCTId = df['NCTId'].tolist()

# One record per criterion: (trial id, 1-based criterion number, criterion text).
# The frame is built in a single call because DataFrame.append was removed in pandas 2.0
# and appending inside the loop copied the whole frame on every iteration.
ICStacked = pd.DataFrame(
    [(trial_id, number, criterion)
     for trial_id, criteria in zip(NCTId, InclusionCriteriaList)
     for number, criterion in enumerate(criteria, start=1)],
    columns=['NCTId', '#IC', 'IC'],
)
ICStacked['#IC']  = ICStacked['#IC'].astype(int)
ICStacked.head()

  import sys


Unnamed: 0,NCTId,#IC,IC
0,NCT03581708,1,age ≥ 18 year time screening
1,NCT03581708,2,ECOG 0 KPS 100 performance status ≤ 2
2,NCT03581708,3,written inform consent obtain patient
3,NCT03581708,4,Histologicallyally Histologically document sta...
4,NCT03581708,5,patients stage 1 3 undergo radical therapy dis...


In [67]:
# Create the DataFrame that has the stacked raw IC

NCTId = df['NCTId'].tolist()

# One record per criterion: (trial id, 1-based criterion number, raw criterion text).
# The frame is built in a single call because DataFrame.append was removed in pandas 2.0
# and appending inside the loop copied the whole frame on every iteration.
ICStackedRaw = pd.DataFrame(
    [(trial_id, number, criterion)
     for trial_id, criteria in zip(NCTId, InclusionCriteriaRawList)
     for number, criterion in enumerate(criteria, start=1)],
    columns=['NCTId', '#IC', 'ICRaw'],
)
ICStackedRaw['#IC']  = ICStackedRaw['#IC'].astype(int)
ICStackedRaw.head()

  import sys


Unnamed: 0,NCTId,#IC,ICRaw
0,NCT03581708,1,age ≥ 18 years at the time of screening.
1,NCT03581708,2,eastern cooperative oncology group performance...
2,NCT03581708,3,written informed consent obtained from the pat...
3,NCT03581708,4,histologically and cytologically documented st...
4,NCT03581708,5,"patients with stage 1 to 3, who undergo radica..."


In [68]:
# Put the raw text next to the processed text of each inclusion criterion.
# NOTE(review): the assignment aligns on the row index, so it assumes both frames list the same
# criteria in the same order. The processed lists drop '' while the raw lists drop '' and ':';
# a criterion that becomes empty after cleaning would shift the rows -- TODO confirm the lengths match.
ICStacked['ICRaw'] = ICStackedRaw['ICRaw']
ICStacked.head()

Unnamed: 0,NCTId,#IC,IC,ICRaw
0,NCT03581708,1,age ≥ 18 year time screening,age ≥ 18 years at the time of screening.
1,NCT03581708,2,ECOG 0 KPS 100 performance status ≤ 2,eastern cooperative oncology group performance...
2,NCT03581708,3,written inform consent obtain patient,written informed consent obtained from the pat...
3,NCT03581708,4,Histologicallyally Histologically document sta...,histologically and cytologically documented st...
4,NCT03581708,5,patients stage 1 3 undergo radical therapy dis...,"patients with stage 1 to 3, who undergo radica..."


### Convert the List of Lists of EC into Stacked DataFrame of Processed and Raw EC

In [69]:
# The raw EC will be added to the same dataframe as the processed EC

# Cut the raw exclusion section out of the lowercased, otherwise unprocessed text.
# It gets its own name (ExclusionCriteriaRaw) so the processed `ExclusionCriteria` list is not
# overwritten and re-running the earlier processed-EC cells still gives the same result.
# NOTE(review): when 'exclusion criteria' is absent, find() returns -1 and the slice starts at
# index 17; this matches the processed-EC cell, which keeps the two stacked frames row-aligned.
ExclusionCriteriaRaw = [txt[txt.find('exclusion criteria')+18:] for txt in df['EligibilityCriteria'].astype(str).str.lower().tolist()]
# Split each trial's raw exclusion text on '|' into a list of criteria
ExclusionCriteriaRawList = list(map(lambda txt : txt.split("|"), ExclusionCriteriaRaw))
# Drop the empty strings left by successive '||' and the lone ':' left over from the heading
ExclusionCriteriaRawList = [list(filter(lambda txt: txt!= '' and txt!= ':', ltexts)) for ltexts in ExclusionCriteriaRawList]
ExclusionCriteriaRawList[:2]

[['history of another primary malignancy except for malignancy treated with curative intent with known active disease ≥ 5 years before date of the informed consent.',
  'without signed informed consent.',
  'unwillingness or inability to comply with scheduled visits or other study procedures.',
  'previously diagnosed with vte before signing informed consent.'],
 ['lung cancer within 3 months after the date of enrollment']]

In [70]:
# Create the DataFrame that has the stacked processed EC

NCTId = df['NCTId'].tolist()

# One record per criterion: (trial id, 1-based criterion number, criterion text).
# The frame is built in a single call because DataFrame.append was removed in pandas 2.0
# and appending inside the loop copied the whole frame on every iteration.
ECStacked = pd.DataFrame(
    [(trial_id, number, criterion)
     for trial_id, criteria in zip(NCTId, ExclusionCriteriaList)
     for number, criterion in enumerate(criteria, start=1)],
    columns=['NCTId', '#EC', 'EC'],
)
ECStacked['#EC']  = ECStacked['#EC'].astype(int)
ECStacked.head()

  import sys


Unnamed: 0,NCTId,#EC,EC
0,NCT03581708,1,history another primary malignancy except mali...
1,NCT03581708,2,without sign informed consent
2,NCT03581708,3,unwillingness inability comply schedule visit ...
3,NCT03581708,4,previously diagnose vte signing inform consent
4,NCT01130285,1,lung cancer within 3 month date enrollment


In [71]:
# Create the DataFrame that has the stacked raw EC

NCTId = df['NCTId'].tolist()

# One record per criterion: (trial id, 1-based criterion number, raw criterion text).
# The frame is built in a single call because DataFrame.append was removed in pandas 2.0
# and appending inside the loop copied the whole frame on every iteration.
ECStackedRaw = pd.DataFrame(
    [(trial_id, number, criterion)
     for trial_id, criteria in zip(NCTId, ExclusionCriteriaRawList)
     for number, criterion in enumerate(criteria, start=1)],
    columns=['NCTId', '#EC', 'ECRaw'],
)
ECStackedRaw['#EC']  = ECStackedRaw['#EC'].astype(int)
ECStackedRaw.head()

  import sys


Unnamed: 0,NCTId,#EC,ECRaw
0,NCT03581708,1,history of another primary malignancy except f...
1,NCT03581708,2,without signed informed consent.
2,NCT03581708,3,unwillingness or inability to comply with sche...
3,NCT03581708,4,previously diagnosed with vte before signing i...
4,NCT01130285,1,lung cancer within 3 months after the date of ...


In [72]:
# Put the raw text next to the processed text of each exclusion criterion.
# NOTE(review): the assignment aligns on the row index, so it assumes both frames list the same
# criteria in the same order. The processed lists drop '' while the raw lists drop '' and ':';
# a criterion that becomes empty after cleaning would shift the rows -- TODO confirm the lengths match.
ECStacked['ECRaw'] = ECStackedRaw['ECRaw']
ECStacked.head()

Unnamed: 0,NCTId,#EC,EC,ECRaw
0,NCT03581708,1,history another primary malignancy except mali...,history of another primary malignancy except f...
1,NCT03581708,2,without sign informed consent,without signed informed consent.
2,NCT03581708,3,unwillingness inability comply schedule visit ...,unwillingness or inability to comply with sche...
3,NCT03581708,4,previously diagnose vte signing inform consent,previously diagnosed with vte before signing i...
4,NCT01130285,1,lung cancer within 3 month date enrollment,lung cancer within 3 months after the date of ...


### Export Results

In [73]:
# Write both stacked frames to Excel files in the working directory (without the index column) ...
ICStacked.to_excel('ICStacked.xlsx', index=False)
ECStacked.to_excel('ECStacked.xlsx', index=False)
# ... then hand each file to google.colab's files.download
files.download('ICStacked.xlsx')
files.download('ECStacked.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### N-gram phrases of IC and EC with a frequency of at least 20

In [74]:
def ngrams_split(text, ngram):
    """Tokenize `text` and return its n-grams of size `ngram`, each as a space-joined string."""
    tokens = word_tokenize(text)
    return [' '.join(gram) for gram in ngrams(tokens, ngram)]

def ngrams_frequency(text, ngram):
  """Count the n-grams of `text` and return (phrase, count) pairs, most frequent first."""
  phrases = ngrams_split(text, ngram)
  return nltk.FreqDist(phrases).most_common()

def save_xls(list_dfs, xls_path):
  """Write each DataFrame to its own sheet of a single Excel workbook.

  Args:
    list_dfs: iterable of DataFrames; the n-th one goes to the sheet named 'sheet<n>'.
    xls_path: path of the workbook to create.
  """
  # Leaving the `with` block saves and closes the workbook, so no explicit
  # writer.save() is needed (that method was removed in pandas 2.0).
  with ExcelWriter(xls_path) as writer:
      for n, frame in enumerate(list_dfs):
          frame.to_excel(writer, sheet_name='sheet%s' % n)

In [99]:
### Join all texts into one text
# NOTE(review): joining every criterion with a space lets n-grams span two neighbouring criteria.
AllInclusionCriteria = " ".join(ICStacked['IC'].astype(str).tolist())

with ExcelWriter('/content/IC Frequency.xlsx') as writer:
  # One sheet per n-gram size, from 2 to 6 words
  for ngram in range(2,7):
    # Keep the phrases that occur at least 20 times
    ICNgramsFqFiltered = [row for row in ngrams_frequency(AllInclusionCriteria, ngram) if row[1]>=20]
    # The sheet name is kept short: the former 'IC with N Words per sentence.xlsx' was 33 characters,
    # over Excel's 31-character limit for sheet names. sheet_name is passed by keyword.
    pd.DataFrame(ICNgramsFqFiltered, columns=['Phrase', 'Frequency']).to_excel(writer, sheet_name='IC %d-word phrases' % ngram, index=False)

files.download('/content/IC Frequency.xlsx')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [98]:
### Join all texts into one text
# NOTE(review): joining every criterion with a space lets n-grams span two neighbouring criteria.
AllExclusionCriteria = " ".join(ECStacked['EC'].astype(str).tolist())

with ExcelWriter('/content/EC Frequency.xlsx') as writer:
  # One sheet per n-gram size, from 2 to 6 words
  for ngram in range(2,7):
    # Keep the phrases that occur at least 20 times
    ECNgramsFqFiltered = [row for row in ngrams_frequency(AllExclusionCriteria, ngram) if row[1]>=20]
    # The sheet name is kept short: the former 'EC with N Words per sentence.xlsx' was 33 characters,
    # over Excel's 31-character limit for sheet names. sheet_name is passed by keyword.
    pd.DataFrame(ECNgramsFqFiltered, columns=['Phrase', 'Frequency']).to_excel(writer, sheet_name='EC %d-word phrases' % ngram, index=False)

files.download('/content/EC Frequency.xlsx')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>