<a href="https://colab.research.google.com/github/MWFK/NLP-Semantic-Similarity/blob/main/ClinicalTrials/Data%20Engineering/06.%20Filters_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Objectives

1. Download the lung cancer data with specific features.
2. Filter The lung cancer data according to the user input by matching them with the features values.


### Libs

In [117]:
import re
import pandas as pd
import numpy as np
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

### Data

In [158]:
####### Search Expression #######
# Lung Cancer

####### Study Fields #######
'''
NCTId, OrgFullName, OfficialTitle, OverallStatus, Phase, DetailedDescription, 
Condition, EligibilityCriteria, HealthyVolunteers, Gender, MinimumAge, StudyPopulation, 
LocationFacility, LocationCity, LocationCountry, LocationStatus
'''

####### Range Min_MAX ######
# 1 to 1000

####### Format #######
# CSV

url = 'https://clinicaltrials.gov/api/query/study_fields?expr=lung+cancer&fields=NCTId%2C+OrgFullName%2C+OfficialTitle%2C+OverallStatus%2C+Phase%2C+DetailedDescription%2C+%0D%0ACondition%2C+EligibilityCriteria%2C+HealthyVolunteers%2C+Gender%2C+MinimumAge%2C+StudyPopulation%2C+%0D%0ALocationFacility%2C+LocationCity%2C+LocationCountry%2C+LocationStatus&min_rnk=1&max_rnk=1000&fmt=csv'
session = requests.Session()
retry   = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://' , adapter)
session.mount('https://', adapter)

clinicaltrials = session.get(url)
print('Download Request Status: ', clinicaltrials.status_code)

csv_file = open('/content/'+str(1)+'-batch.csv', 'wb')
csv_file.write(clinicaltrials.content)
csv_file.close()

raw = pd.read_csv(r'/content/1-batch.csv', skiprows=10)
print(raw.shape)
raw.head()


Download Request Status:  200
(1000, 17)


Unnamed: 0,Rank,NCTId,OrgFullName,OfficialTitle,OverallStatus,Phase,DetailedDescription,Condition,EligibilityCriteria,HealthyVolunteers,Gender,MinimumAge,StudyPopulation,LocationFacility,LocationCity,LocationCountry,LocationStatus
0,1,NCT03581708,Guangdong Provincial People's Hospital,Real-world Study of the Incidence and Risk Fac...,Not yet recruiting,,VTE has high incidence in lung cancer and incr...,Lung Neoplasms|Venous Thromboembolism,Inclusion Criteria:||Age ≥ 18 years at the tim...,No,All,18 Years,Patients diagnosed with advanced staged lung c...,Guangdong General Hospital,Guangzhou,China,
1,2,NCT01130285,University of Toledo,Validation of a Multi-gene Test for Lung Cance...,"Active, not recruiting",,"Because more than 160,000 individuals die of l...",Lung Cancer,Inclusion Criteria:||20 or more pack year smok...,Accepts Healthy Volunteers,All,50 Years,The study population will consist of subjects ...,National Jewish Health|University of Michigan|...,Denver|Ann Arbor|Detroit|Rochester|Cleveland|C...,United States|United States|United States|Unit...,
2,3,NCT03992833,Tianjin Medical University Cancer Institute an...,Methods of Computed Tomography Screening and M...,Recruiting,Not Applicable,"In this population-based study, participants w...",Lung Neoplasms|Computed Tomography|Mass Screen...,Inclusion Criteria:||Aged 40-74 years;|Residen...,Accepts Healthy Volunteers,All,40 Years,,Tianjin Medical University Cancer Institute An...,Tianjin,China,Recruiting
3,4,NCT02725892,AstraZeneca,LuCaReAl: Lung Cancer Registry in Algeria.,Completed,,The study consists of:||All patients meeting i...,Oncology & Epidemiology & Lung Cancer,Inclusion Criteria:||Men or women diagnosed wi...,No,All,,each sanitary region defined by the Ministry o...,Research Site|Research Site|Research Site,Algiers|Constantine|Oran,Algeria|Algeria|Algeria,
4,5,NCT00897650,Vanderbilt-Ingram Cancer Center,Molecular Fingerprints in Lung Cancer: Predict...,Completed,,OBJECTIVES:||To determine protein and/or RNA e...,Lung Cancer,Inclusion criteria||Diagnosis of suspected lun...,No,All,,People who have or may have lung cancer.,Vanderbilt-Ingram Cancer Center,Nashville,United States,


### Filter by [1] HealthyVolunteers

In [55]:
df = raw
print('Data dimensions before Filtering : ', df.shape, '\n')
print(df['HealthyVolunteers'].unique())
print(df['HealthyVolunteers'].nunique())
print(df['HealthyVolunteers'].value_counts())
print(df.loc[df['HealthyVolunteers'] == 'nan'].shape)

Data dimensions before Filtering :  (1000, 17) 

['No' 'Accepts Healthy Volunteers' nan]
2
No                            855
Accepts Healthy Volunteers    129
Name: HealthyVolunteers, dtype: int64
(0, 17)


In [56]:
print(df['HealthyVolunteers'].unique())

df['HealthyVolunteers'] = df['HealthyVolunteers'].replace('No', 'no')
df['HealthyVolunteers'] = df['HealthyVolunteers'].replace('Accepts Healthy Volunteers', 'yes')
df['HealthyVolunteers'] = df['HealthyVolunteers'].replace(np.nan, 'yes_no')

print(df['HealthyVolunteers'].unique())
print(df['HealthyVolunteers'].value_counts())

['No' 'Accepts Healthy Volunteers' nan]
['no' 'yes' 'yes_no']
no        855
yes       129
yes_no     16
Name: HealthyVolunteers, dtype: int64


In [59]:
df = raw
HealthyVolunteers_Input = input("Are you a healthy volunteer? (Example: yes ; no))")
print(HealthyVolunteers_Input)

df = df.loc[df['HealthyVolunteers'].isin([HealthyVolunteers_Input, 'yes_no'])] 
print(df['HealthyVolunteers'].unique())
print(df['HealthyVolunteers'].value_counts())

Are you a healthy volunteer? (yes/no)no
no
['no' 'yes_no']
no        855
yes_no     16
Name: HealthyVolunteers, dtype: int64


### Filtering by [2] Age

In [None]:
df = raw
print('Data dimensions before Filtering : ', df.shape, '\n')
df['MinimumAge'] = df['MinimumAge'].replace(np.nan, '0 Months')
print(df['MinimumAge'].value_counts())

In [144]:
# convert ages to month base
def ages_to_months(ages):
  return pd.Series([int(age[:age.find('Years')])*12 if (age.find('Years')!=-1) else int(age[:age.find('Months')]) for age in ages.tolist()])

# ages = pd.Series(['18 Years', '99 Months', '7 Months', '6 Years', '0 Months'])
# ages_to_months(ages)

0    216
1     99
2      7
3     72
4      0
dtype: int64

In [145]:
Age_Input = pd.Series(input("Can we know your age: (Example: 29 Years ; 9 Months)"))
print('\n', Age_Input)

df = df[ages_to_months(df['MinimumAge']) <= ages_to_months(Age_Input).tolist()[0]]
print(df.shape)

Can we know your age: (Example: 29 Years ; 9 Months)50 Years

 0    50 Years
dtype: object
(944, 17)


### Filtering by [3] Gender

In [179]:
df = raw
print(df['Gender'].unique())
print(df['Gender'].value_counts())
df['Gender'] = df['Gender'].replace(np.nan, 'All')
print(df['Gender'].value_counts())

Gender_Input = input("Can we know your Gender: (Example: Male ; Female ; All)")
print('\n', Gender_Input)

df = df[df['Gender'].isin([Gender_Input, 'All'])]
print(df.shape)

['All' nan 'Female']
All       990
Female      8
Name: Gender, dtype: int64
All       992
Female      8
Name: Gender, dtype: int64
Can we know your Gender: (Example: Male ; Female ; All)Female

 Female
(1000, 18)
