<a href="https://colab.research.google.com/github/MWFK/NLP-Semantic-Similarity/blob/main/ClinicalTrials/Data%20Engineering/06.%20Filters_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Objectives

1. Download the lung cancer data with specific features.
2. Process HealthyVolunteer feature then use it as a filter.
3. Process Age feature then use it as a filter.
4. Procss Gender feature then use it as filter.
5. Procss Gender feature then use it as filter.
6. Process then filter by LocationStatus(LocationCountry, LocationCity, LocationFacility).
7. Process then filter by user willingness to travel destination(Country, City).
8. Process then filter by Condition.
9. Main function.

### Libs

In [24]:
import re
import pandas as pd
import numpy as np
import requests
from itertools import compress
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

### Data

In [25]:
def get_data():
  ####### Search Expression #######
  # Lung Cancer

  ####### Study Fields #######
  '''
  NCTId, OrgFullName, OfficialTitle, OverallStatus, Phase, DetailedDescription, 
  Condition, EligibilityCriteria, HealthyVolunteers, Gender, MinimumAge, StudyPopulation, 
  LocationFacility, LocationCity, LocationCountry, LocationStatus
  '''

  ####### Range Min_MAX ######
  # 1 to 1000

  ####### Format #######
  # CSV

  url = 'https://clinicaltrials.gov/api/query/study_fields?expr=lung+cancer&fields=NCTId%2C+OrgFullName%2C+OfficialTitle%2C+OverallStatus%2C+Phase%2C+DetailedDescription%2C+%0D%0ACondition%2C+EligibilityCriteria%2C+HealthyVolunteers%2C+Gender%2C+MinimumAge%2C+StudyPopulation%2C+%0D%0ALocationFacility%2C+LocationCity%2C+LocationCountry%2C+LocationStatus&min_rnk=1&max_rnk=1000&fmt=csv'
  session = requests.Session()
  retry   = Retry(connect=3, backoff_factor=0.5)
  adapter = HTTPAdapter(max_retries=retry)
  session.mount('http://' , adapter)
  session.mount('https://', adapter)

  clinicaltrials = session.get(url)
  print('Download Request Status: ', clinicaltrials.status_code)

  csv_file = open('/content/'+str(1)+'-batch.csv', 'wb')
  csv_file.write(clinicaltrials.content)
  csv_file.close()

  return pd.read_csv(r'/content/1-batch.csv', skiprows=10)

### Filter [1] by HealthyVolunteers

In [26]:
def F_HealthyVolunteers(df):
  print("Options before filtering with HealthyVolunteers: ", df.shape)
  df['HealthyVolunteers'] = df['HealthyVolunteers'].replace('Accepts Healthy Volunteers', 'Yes')
  df['HealthyVolunteers'] = df['HealthyVolunteers'].replace(np.nan, 'Yes_No')
  HealthyVolunteers_Input = input("Are you a healthy volunteer? (Example: Yes ; No))")
  df = df.loc[df['HealthyVolunteers'].isin([HealthyVolunteers_Input, 'Yes_No'])] 
  print("Options before filtering with HealthyVolunteers: ", df.shape)
  return df

### Filter [2] by Age

In [27]:
# convert ages to month base
def ages_to_months(ages):
  return pd.Series([int(age[:age.find('Years')])*12 if (age.find('Years')!=-1) else int(age[:age.find('Months')]) for age in ages.tolist()])

# Filtering by Age
def F_Age(df):
  print("Options before filtering with Age: ", df.shape)
  Age_Input = pd.Series(input("Can we know your age: (Example: 29 Years ; 9 Months)"))
  df['MinimumAge'] = df['MinimumAge'].replace(np.nan, '404 Years')
  df = df[ages_to_months(df['MinimumAge']) <= ages_to_months(Age_Input).tolist()[0]]
  print("Options before filtering with Age: ", df.shape)
  return df

### Filter [3] by Gender

In [28]:
def F_Gender(df):
  print("Options before filtering with Gender: ", df.shape)
  df['Gender'] = df['Gender'].replace(np.nan, 'All')
  Gender_Input = input("Can we know your Gender: (Example: Male ; Female ; All)")
  df = df[df['Gender'].isin([Gender_Input, 'All'])]
  print("Options before filtering with Gender: ", df.shape)
  return df

### Filter [4] by Phase

In [29]:
def F_Phase(df):

  print("Options before filtering with Phase: ", df.shape)

  df['Phase'] = df['Phase'].replace(np.nan, 'No Phase') 
  df['Phase'] = df['Phase'].replace('Not Applicable', 'No Phase') 

  Phase_Input = input("Which Phase are you in: (Example: Phase 1; Phase 2; Phase 3; Phase 4; No Phase): ")
  df = df[df['Phase'] == Phase_Input]

  print("Options after filtering with Phase: ", df.shape)

  return df

### Filter [5, 6, 7, 8] by LocationStatus

In [30]:
'''
If  LocationFacility == 'Recruiting'
Keep the following features: Location Facility, Location City, Location Country
'''

def F_LocationStatus(df):
  print("Options before filtering with Location Status: ", df.shape)

  lfacility = df['LocationFacility'].astype(str).to_list()
  lstatus   = df['LocationStatus'].astype(str).to_list()
  lcity     = df['LocationCity'].astype(str).to_list()
  lcountry  = df['LocationCountry'].astype(str).to_list()

  allfacility = [text.split('|') for text in lfacility]
  allstatus   = [text.split('|') for text in lstatus]
  allcity     = [text.split('|') for text in lcity]
  allcountry  = [text.split('|') for text in lcountry]

  # This mask is for detecting the 'Recruiting' status
  allmasks = []
  for onelist in allstatus:
    masks = []
    for status in onelist:
      if status == 'Recruiting':
        masks.append(True)
      else:
        masks.append(False)
    allmasks.append(masks)

  # Using the mask, we'll filter these features rows, it will onlykeep the 'Recruiting' 
  filtered_status = []
  for idx,x in enumerate(allstatus):
    filtered_status.append(list(compress(allstatus[idx], allmasks[idx])))

  filtered_country = []
  for idx,x in enumerate(allstatus):
    filtered_country.append(list(compress(allcountry[idx], allmasks[idx])))

  filtered_facility = []
  for idx,x in enumerate(allstatus):
    filtered_facility.append(list(compress(allfacility[idx], allmasks[idx])))

  filtered_city = []
  for idx,x in enumerate(allstatus):
    filtered_city.append(list(compress(allcity[idx], allmasks[idx])))

  df['LocationFacility'] = filtered_facility
  df['LocationStatus']   = filtered_status
  df['LocationCity']     = filtered_city
  df['LocationCountry']  = filtered_country
  df.head()

  # After appyling the mask the previous features, we'll only keep the row that have 'Recruiting' status.
  df = df[df['LocationFacility'].map(lambda location_list: len(location_list)) > 0]
  
  print("Options after filtering with Location Status: ", df.shape)
  return df

### Filtering with [9] User Travel Distance

In [31]:
# Filter the dataset based on the User willingness travel destination(Country, City)
def F_TravelDistance(df):
  print("Options before filtering with user Travel distance willingness: ", df.shape)

  ### Filter by Patient destination Country 
  TravelCountry_Input = input("To which Country are you willing to Travel(Examlpe: United Kingdom): ")
  df = df[df['LocationCountry'].map(lambda location_list: location_list.count(TravelCountry_Input)) > 0]
  print(df.shape)

  ### Filter by Patient destination City
  TravelCity_Input = input("To which City are you willing to Travel(Example: London): ")
  df = df[df['LocationCity'].map(lambda location_list: location_list.count(TravelCity_Input)) > 0]

  print("Options after filtering with user Travel distance willingness: ", df.shape)
  return df

### Filter By [10] Condition

In [32]:
def F_Condition(df):
  print("Options before filtering with Condition: ", df.shape)
  df['Condition'] = df['Condition'].replace(np.nan, 'No Condition')
  Condition_Input = input("Which Condition are looking for: (Example: Non-small Cell Lung Cancer ; Small Cell Lung Cancer): ")
  df = df[df['Condition'] == Condition_Input]
  print("Options after filtering with Condition: ", df.shape)
  return df

### Main

In [33]:
# Filters 1,2,3 [expression, study_fields, number of data points]
df = get_data() 
# Filter 4 [Age]
df = F_Age(df)  # Update this function
#  Filters 5,6,7,8 [Clinical Trial: LocationStatus, LocationCountry, LocationCity, LocationFacility]
df = F_LocationStatus(df) # 4 filters
# Filter 9,10 [User: Country, City]
df = F_TravelDistance(df)
# Filter 11 [Condition]
df = F_Condition(df)
# Filter 12 [Phase]
df = F_Phase(df)
# Filter 13 [Gender]
df = F_Gender(df)
# Filter 14 [Healthy Volunteer]
df = F_HealthyVolunteers(df)
# Display results
df

Download Request Status:  200
Options before filtering with Age:  (1000, 17)
Can we know your age: (Example: 29 Years ; 9 Months)60 Years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Options before filtering with Age:  (915, 17)
Options before filtering with Location Status:  (915, 17)
Options after filtering with Location Status:  (223, 17)
Options before filtering with user Travel distance willingness:  (223, 17)
To which Country are you willing to Travel(Examlpe: United Kingdom): United States
(65, 17)
To which City are you willing to Travel(Example: London): New York
Options after filtering with user Travel distance willingness:  (8, 17)
Options before filtering with Condition:  (8, 17)
Which Condition are looking for: (Example: Non-small Cell Lung Cancer ; Small Cell Lung Cancer): Lung Cancer


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Options after filtering with Condition:  (4, 17)
Options before filtering with Phase:  (4, 17)
Which Phase are you in: (Example: Phase 1; Phase 2; Phase 3; Phase 4; No Phase): Phase 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Options after filtering with Phase:  (1, 17)
Options before filtering with Gender:  (1, 17)
Can we know your Gender: (Example: Male ; Female ; All)Female
Options before filtering with Gender:  (1, 17)
Options before filtering with HealthyVolunteers:  (1, 17)
Are you a healthy volunteer? (Example: Yes ; No))No
Options before filtering with HealthyVolunteers:  (1, 17)


Unnamed: 0,Rank,NCTId,OrgFullName,OfficialTitle,OverallStatus,Phase,DetailedDescription,Condition,EligibilityCriteria,HealthyVolunteers,Gender,MinimumAge,StudyPopulation,LocationFacility,LocationCity,LocationCountry,LocationStatus
878,879,NCT03567642,Memorial Sloan Kettering Cancer Center,"Phase 1 Study of Combination Osimertinib, Plat...",Recruiting,Phase 1,,Lung Cancer,Inclusion Criteria:||Written informed consent|...,No,All,18 Years,,"[Memoral Sloan Kettering Cancer Center, Memori...","[Basking Ridge, Middletown, Montvale, Commack,...","[United States, United States, United States, ...","[Recruiting, Recruiting, Recruiting, Recruitin..."
