<a href="https://colab.research.google.com/github/MWFK/NLP-Semantic-Similarity/blob/main/ClinicalTrials/00.%20Model_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Objectives

Get and process data, then apply Count Vectorizer followed by Cosine Similarity.

### Libs

In [1]:
# Python libs to manipulate dataframes and arrays
import pandas as pd
import numpy as np

# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

### Data

In [24]:
def get_data():

  # Download Clinical Trials data
  print('Downloading Clinical Trials Data')
  ct_dt = pd.read_csv(r'https://raw.githubusercontent.com/MWFK/NLP-Semantic-Similarity/main/ClinicalTrials/Data/Batches_0.csv', sep=',', engine='python', encoding="utf-8")
  for btch in range(1, 4):
      url = 'https://raw.githubusercontent.com/MWFK/NLP-Semantic-Similarity/main/ClinicalTrials/Data/Batches_' +str(btch)+ '.csv'
      tmp = pd.read_csv(url, sep=',', engine='python', encoding="ISO-8859-1")
      ct_dt = ct_dt.append(tmp, ignore_index=True)
  ct_dt['AllLocation'] = ct_dt['LocationCity'].str.lower().map(str) + ' | ' + ct_dt['LocationState'].str.lower().map(str) + ' | ' + ct_dt['LocationCountry'].str.lower().map(str)
  print('Clinical Trials Data: ',ct_dt.shape, '\n')

  # Download User input data
  print('Downloading Test data')
  test = pd.read_csv('https://raw.githubusercontent.com/MWFK/NLP-Semantic-Similarity/main/ClinicalTrials/Data/TestData.csv', sep=';', engine='python', encoding = "utf-8", skiprows=[0], names=['PatientID','ConditionOrDisease','Age','Gender','LocationCountry','TravelDistance','InclusionCriteria'])
  print('Test Data: ', test.shape)

  return ct_dt, test

ctdt, test = get_data()

Downloading Clinical Trials Data
Clinical Trials Data:  (10152, 21) 

Downloading Test data
Test Data:  (7, 7)


### Data Processing

In [101]:
# Strip Leading and Trailing Space
def cleansing(data):
  cols = data.select_dtypes(['object']).columns
  data[cols] = data[cols].apply(lambda x: x.str.strip().fillna(''))
  return data

In [103]:
def data_filtering(ct_dt, test):

  print('Data dimensions before Filtering : ', ct_dt.shape, '\n')

  ### Filtering by Age ###
  print('Filtering by Age...')
  tmp = ct_dt[ct_dt.iloc[:,13] <= test.iloc[0,2]]               # compare numerics
  tmp = tmp[tmp.iloc[:,13].str.find(test.iloc[0,2][-5:]) != -1] # Detect the Year/Month
  print('Data dimensions: ', tmp.shape, '\n')

  ### Filtering by Gender ###
  print('Filtering by Gender...')
  tmp = tmp[(tmp.iloc[:,12] == test.iloc[0,3]) | (tmp.iloc[:,12] == 'All')] 
  print('Data dimensions: ', tmp.shape, '\n')

  ### Filtering by Travel Distance ###
  print('Filtering by Travel Distance...')
  tmp = tmp[tmp.iloc[:,20].str.find(test.iloc[0,5].lower()) != -1] 
  print('Data dimensions: ', tmp.shape, '\n')

  return tmp

# filtered = data_filtering(ctdt, test)

### Vectorize the Data

In [4]:
def vectorization(test, filtered, ctdt):

  ctdt['InclusionCriteria'] = ctdt['InclusionCriteria'].fillna(' ')
  filtered['InclusionCriteria']   = filtered['InclusionCriteria'].fillna(' ')

  cv          = CountVectorizer(stop_words='english')
  cv_ctdt     = cv.fit(ctdt['InclusionCriteria'])

  cv_filtered = cv_ctdt.transform(filtered['InclusionCriteria'])

  cv_test0    = cv_ctdt.transform(test.iloc[:1,6].fillna(' '))

  return cv_test0, cv_filtered


# filtered = data_processing(ctdt)
# cv_test0, cv_filtered = vectorization(test.iloc[:1,], filtered, ctdt)
# filtered['Similarity'] = pd.Series(cosine_similarity(cv_test0, cv_filtered)[0]).values

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


### Modeling

In [5]:
def get_results(cv_test0, cv_filtered, filtered, ctdt):

  filtered['Similarity'] = pd.Series(cosine_similarity(cv_test0, cv_filtered)[0]).values

  ctdt_filtered = ctdt
  ctdt_filtered['Similarity'] = 0
  print('Dataset shape: ', ctdt_filtered.shape)
  print('Filtered Dataset shape: ', filtered.shape)

  # Take off the existing NCTId from the whole dataset then add the filter data to 
  ctdt_filtered = ctdt_filtered[~ctdt_filtered['NCTId'].isin(filtered['NCTId'])]
  ctdt_filtered = ctdt_filtered.append(filtered, ignore_index=True)

  print('\nScore > 0 ', ctdt_filtered[ctdt_filtered['Similarity']>0].shape)
  print('Score > 0.20', ctdt_filtered[ctdt_filtered['Similarity']>0.2].shape)
  print('Score > 0.25', ctdt_filtered[ctdt_filtered['Similarity']>0.25].shape)
  print('Score > 0.30', ctdt_filtered[ctdt_filtered['Similarity']>0.3].shape)

  ctdt_filtered['Similarity'] = ctdt_filtered['Similarity'].apply(lambda score: score if score>0.2 else 0) # if score>0.25
  ctdt_filtered = ctdt_filtered.sort_values(by=['Similarity'], ascending=False)

  return ctdt_filtered

  #get_results(cv_test0, cv_filtered, filtered, ctdt).head(2)

### Execution Test

In [102]:
# Get The Clinical Trials Data, and the Test data
ctdt, test = get_data()

# Clean Data
ctdt = cleansing(ctdt)
test = cleansing(test)

# Filter the Clinical Trials Data based on the test data
filtered = data_filtering(ctdt, test.iloc[4:5,])

# Vectorize the filtered data and the test data
cv_test0, cv_filtered = vectorization(test.iloc[4:5,], filtered, ctdt)

# Get the final results and it's related stats
get_results(cv_test0, cv_filtered, filtered, ctdt).head(2)

Downloading Clinical Trials Data
Clinical Trials Data:  (10152, 21) 

Downloading Test data
Test Data:  (7, 7)
Data dimensions before Filtering :  (10152, 21) 

Filtering by Age...
Data dimensions:  (9565, 21) 

Filtering by Gender...
Data dimensions:  (9520, 21) 

Filtering by Travel Distance...
Data dimensions:  (9520, 21) 

Dataset shape:  (10152, 22)
Filtered Dataset shape:  (9520, 22)

Score > 0  (9004, 22)
Score > 0.20 (1371, 22)
Score > 0.25 (449, 22)
Score > 0.30 (87, 22)


Unnamed: 0,Rank,NCTId,OrgFullName,OfficialTitle,OverallStatus,Keyword,DetailedDescription,Condition,EligibilityCriteria,InclusionCriteria,ExclusionCriteria,HealthyVolunteers,Gender,MinimumAge,StudyPopulation,LocationFacility,LocationCity,LocationState,LocationZip,LocationCountry,AllLocation,Similarity
968,379,NCT03084692,University of Alberta,Feasibility and Preliminary Efficacy of a Comb...,Completed,caregiver|lung cancer|yoga|resistance exercise,To determine the feasibility of a combined the...,Lung Cancer,Inclusion Criteria:||Diagnosis of Primary Lung...,Diagnosis of Primary Lung Cancer Stage I-IV (a...,Metastatic lung disease that would preclude sa...,Accepts Healthy Volunteers,All,18 Years,,University of Alberta/ Cross Cancer Institute,Edmonton,Alberta,T6G 2G4,Canada,edmonton | alberta | canada,0.42543
2789,2303,NCT00312819,Hadassah Medical Organization,Randomized Phase 2 Trial of Treatment of Advan...,Completed,Non-small cel lung cancer|chemotherapy|randomi...,Metastatic non small cell lung cancer remains ...,Non-small Cell Lung Cancer,Inclusion Criteria:||Metastatic non-small cell...,Metastatic non-small cell lung cancer|Previous...,Over 18|Willingness to abstain from alcohol|No...,No,Female,18 Years,,Oncology Institute Meir Medical Center,Kfar Saba,,,Israel,kfar saba | nan | israel,0.409946


### Exporting Results

In [107]:
# Get The Clinical Trials Data, and the Test data
ctdt, test = get_data()

# Clean Data
ctdt = cleansing(ctdt)
test = cleansing(test)

for index, row in test.iterrows():

    #print(index, row['InclusionCriteria'])
    print('\n###################################')
    print('Processing the user input: [', index,']')
    print('###################################\n')

    # Filter the Clinical Trials Data based on the test data
    filtered = data_filtering(ctdt, test.iloc[index:index+1,])

    # Vectorize the filtered data and the test data
    cv_test0, cv_filtered = vectorization(test.iloc[index:index+1,], filtered, ctdt)

    # Get the final results and it's related stats
    get_results(cv_test0, cv_filtered, filtered, ctdt).to_excel('/content/'+str(index)+'.xlsx')

Downloading Clinical Trials Data
Clinical Trials Data:  (10152, 21) 

Downloading Test data
Test Data:  (7, 7)

###################################
Processing the user input: [ 0 ]
###################################

Data dimensions before Filtering :  (10152, 21) 

Filtering by Age...
Data dimensions:  (9517, 21) 

Filtering by Gender...
Data dimensions:  (9403, 21) 

Filtering by Travel Distance...
Data dimensions:  (645, 21) 

Dataset shape:  (10152, 22)
Filtered Dataset shape:  (645, 22)

Score > 0  (615, 22)
Score > 0.20 (93, 22)
Score > 0.25 (29, 22)
Score > 0.30 (10, 22)

###################################
Processing the user input: [ 1 ]
###################################

Data dimensions before Filtering :  (10152, 22) 

Filtering by Age...
Data dimensions:  (9173, 22) 

Filtering by Gender...
Data dimensions:  (9138, 22) 

Filtering by Travel Distance...
Data dimensions:  (372, 22) 

Dataset shape:  (10152, 22)
Filtered Dataset shape:  (372, 22)

Score > 0  (363, 22)
Score