<a href="https://colab.research.google.com/github/MWFK/NLP-Semantic-Similarity/blob/main/ClinicalTrials/Models/00.%20Model_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Objectives

Get and process data, then apply Count Vectorizer followed by Cosine Similarity.

### Libs

In [1]:
# We'll use it to download final results
from google.colab import files

# Python libs to manipulate dataframes and arrays
import pandas as pd
import numpy as np
import string

# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# NLTK
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Accelerate processing
from functools import lru_cache

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Data

In [2]:
def get_data():

  # Download Clinical Trials data
  print('Downloading Clinical Trials Data')
  ct_dt = pd.read_csv(r'https://raw.githubusercontent.com/MWFK/NLP-Semantic-Similarity/main/ClinicalTrials/Data/Batches_0.csv', sep=',', engine='python', encoding="utf-8")
  for btch in range(1, 4):
      url = 'https://raw.githubusercontent.com/MWFK/NLP-Semantic-Similarity/main/ClinicalTrials/Data/Batches_' +str(btch)+ '.csv'
      tmp = pd.read_csv(url, sep=',', engine='python', encoding="ISO-8859-1")
      ct_dt = ct_dt.append(tmp, ignore_index=True)
  ct_dt['AllLocation'] = ct_dt['LocationCity'].str.lower().map(str) + ' | ' + ct_dt['LocationState'].str.lower().map(str) + ' | ' + ct_dt['LocationCountry'].str.lower().map(str)
  print('Clinical Trials Data: ',ct_dt.shape, '\n')

  # Download User input data
  print('Downloading Test data')
  test = pd.read_csv('https://raw.githubusercontent.com/MWFK/NLP-Semantic-Similarity/main/ClinicalTrials/Data/TestData.csv', sep=';', engine='python', encoding = "utf-8", skiprows=[0], names=['PatientID','ConditionOrDisease','Age','Gender','LocationCountry','TravelDistance','InclusionCriteria'])
  print('Test Data: ', test.shape)

  return ct_dt, test

# ctdt, test = get_data()

### Data Processing

In [3]:
# Strip Leading and Trailing Space
def cleansing(data):
  cols = data.select_dtypes(['object']).columns
  data[cols] = data[cols].apply(lambda x: x.str.strip().fillna(''))
  return data

In [4]:
def data_filtering(ct_dt, test):

  print('Data dimensions before Filtering : ', ct_dt.shape, '\n')

  ### Filtering by Age ###
  print('Filtering by Age...')
  tmp = ct_dt[ct_dt.iloc[:,13] <= test.iloc[0,2]]               # compare numerics
  tmp = tmp[tmp.iloc[:,13].str.find(test.iloc[0,2][-5:]) != -1] # Detect the Year/Month
  print('Data dimensions: ', tmp.shape, '\n')

  ### Filtering by Gender ###
  print('Filtering by Gender...')
  tmp = tmp[(tmp.iloc[:,12] == test.iloc[0,3]) | (tmp.iloc[:,12] == 'All')] 
  print('Data dimensions: ', tmp.shape, '\n')

  ### Filtering by Travel Distance ###
  print('Filtering by Travel Distance...')
  tmp = tmp[tmp.iloc[:,20].str.find(test.iloc[0,5].lower()) != -1] 
  print('Data dimensions: ', tmp.shape, '\n')

  return tmp

# filtered = data_filtering(ctdt, test)

### Vectorize the Data

In [5]:
# Sources
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# https://scikit-learn.org/stable/modules/feature_extraction.html

stopwords = set(stopwords.words('english'))
# The Lemmatizer uses nltk 'punkt' and 'wordnet'.
@lru_cache(maxsize=10000)
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t.lower()) for t in word_tokenize(doc) if t.lower() not in stopwords] # Lemmatize words that are not stopwords, so in the countvectorizer it does not make a conflict with the stop word parameter

In [6]:
def vectorization(test, filtered, ctdt):

  ctdt['InclusionCriteria'] = ctdt['InclusionCriteria'].fillna(' ')
  filtered['InclusionCriteria']   = filtered['InclusionCriteria'].fillna(' ')

  cv      = CountVectorizer(encoding='utf-8', decode_error='ignore', strip_accents='ascii', lowercase=True, tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 2), max_features=50000)
  cv_ctdt = cv.fit(ctdt['InclusionCriteria'])

  cv_filtered = cv_ctdt.transform(filtered['InclusionCriteria'])

  cv_test0    = cv_ctdt.transform(test.iloc[:1,6].fillna(' '))

  return cv_test0, cv_filtered


# filtered = data_processing(ctdt)
# cv_test0, cv_filtered = vectorization(test.iloc[:1,], filtered, ctdt)
# filtered['Similarity'] = pd.Series(cosine_similarity(cv_test0, cv_filtered)[0]).values

### Modeling

In [7]:
def get_results(cv_test0, cv_filtered, filtered, ctdt):

  filtered['Similarity'] = pd.Series(cosine_similarity(cv_test0, cv_filtered)[0]).values

  ctdt_filtered = ctdt
  ctdt_filtered['Similarity'] = 0
  print('Dataset shape: ', ctdt_filtered.shape)
  print('Filtered Dataset shape: ', filtered.shape)

  # Take off the existing NCTId from the whole dataset then add the filter data to 
  ctdt_filtered = ctdt_filtered[~ctdt_filtered['NCTId'].isin(filtered['NCTId'])]
  ctdt_filtered = ctdt_filtered.append(filtered, ignore_index=True)

  print('\nScore > 0 ', ctdt_filtered[ctdt_filtered['Similarity']>0].shape)
  print('Score > 0.05', ctdt_filtered[ctdt_filtered['Similarity']>0.05].shape)
  print('Score > 0.10', ctdt_filtered[ctdt_filtered['Similarity']>0.1].shape)
  print('Score > 0.15', ctdt_filtered[ctdt_filtered['Similarity']>0.15].shape)
  print('Score > 0.20', ctdt_filtered[ctdt_filtered['Similarity']>0.2].shape)
  print('Score > 0.25', ctdt_filtered[ctdt_filtered['Similarity']>0.25].shape)
  print('Score > 0.30', ctdt_filtered[ctdt_filtered['Similarity']>0.3].shape)

  ctdt_filtered['Similarity'] = ctdt_filtered['Similarity'].apply(lambda score: score if score>0.1 else 0) # if score>0.25
  ctdt_filtered = ctdt_filtered.sort_values(by=['Similarity'], ascending=False)

  return ctdt_filtered

  #get_results(cv_test0, cv_filtered, filtered, ctdt).head(2)

### Execution Test

In [8]:
# # Get The Clinical Trials Data, and the Test data
# ctdt, test = get_data()

# # Clean Data
# ctdt = cleansing(ctdt)
# test = cleansing(test)

# # Filter the Clinical Trials Data based on the test data
# filtered = data_filtering(ctdt, test.iloc[:1,])

# # Vectorize the filtered data and the test data
# cv_test0, cv_filtered = vectorization(test.iloc[:1,], filtered, ctdt)

# # Get the final results and it's related stats
# ctdt_filtered = get_results(cv_test0, cv_filtered, filtered, ctdt)
# ctdt_filtered.head(2)

In [9]:
# print(*ctdt_filtered.iloc[:1,0])

### Exporting Results

In [10]:
# Get The Clinical Trials Data, and the Test data
ctdt, test = get_data()

# Clean Data
ctdt = cleansing(ctdt)
test = cleansing(test)

for index, row in test.iterrows():

    #print(index, row['InclusionCriteria'])
    print('\n###################################')
    print('Processing the user input: [', index,']')
    print('###################################\n')

    # Filter the Clinical Trials Data based on the test data
    filtered = data_filtering(ctdt, test.iloc[index:index+1,])

    # Vectorize the filtered data and the test data
    cv_test0, cv_filtered = vectorization(test.iloc[index:index+1,], filtered, ctdt)

    # Get the final results and it's related stats
    get_results(cv_test0, cv_filtered, filtered, ctdt).to_excel('/content/Patient_'+str(index)+'.xlsx', index=False)

Downloading Clinical Trials Data
Clinical Trials Data:  (10152, 21) 

Downloading Test data
Test Data:  (7, 7)

###################################
Processing the user input: [ 0 ]
###################################

Data dimensions before Filtering :  (10152, 21) 

Filtering by Age...
Data dimensions:  (9517, 21) 

Filtering by Gender...
Data dimensions:  (9403, 21) 

Filtering by Travel Distance...
Data dimensions:  (645, 21) 



  "The parameter 'token_pattern' will not be used"


Dataset shape:  (10152, 22)
Filtered Dataset shape:  (645, 22)

Score > 0  (626, 22)
Score > 0.05 (549, 22)
Score > 0.10 (351, 22)
Score > 0.15 (94, 22)
Score > 0.20 (6, 22)
Score > 0.25 (0, 22)
Score > 0.30 (0, 22)

###################################
Processing the user input: [ 1 ]
###################################

Data dimensions before Filtering :  (10152, 22) 

Filtering by Age...
Data dimensions:  (9173, 22) 

Filtering by Gender...
Data dimensions:  (9138, 22) 

Filtering by Travel Distance...
Data dimensions:  (372, 22) 



  "The parameter 'token_pattern' will not be used"


Dataset shape:  (10152, 22)
Filtered Dataset shape:  (372, 22)

Score > 0  (367, 22)
Score > 0.05 (301, 22)
Score > 0.10 (118, 22)
Score > 0.15 (12, 22)
Score > 0.20 (0, 22)
Score > 0.25 (0, 22)
Score > 0.30 (0, 22)

###################################
Processing the user input: [ 2 ]
###################################

Data dimensions before Filtering :  (10152, 22) 

Filtering by Age...
Data dimensions:  (9650, 22) 

Filtering by Gender...
Data dimensions:  (9535, 22) 

Filtering by Travel Distance...
Data dimensions:  (68, 22) 



  "The parameter 'token_pattern' will not be used"


Dataset shape:  (10152, 22)
Filtered Dataset shape:  (68, 22)

Score > 0  (67, 22)
Score > 0.05 (25, 22)
Score > 0.10 (3, 22)
Score > 0.15 (0, 22)
Score > 0.20 (0, 22)
Score > 0.25 (0, 22)
Score > 0.30 (0, 22)

###################################
Processing the user input: [ 3 ]
###################################

Data dimensions before Filtering :  (10152, 22) 

Filtering by Age...
Data dimensions:  (105, 22) 

Filtering by Gender...
Data dimensions:  (104, 22) 

Filtering by Travel Distance...
Data dimensions:  (23, 22) 



  "The parameter 'token_pattern' will not be used"


Dataset shape:  (10152, 22)
Filtered Dataset shape:  (23, 22)

Score > 0  (22, 22)
Score > 0.05 (15, 22)
Score > 0.10 (8, 22)
Score > 0.15 (3, 22)
Score > 0.20 (0, 22)
Score > 0.25 (0, 22)
Score > 0.30 (0, 22)

###################################
Processing the user input: [ 4 ]
###################################

Data dimensions before Filtering :  (10152, 22) 

Filtering by Age...
Data dimensions:  (9565, 22) 

Filtering by Gender...
Data dimensions:  (9520, 22) 

Filtering by Travel Distance...
Data dimensions:  (9520, 22) 



  "The parameter 'token_pattern' will not be used"


Dataset shape:  (10152, 22)
Filtered Dataset shape:  (9520, 22)

Score > 0  (9149, 22)
Score > 0.05 (7480, 22)
Score > 0.10 (2696, 22)
Score > 0.15 (192, 22)
Score > 0.20 (11, 22)
Score > 0.25 (2, 22)
Score > 0.30 (0, 22)

###################################
Processing the user input: [ 5 ]
###################################

Data dimensions before Filtering :  (10152, 22) 

Filtering by Age...
Data dimensions:  (9565, 22) 

Filtering by Gender...
Data dimensions:  (9451, 22) 

Filtering by Travel Distance...
Data dimensions:  (471, 22) 



  "The parameter 'token_pattern' will not be used"


Dataset shape:  (10152, 22)
Filtered Dataset shape:  (471, 22)

Score > 0  (464, 22)
Score > 0.05 (436, 22)
Score > 0.10 (351, 22)
Score > 0.15 (149, 22)
Score > 0.20 (4, 22)
Score > 0.25 (0, 22)
Score > 0.30 (0, 22)

###################################
Processing the user input: [ 6 ]
###################################

Data dimensions before Filtering :  (10152, 22) 

Filtering by Age...
Data dimensions:  (9212, 22) 

Filtering by Gender...
Data dimensions:  (9175, 22) 

Filtering by Travel Distance...
Data dimensions:  (9175, 22) 



  "The parameter 'token_pattern' will not be used"


Dataset shape:  (10152, 22)
Filtered Dataset shape:  (9175, 22)

Score > 0  (8930, 22)
Score > 0.05 (8108, 22)
Score > 0.10 (6032, 22)
Score > 0.15 (2337, 22)
Score > 0.20 (138, 22)
Score > 0.25 (4, 22)
Score > 0.30 (1, 22)


### Downloading results

In [11]:
for index in range(7):
  files.download('Patient_'+str(index)+'.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>