<a href="https://colab.research.google.com/github/MWFK/NLP-Semantic-Similarity/blob/main/ClinicalTrials/Models/ctdt_FE_Cosine_CPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Libs


In [None]:
# Python libs to manipulate dataframes and arrays
import pandas as pd
import numpy as np

# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

3.6.0


# Feature Engineering 1
CountVectorizer and TF-IDF fitted on the InclusionCriteria of the whole dataset

In [None]:
# Python libs to manipulate dataframes and arrays
import pandas as pd
import numpy as np

# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity


def get_data():
  """Download the clinical-trials batches and the patient test set.

  Reads ``Batches_0.csv`` .. ``Batches_3.csv`` and ``TestData.csv`` from the
  URLs below, so network access is required.

  Returns:
    tuple: ``(ct_dt, test)``. ``ct_dt`` is the row-wise concatenation of the
    four batch files plus an ``AllLocation`` column holding the lower-cased
    ``city | state | country`` text. ``test`` is the patient table with its
    header row replaced by the seven column names given below.
  """
  data_url = 'https://raw.githubusercontent.com/MWFK/NLP-Semantic-Similarity/main/ClinicalTrials/Data/'

  # Download Clinical Trials data
  print('Downloading Clinical Trials Data')
  # Batch 0 is decoded as UTF-8 and batches 1-3 as ISO-8859-1, exactly as before.
  batches = [pd.read_csv(data_url + 'Batches_0.csv', sep=',', engine='python', encoding="utf-8")]
  for btch in range(1, 4):
    url = data_url + 'Batches_' + str(btch) + '.csv'
    batches.append(pd.read_csv(url, sep=',', engine='python', encoding="ISO-8859-1"))
  # Single concat instead of DataFrame.append in a loop: append was removed in
  # pandas 2.0 and re-copied the accumulated frame on every iteration.
  ct_dt = pd.concat(batches, ignore_index=True)
  ct_dt['AllLocation'] = ct_dt['LocationCity'].str.lower().map(str) + ' | ' + ct_dt['LocationState'].str.lower().map(str) + ' | ' + ct_dt['LocationCountry'].str.lower().map(str)
  print('Clinical Trials Data: ',ct_dt.shape, '\n')

  # Download User input data
  print('Downloading Test data')
  test = pd.read_csv(data_url + 'TestData.csv', sep=';', engine='python', encoding = "utf-8", skiprows=[0], names=['PatientID','ConditionOrDisease','Age','Gender','LocationCountry','TravelDistance','InclusionCriteria'])
  print('Test Data: ', test.shape)

  return ct_dt, test

# Load the full trials dataset (`ctdt`) and the patient table (`test`); get_data() reads from remote URLs.
ctdt, test = get_data()


def _leading_number(values):
  """Return the first number found in each string of `values` as floats (NaN when there is none)."""
  digits = values.astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False)
  return pd.to_numeric(digits, errors='coerce')


def data_processing(ct_dt, patients=None):
  """Keep the trials compatible with the first patient's age, gender and location text.

  Columns are addressed by position: trial columns 12, 13 and 20 and patient
  columns 2, 3 and 5 (presumably Gender / MinimumAge / AllLocation and
  Age / Gender / TravelDistance -- confirm against the CSV schema).

  Args:
    ct_dt: DataFrame of trials.
    patients: DataFrame of patients; only its first row is used. Defaults to
      the notebook-level ``test`` table so existing one-argument calls keep
      working.

  Returns:
    DataFrame: the rows of ``ct_dt`` that pass all three filters.

  Raises:
    ValueError: if the patient's age text contains no number.
  """
  if patients is None:
    patients = test  # notebook-level patient table

  patient_age      = patients.iloc[0, 2]
  patient_gender   = patients.iloc[0, 3]
  patient_location = patients.iloc[0, 5].lower()

  patient_age_value = _leading_number(pd.Series([patient_age])).iloc[0]
  if pd.isna(patient_age_value):
    raise ValueError('Patient age has no numeric part: ' + repr(patient_age))

  print('Data dimensions before Filtering : ', ct_dt.shape, '\n')

  ### Filtering by Age ###
  print('Filtering by Age...')
  # Compare the numeric part of the age. The previous string comparison was
  # lexicographic, so e.g. '7 Years' <= '60 Years' evaluated to False.
  # Rows without a parsable minimum age compare as NaN and are dropped.
  tmp = ct_dt[_leading_number(ct_dt.iloc[:,13]) <= patient_age_value]
  # Keep only trials whose age text contains the last five characters of the
  # patient's age text (the Year/Month unit).
  tmp = tmp[tmp.iloc[:,13].str.find(patient_age[-5:]) != -1]
  print('Data dimensions: ', tmp.shape, '\n')

  ### Filtering by Gender ###
  print('Filtering by Gender...')
  tmp = tmp[(tmp.iloc[:,12] == patient_gender) | (tmp.iloc[:,12] == 'All')]
  print('Data dimensions: ', tmp.shape, '\n')

  ### Filtering by Travel Distance ###
  print('Filtering by Travel Distance...')
  # Substring match of the lower-cased patient column 5 inside trial column 20.
  tmp = tmp[tmp.iloc[:,20].str.find(patient_location) != -1]
  print('Data dimensions: ', tmp.shape, '\n')

  return tmp

# Filter the dataset loaded in this cell. It is bound to `ctdt`; `ct_dt` is
# not defined at this point on a fresh kernel.
filtered = data_processing(ctdt)

Downloading Clinical Trials Data
Clinical Trials Data:  (10152, 21) 

Downloading Test data
Test Data:  (7, 7)
Data dimensions before Filtering :  (10152, 21) 

Filtering by Age...
Data dimensions:  (9517, 21) 

Filtering by Gender...
Data dimensions:  (9403, 21) 

Filtering by Travel Distance...
Data dimensions:  (645, 21) 



In [None]:
%%time

#[1] Data
ctdt['InclusionCriteria'] = ctdt['InclusionCriteria'].fillna(' ')   # the whole dataset
# `filtered` is a boolean-mask slice of the dataset; assign() builds a new
# frame instead of writing into the slice (avoids SettingWithCopyWarning).
filtered = filtered.assign(InclusionCriteria=filtered['InclusionCriteria'].fillna(' '))   # the subset of the data that we'll score
test0 = test.iloc[:,6].fillna(' ')                                  # the user input (all patients)

# Why TF-IDF rather than raw counts:
# - TF-IDF weighs each term by how informative it is, not only by how often it occurs.
# - The logarithmic IDF factor down-weights terms that appear in many documents.
#   Rare terms are NOT penalised by IDF (they get the highest IDF); here they are
#   limited only by `max_features`, which keeps the most frequent terms.
# - Less important terms can then be dropped, which reduces the input dimensions.
# - To capture the actual linguistic relationship between words, embedding
#   techniques such as "word2vec" and "GloVe" are widely used in NLP.

#[2] Instantiating the vectorisation and feature-engineering methods
cv = CountVectorizer(encoding='utf-8', decode_error='ignore', strip_accents='ascii', lowercase=True, stop_words='english', analyzer='word', ngram_range=(2, 2), max_features=100000)
tf = TfidfTransformer(use_idf=True)


#[3] Processing the whole dataset: vocabulary and IDF weights are learned here
cv_fitted_ctdt = cv.fit(ctdt['InclusionCriteria'])
# Transform the same frame that was fitted and NaN-filled above (`ctdt`);
# `ct_dt` is not defined at this point on a fresh kernel.
cv_trans_ctdt  = cv_fitted_ctdt.transform(ctdt['InclusionCriteria'])

tf_fitted_ctdt = tf.fit(cv_trans_ctdt)
tf_trans_ctdt  = tf_fitted_ctdt.transform(cv_trans_ctdt)


#[4] Processing the subset of the data that we'll score
cv_filtered = cv_fitted_ctdt.transform(filtered['InclusionCriteria'])
tf_filtered = tf_fitted_ctdt.transform(cv_filtered)


#[5] Processing the user input
cv_test0 = cv_fitted_ctdt.transform(test0)
tf_test0 = tf_fitted_ctdt.transform(cv_test0)

CPU times: user 5.97 s, sys: 39.3 ms, total: 6.01 s
Wall time: 6 s


In [None]:
# for index, row in test.iterrows():
#     #print(index, row['InclusionCriteria'])
#     print('Processing the user input: ', index)
#     test_row = test.iloc[index:index+1,6].fillna(' ') 
#     cv_row = cv_fitted_ctdt.transform(test_row)
#     tf_row = tf_fitted_ctdt.transform(cv_row)

Processing the user input:  0
Processing the user input:  1
Processing the user input:  2
Processing the user input:  3
Processing the user input:  4
Processing the user input:  5
Processing the user input:  6


In [None]:
# Score the filtered trials against the first patient (row 0 of tf_test0).
# The scores belong on `filtered`, the frame tf_filtered was built from;
# writing them to `tmp` left `filtered` without a Similarity column.
filtered = filtered.assign(Similarity=cosine_similarity(tf_test0, tf_filtered)[0])

# Work on a copy so the raw dataset `ctdt` is not mutated through an alias.
ctdt_filtered = ctdt.assign(Similarity=0.0)
print('Dataset shape: ', ctdt_filtered.shape)
print('Filtered Dataset shape: ', filtered.shape)

# Take the scored NCTIds out of the whole dataset, then add the scored rows back.
# (The result used to be assigned to a misspelled name, so it was never used
# and every score printed below was 0.)
ctdt_filtered = ctdt_filtered[~ctdt_filtered['NCTId'].isin(filtered['NCTId'])]
ctdt_filtered = pd.concat([ctdt_filtered, filtered], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

print('\nScore > 0 ', ctdt_filtered[ctdt_filtered['Similarity']>0].shape)
print('Score > 0.20', ctdt_filtered[ctdt_filtered['Similarity']>0.2].shape)
print('Score > 0.25', ctdt_filtered[ctdt_filtered['Similarity']>0.25].shape)
print('Score > 0.30', ctdt_filtered[ctdt_filtered['Similarity']>0.3].shape)

# Anything that is not strictly positive (including NaN) becomes 0.
ctdt_filtered['Similarity'] = ctdt_filtered['Similarity'].where(ctdt_filtered['Similarity'] > 0, 0) # if score>0.25
ctdt_filtered = ctdt_filtered.sort_values(by=['Similarity'], ascending=False)

Dataset shape:  (10152, 22)
Filtered Dataset shape:  (645, 21)

Score > 0  (0, 22)
Score > 0.20 (0, 22)
Score > 0.25 (0, 22)
Score > 0.30 (0, 22)


In [None]:
def get_results(tf_test0, tf_filtered, filtered, ctdt):
  """Rank every trial by cosine similarity to the first patient.

  Args:
    tf_test0: TF-IDF matrix of the patient text(s); only row 0 is scored.
    tf_filtered: TF-IDF matrix with one row per row of ``filtered``.
    filtered: DataFrame of candidate trials (must contain ``NCTId``).
    ctdt: DataFrame of all trials (must contain ``NCTId``).

  Returns:
    DataFrame: all trials with a ``Similarity`` column, sorted descending.
    Trials outside ``filtered`` and scores not above 0.1 are set to 0.
    Neither input frame is modified.
  """
  # Score the `filtered` argument itself (not a notebook global) on a copy.
  scored = filtered.assign(Similarity=cosine_similarity(tf_test0, tf_filtered)[0])

  # Copy instead of aliasing so the caller's `ctdt` does not gain a column.
  everything = ctdt.assign(Similarity=0.0)
  print('Dataset shape: ', everything.shape)
  print('Filtered Dataset shape: ', scored.shape)

  # Take the scored NCTIds out of the whole dataset, then add the scored rows back.
  remainder = everything[~everything['NCTId'].isin(scored['NCTId'])]
  ctdt_filtered = pd.concat([remainder, scored], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

  print('\nScore > 0 ', ctdt_filtered[ctdt_filtered['Similarity']>0].shape)
  print('Score > 0.20', ctdt_filtered[ctdt_filtered['Similarity']>0.2].shape)
  print('Score > 0.25', ctdt_filtered[ctdt_filtered['Similarity']>0.25].shape)
  print('Score > 0.30', ctdt_filtered[ctdt_filtered['Similarity']>0.3].shape)

  # Scores that are not above 0.1 (including NaN) are zeroed.
  ctdt_filtered['Similarity'] = ctdt_filtered['Similarity'].where(ctdt_filtered['Similarity'] > 0.1, 0) # if score>0.25
  ctdt_filtered = ctdt_filtered.sort_values(by=['Similarity'], ascending=False)

  # Return the local result (the misspelled global was returned before).
  return ctdt_filtered

# The inputs are the TF-IDF matrices and frames built in the cells above
# (the data have already been vectorized); show the first two rows of the result.
get_results(tf_test0, tf_filtered, filtered, ctdt).head(2)

Dataset shape:  (10152, 22)
Filtered Dataset shape:  (645, 21)

Score > 0  (0, 22)
Score > 0.20 (0, 22)
Score > 0.25 (0, 22)
Score > 0.30 (0, 22)


  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Rank,NCTId,OrgFullName,OfficialTitle,OverallStatus,Keyword,DetailedDescription,Condition,EligibilityCriteria,InclusionCriteria,ExclusionCriteria,HealthyVolunteers,Gender,MinimumAge,StudyPopulation,LocationFacility,LocationCity,LocationState,LocationZip,LocationCountry,AllLocation,Similarity
0,1,NCT03581708,Guangdong Provincial People's Hospital,Real-world Study of the Incidence and Risk Fac...,Not yet recruiting,lung cancer|Venous Thromboembolism,VTE has high incidence in lung cancer and incr...,Lung Neoplasms|Venous Thromboembolism,Inclusion Criteria:||Age ≥ 18 years at the tim...,Age ≥ 18 years at the time of screening.|Easte...,History of another primary malignancy except f...,No,All,18 Years,Patients diagnosed with advanced staged lung c...,Guangdong General Hospital,Guangzhou,Guagndong,510080,China,guangzhou | guagndong | china,0.0
1,2,NCT01130285,University of Toledo Health Science Campus,Validation of a Multi-gene Test for Lung Cance...,"Active, not recruiting",Lung Cancer,"Because more than 160,000 individuals die of l...",Lung Cancer,Inclusion Criteria:||20 or more pack year smok...,20 or more pack year smoking history|clinical ...,Lung Cancer within 3 months after the date of ...,Accepts Healthy Volunteers,All,50 Years,The study population will consist of subjects ...,National Jewish Health|University of Michigan|...,Denver|Ann Arbor|Detroit|Rochester|Cleveland|C...,Colorado|Michigan|Michigan|Minnesota|Ohio|Ohio...,80206|48109|48202|55905|44195|43221|43606|4360...,United States|United States|United States|Unit...,denver|ann arbor|detroit|rochester|cleveland|c...,0.0


In [None]:
# Show the first trial's inclusion criteria next to the patient input(s).
# `ct_dt_tmp` is only created in a later cell, so read from `ctdt`, the
# dataset loaded in this section.
print(ctdt['InclusionCriteria'].iloc[0])
print(*test0)

'Age ≥ 18 years at the time of screening.|Eastern Cooperative Oncology Group performance status of ≤ 2.|Written informed consent obtained from the patient.|Histologically and cytologically documented Stage 3B-4 lung cancer (according to Version 8 of the International Association for the Study of Lung Cancer Staging system).|Patients with stage 1 to 3, who undergo radical therapy with disease free survival (DFS) >12 months.|Willingness and ability to comply with scheduled visits and other study procedures.||'

### Feature Engineering 2
CountVectorizer and TF-IDF fitted on the filtered subset of the InclusionCriteria

In [None]:
# Python libs to manipulate dataframes and arrays
import pandas as pd
import numpy as np

# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

# To get the word vectors, you need a word embedding model. Let’s download the FastText model using gensim’s downloader api.
import gensim
print(gensim.__version__)

# upgrade gensim if you can't import softcossim
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess


def get_data():
  """Download the clinical-trials batches and the patient test set.

  Reads ``Batches_0.csv`` .. ``Batches_3.csv`` and ``TestData.csv`` from the
  URLs below, so network access is required.

  Returns:
    tuple: ``(ct_dt, test)``. ``ct_dt`` is the row-wise concatenation of the
    four batch files plus an ``AllLocation`` column holding the lower-cased
    ``city | state | country`` text. ``test`` is the patient table with its
    header row replaced by the seven column names given below.
  """
  data_url = 'https://raw.githubusercontent.com/MWFK/NLP-Semantic-Similarity/main/ClinicalTrials/Data/'

  # Download Clinical Trials data
  print('Downloading Clinical Trials Data')
  # Batch 0 is decoded as UTF-8 and batches 1-3 as ISO-8859-1, exactly as before.
  batches = [pd.read_csv(data_url + 'Batches_0.csv', sep=',', engine='python', encoding="utf-8")]
  for btch in range(1, 4):
    url = data_url + 'Batches_' + str(btch) + '.csv'
    batches.append(pd.read_csv(url, sep=',', engine='python', encoding="ISO-8859-1"))
  # Single concat instead of DataFrame.append in a loop: append was removed in
  # pandas 2.0 and re-copied the accumulated frame on every iteration.
  ct_dt = pd.concat(batches, ignore_index=True)
  ct_dt['AllLocation'] = ct_dt['LocationCity'].str.lower().map(str) + ' | ' + ct_dt['LocationState'].str.lower().map(str) + ' | ' + ct_dt['LocationCountry'].str.lower().map(str)
  print('Clinical Trials Data: ',ct_dt.shape, '\n')

  # Download User input data
  print('Downloading Test data')
  test = pd.read_csv(data_url + 'TestData.csv', sep=';', engine='python', encoding = "utf-8", skiprows=[0], names=['PatientID','ConditionOrDisease','Age','Gender','LocationCountry','TravelDistance','InclusionCriteria'])
  print('Test Data: ', test.shape)

  return ct_dt, test

# Load the full trials dataset (`ct_dt`) and the patient table (`test`); get_data() reads from remote URLs.
ct_dt, test = get_data()


def _leading_number(values):
  """Return the first number found in each string of `values` as floats (NaN when there is none)."""
  digits = values.astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False)
  return pd.to_numeric(digits, errors='coerce')


def data_processing(ct_dt, patients=None):
  """Keep the trials compatible with the first patient's age, gender and location text.

  Columns are addressed by position: trial columns 12, 13 and 20 and patient
  columns 2, 3 and 5 (presumably Gender / MinimumAge / AllLocation and
  Age / Gender / TravelDistance -- confirm against the CSV schema).

  Args:
    ct_dt: DataFrame of trials.
    patients: DataFrame of patients; only its first row is used. Defaults to
      the notebook-level ``test`` table so existing one-argument calls keep
      working.

  Returns:
    DataFrame: the rows of ``ct_dt`` that pass all three filters.

  Raises:
    ValueError: if the patient's age text contains no number.
  """
  if patients is None:
    patients = test  # notebook-level patient table

  patient_age      = patients.iloc[0, 2]
  patient_gender   = patients.iloc[0, 3]
  patient_location = patients.iloc[0, 5].lower()

  patient_age_value = _leading_number(pd.Series([patient_age])).iloc[0]
  if pd.isna(patient_age_value):
    raise ValueError('Patient age has no numeric part: ' + repr(patient_age))

  print('Data dimensions before Filtering : ', ct_dt.shape, '\n')

  ### Filtering by Age ###
  print('Filtering by Age...')
  # Compare the numeric part of the age. The previous string comparison was
  # lexicographic, so e.g. '7 Years' <= '60 Years' evaluated to False.
  # Rows without a parsable minimum age compare as NaN and are dropped.
  tmp = ct_dt[_leading_number(ct_dt.iloc[:,13]) <= patient_age_value]
  # Keep only trials whose age text contains the last five characters of the
  # patient's age text (the Year/Month unit).
  tmp = tmp[tmp.iloc[:,13].str.find(patient_age[-5:]) != -1]
  print('Data dimensions: ', tmp.shape, '\n')

  ### Filtering by Gender ###
  print('Filtering by Gender...')
  tmp = tmp[(tmp.iloc[:,12] == patient_gender) | (tmp.iloc[:,12] == 'All')]
  print('Data dimensions: ', tmp.shape, '\n')

  ### Filtering by Travel Distance ###
  print('Filtering by Travel Distance...')
  # Substring match of the lower-cased patient column 5 inside trial column 20.
  tmp = tmp[tmp.iloc[:,20].str.find(patient_location) != -1]
  print('Data dimensions: ', tmp.shape, '\n')

  return tmp

# `tmp` holds the trials that passed the age, gender and location filters for the first patient.
tmp = data_processing(ct_dt)

3.6.0
Downloading Clinical Trials Data
Clinical Trials Data:  (10152, 21) 

Downloading Test data
Test Data:  (7, 7)
Data dimensions before Filtering :  (10152, 21) 

Filtering by Age...
Data dimensions:  (9517, 21) 

Filtering by Gender...
Data dimensions:  (9403, 21) 

Filtering by Travel Distance...
Data dimensions:  (645, 21) 



In [None]:
%%time

#[1] Data
ct_dt['InclusionCriteria'] = ct_dt['InclusionCriteria'].fillna(' ')   # the whole dataset
# `tmp` is a boolean-mask slice of the dataset; assign() builds a new frame
# instead of writing into the slice (avoids SettingWithCopyWarning).
tmp   = tmp.assign(InclusionCriteria=tmp['InclusionCriteria'].fillna(' '))   # the subset of the data that we'll score
test0 = test.iloc[:1,6].fillna(' ')                                   # the user input (first patient only)



#[2] Instantiating the vectorisation and feature-engineering methods
cv = CountVectorizer(encoding='utf-8', decode_error='ignore', strip_accents='ascii', lowercase=True, stop_words='english', analyzer='word', ngram_range=(2, 2), max_features=100000)
tf = TfidfTransformer(use_idf=True)


#[3] Processing the filtered subset: unlike Feature Engineering 1, the
#    vocabulary and IDF weights are learned from `tmp` only, not the whole dataset
cv_fitted_tmp = cv.fit(tmp['InclusionCriteria'])
cv_trans_tmp  = cv_fitted_tmp.transform(tmp['InclusionCriteria'])

tf_fitted_tmp = tf.fit(cv_trans_tmp)
tf_trans_tmp  = tf_fitted_tmp.transform(cv_trans_tmp)


#[4] Processing the user input
cv_test0 = cv_fitted_tmp.transform(test0)
tf_test0 = tf_fitted_tmp.transform(cv_test0)

CPU times: user 725 ms, sys: 8.36 ms, total: 734 ms
Wall time: 762 ms


In [None]:
# Cosine similarity between the patient (row 0) and every filtered trial.
# assign() returns a new frame, so nothing is written into a slice.
tmp = tmp.assign(Similarity=cosine_similarity(tf_test0, tf_trans_tmp)[0])

# Copy instead of aliasing: `ct_dt_tmp = ct_dt` made the next line add a
# Similarity column to the raw dataset as well.
ct_dt_tmp = ct_dt.assign(Similarity=0)
print(ct_dt_tmp.shape)
print(tmp.shape)

# Take the scored NCTIds out of the whole dataset, then add the scored rows back.
ct_dt_tmp = ct_dt_tmp[~ct_dt_tmp['NCTId'].isin(tmp['NCTId'])]
print(ct_dt_tmp.shape)
ct_dt_tmp = pd.concat([ct_dt_tmp, tmp], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

print(ct_dt_tmp[ct_dt_tmp['Similarity']>0.1].shape)
print(ct_dt_tmp[ct_dt_tmp['Similarity']>0.2].shape)
print(ct_dt_tmp[ct_dt_tmp['Similarity']>0.25].shape)
print(ct_dt_tmp[ct_dt_tmp['Similarity']>0.3].shape)

# Anything that is not strictly positive (including NaN) becomes 0.
ct_dt_tmp['Similarity'] = ct_dt_tmp['Similarity'].where(ct_dt_tmp['Similarity'] > 0, 0) # if score>0.25
ct_dt_tmp = ct_dt_tmp.sort_values(by=['Similarity'], ascending=False)

(10152, 22)
(645, 22)
(9507, 22)
(14, 22)
(1, 22)
(0, 22)
(0, 22)


In [None]:
# NOTE: `[0]` is a label lookup, so this shows the row whose index label is 0,
# which after the sort above is not necessarily the top-ranked trial.
ct_dt_tmp['InclusionCriteria'][0]

'Age ≥ 18 years at the time of screening.|Eastern Cooperative Oncology Group performance status of ≤ 2.|Written informed consent obtained from the patient.|Histologically and cytologically documented Stage 3B-4 lung cancer (according to Version 8 of the International Association for the Study of Lung Cancer Staging system).|Patients with stage 1 to 3, who undergo radical therapy with disease free survival (DFS) >12 months.|Willingness and ability to comply with scheduled visits and other study procedures.||'

In [None]:
# Print both texts: a bare expression that is not the last line of a cell is
# evaluated and discarded, so the trial criteria were never shown here.
print(ct_dt_tmp['InclusionCriteria'][0])
print(*test0)

Histologically diagnosed with metastatic non-small cell lung cancer in 2018 | Initially treated with pertuzumab but relapsed | His performance status is ECOG 1 or KPS 90 | His blood and liver function analysis show normal | No other indications like HIV, HCV, HBV | No allergies | Life expectancy over 6 months | No mental disabilities.
