In [29]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
stop_words = set(stopwords.words('english'))

In [3]:
df = pd.read_csv('/home/maaz-lfd/Maaz/Thesis/Thesis/dataset/clinical_trials/round_5/round_5_CT_metdata.csv')['title']

In [4]:
research_titles = df.to_list()
len(research_titles)

7809

In [5]:
research_titles

['Debate: Transfusing to normal haemoglobin levels will not improve outcome',
 'The 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001',
 'Epicatechins Purified from Green Tea (Camellia sinensis) Differentially Suppress Growth of Gender-Dependent Human Cancer Cell Lines',
 'Markers of exacerbation severity in chronic obstructive pulmonary disease',
 'Transmission patterns of smallpox: systematic review of natural outbreaks in Europe and North America since World War II',
 'Ventilator associated pneumonia and infection control',
 'Open lung biopsy in early-stage acute respiratory distress syndrome',
 'The Waiting Time for Inter-Country Spread of Pandemic Influenza',
 'Surfactant therapy for acute respiratory failure in children: a systematic review and meta-analysis',
 'Clinical review: Update of avian influenza A infections in humans',
 'Studying copy number variations using a nanofluidic platform',
 'Nasal Delivery of an Adenovir

In [6]:
queries = ['Acellular vaccines for preventing whooping cough in children', 
'Acetylcysteine and carbocysteine for acute upper and lower respiratory tract infections in paediatric patients without chronic broncho-pulmonary disease', 
'Acyclovir for treating varicella in otherwise healthy children and adolescents', 
'Amantadine and rimantadine for inﬂuenza A in adults', 
'Amantadine and rimantadine for inﬂuenza A in children and the elderly', 
'Antibiotic prophylaxis for preventing meningitis in patients with basilar skull fractures', 
'Antibiotic prophylaxis to reduce respiratory tract infections and mortality in adults receiving intensive care', 
'Antibiotics for acute bronchitis', 
'Antibiotics for acute laryngitis in adults', 
'Antibiotics for acute maxillary sinusitis', 
'Antibiotics for acute otitis media in children', 
'Antibiotics for community acquired pneumonia in adult outpatients', 
'Antibiotics for community-acquired pneumonia in children', 
'Antibiotics for preventing complications in children with measles', 
'Antibiotics for preventing meningococcal infections', 
'Antibiotics for sore throat', 
'Antibiotics for the common cold and acute purulent rhinitis', 
'Antibiotics for the prevention of acute and chronic suppurative otitis media in children', 
'Antibiotics for whooping cough (pertussis)', 
'Azithromycin for acute lower respiratory tract infections', 
'Beta2-agonists for acute bronchitis', 
'Bronchodilators for bronchiolitis', 
'Chest physiotherapy for acute bronchiolitis in paediatric patients between 0 and 24 months old', 
'Chest physiotherapy for pneumonia in adults', 
'Chinese herbs combined with Western medicine for severe acute respiratory syndrome (SARS)', 
'Chinese medicinal herbs for acute bronchitis', 
'Chinese medicinal herbs for measles', 
'Chinese medicinal herbs for sore throat', 
'Chinese medicinal herbs for the common cold', 
'Combined DTP-HBV-HIB vaccine versus separately administered DTP-HBV and HIB vaccines for primary prevention of diphtheria, tetanus, pertussis, hepatitis B and Haemophilus inﬂuenza B (HIB)', 
'Continuous negative extrathoracic pressure or continuous positive airway pressure compared to conventional ventilation for acute hypoxaemic respiratory failure in children', 
'Corticosteroids for acute bacterial meningitis', 
'Decongestants and antihistamines for acute otitis media in children', 
'Different antibiotic treatments for group A streptococcal pharyngitis', 
'Echinacea for preventing and treating the common cold', 
'Empiric antibiotic coverage of atypical pathogens for community-acquired pneumonia in hospitalized adults', 
'Epinephrine for bronchiolitis', 
'Garlic for the common cold', 
'Glucocorticoids for acute viral bronchiolitis in infants and young children', 
'Glucocorticoids for croup']

In [7]:
len(queries)

40

#### Word Split

**Mapping on Research title**
1. **NLTK tokenizer** on quries
2. **Stopwords Removal** on quries
3. IF word of a query exists in Research title

**ToDo:**
1. Mapping on research abstract

In [15]:
processed_queries = [nltk.word_tokenize(query) for query in queries]

In [16]:
processed_queries = [[token.lower() for token in query if not token.lower() in stop_words] for query in processed_queries]

In [17]:
processed_queries[1]

['acetylcysteine',
 'carbocysteine',
 'acute',
 'upper',
 'lower',
 'respiratory',
 'tract',
 'infections',
 'paediatric',
 'patients',
 'without',
 'chronic',
 'broncho-pulmonary',
 'disease']

In [18]:
mapping = {}
for i in range(len(processed_queries)):
    mapping.update({int(i+1):[]})
    for title in research_titles:
        for word in processed_queries[i]:
            if word in title.lower():
                mapping[int(i+1)].append(title)
                break


In [19]:
mapping

{1: ['Surfactant therapy for acute respiratory failure in children: a systematic review and meta-analysis',
  'DNA Vaccines: Developing New Strategies against Cancer',
  'Confronting Potential Influenza A (H5N1) Pandemic with Better Vaccines',
  'Development of a Symptom Score for Clinical Studies to Identify Children With a Documented Viral Upper Respiratory Tract Infection',
  'Interstitial lung diseases in children',
  'Noninvasive positive pressure ventilation for acute respiratory failure in children: a concise review',
  'A Pilot Study of Host Genetic Variants Associated with Influenza-associated Deaths among Children and Young Adults',
  'Host-protective effect of circulating pentraxin 3 (PTX3) and complex formation with neutrophil extracellular traps',
  'Vaccines for the future: learning from human immunology',
  'Update on the Angiotensin Converting Enzyme 2-Angiotensin (1–7)-Mas Receptor Axis: Fetal Programing, Sex Differences, and Intracellular Pathways',
  'Glioblastoma ex

In [20]:
for i in range(1,41):
    print(i)
    print(len(mapping[i]))
    print('========================================')

1
536
2
2379
3
241
4
99
5
223
6
718
7
1454
8
449
9
521
10
451
11
671
12
419
13
427
14
277
15
432
16
19
17
526
18
883
19
741
20
1177
21
440
22
23
23
1539
24
339
25
2205
26
537
27
115
28
108
29
175
30
4903
31
1160
32
508
33
664
34
192
35
132
36
455
37
25
38
77
39
1333
40
5


#### TF-IDF

**Mapping on Research title**
1. **TF-IDF** vectorizer on queires and research titles
3. **stopword removal**
4. **lower case**
5. **Cosine similarity**

**ToDo:**
1. Mapping on research abstract

In [8]:
corpus = research_titles
research_titles.extend(queries)

In [9]:
vectorizer = TfidfVectorizer(stop_words='english')

In [10]:
corpus_vectors = vectorizer.fit_transform(corpus)

In [11]:
vectorizer.get_feature_names_out().tolist()

['001',
 '002',
 '005',
 '01',
 '015',
 '0201',
 '025',
 '03012',
 '04',
 '05',
 '055',
 '066',
 '07th',
 '10',
 '100',
 '1002',
 '101',
 '1016',
 '1061',
 '107',
 '10rb',
 '10th',
 '11',
 '110f',
 '1111',
 '116',
 '12',
 '125',
 '128',
 '12th',
 '13',
 '14',
 '142',
 '145',
 '15',
 '150',
 '1500g',
 '15021',
 '151',
 '16',
 '16th',
 '17',
 '1726',
 '1755',
 '178',
 '17a',
 '17th',
 '17β',
 '18',
 '184',
 '186',
 '18α',
 '18β',
 '19',
 '1918',
 '193',
 '1977',
 '1984',
 '1985',
 '1987',
 '1993',
 '1994',
 '1995',
 '1997',
 '1999',
 '1a',
 '1b',
 '1f6',
 '1j',
 '1α',
 '20',
 '2000',
 '2001',
 '2002',
 '2003',
 '2005',
 '2007',
 '2008',
 '2009',
 '2009pdm',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2020_166',
 '2050',
 '20th',
 '21',
 '21st',
 '22',
 '221',
 '223',
 '226v',
 '229e',
 '22bp',
 '23',
 '24',
 '247',
 '25',
 '26',
 '263',
 '26th',
 '27',
 '27th',
 '28',
 '28th',
 '29',
 '29th',
 '2a',
 '2b',
 '2c',
 '2nd',
 '2α',
 '3

In [12]:
corpus_vectors.shape

(7849, 9682)

In [13]:
analyze = vectorizer.build_analyzer()

In [14]:
analyze('Markers of exacerbation severity in chronic obstructive pulmonary disease')

['markers',
 'exacerbation',
 'severity',
 'chronic',
 'obstructive',
 'pulmonary',
 'disease']

In [40]:
x1 = vectorizer.transform(["Amantadine and rimantadine for inﬂuenza A in children and the elderly"]); x2 = vectorizer.transform(["Markers of exacerbation severity in chronic obstructive pulmonary disease"])
similarity = cosine_similarity(x1,x2)

In [47]:
x1 = vectorizer.transform(["Amantadine and rimantadine for inﬂuenza A in children and the elderly"]); x2 = vectorizer.transform(["Amantadine and rimantadine children and the elderly"])
similarity = cosine_similarity(x1,x2)

In [48]:
similarity.flatten()[0]

0.8623322219641323

In [16]:
mapping = []
for i in range(len(queries)):
    # mapping.update({int(i+1):[]})
    for title in research_titles:
        query_vector = vectorizer.transform([queries[i]]); title_vector = vectorizer.transform([title])
        mapping.append(cosine_similarity(query_vector,title_vector).flatten()[0] )

In [25]:
mapping = {}
for i in range(len(queries)):
    mapping.update({int(i+1):[]})
    for title in research_titles:
        query_vector = vectorizer.transform([queries[i]]); title_vector = vectorizer.transform([title])
        similarity = cosine_similarity(query_vector,title_vector).flatten()[0]
        if similarity != 0.0:
            mapping[int(i+1)].append( [round(similarity,2), title] )

In [20]:
mapping = [round(i,2) for i in mapping]

In [26]:
mapping

{1: [[0.08,
   'Surfactant therapy for acute respiratory failure in children: a systematic review and meta-analysis'],
  [0.09, 'DNA Vaccines: Developing New Strategies against Cancer'],
  [0.07,
   'Confronting Potential Influenza A (H5N1) Pandemic with Better Vaccines'],
  [0.06,
   'Development of a Symptom Score for Clinical Studies to Identify Children With a Documented Viral Upper Respiratory Tract Infection'],
  [0.11, 'Interstitial lung diseases in children'],
  [0.07,
   'Noninvasive positive pressure ventilation for acute respiratory failure in children: a concise review'],
  [0.06,
   'A Pilot Study of Host Genetic Variants Associated with Influenza-associated Deaths among Children and Young Adults'],
  [0.08, 'Vaccines for the future: learning from human immunology'],
  [0.06,
   'Relative Efficacy of AS03-Adjuvanted Pandemic Influenza A(H1N1) Vaccine in Children: Results of a Controlled, Randomized Efficacy Trial'],
  [0.05,
   'Prevalence and Incidence of Respiratory Sync

In [30]:
unique_vals = set(mapping)
for val in unique_vals:
    print(f"{val}\t{mapping.count(val)}")


AttributeError: 'dict' object has no attribute 'count'

#### Count Vectorizer

In [31]:
corpus = research_titles
research_titles.extend(queries)

In [32]:
vectorizer = CountVectorizer(stop_words='english')
corpus_vectors = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out().tolist()

['001',
 '002',
 '005',
 '01',
 '015',
 '0201',
 '025',
 '03012',
 '04',
 '05',
 '055',
 '066',
 '07th',
 '10',
 '100',
 '1002',
 '101',
 '1016',
 '1061',
 '107',
 '10rb',
 '10th',
 '11',
 '110f',
 '1111',
 '116',
 '12',
 '125',
 '128',
 '12th',
 '13',
 '14',
 '142',
 '145',
 '15',
 '150',
 '1500g',
 '15021',
 '151',
 '16',
 '16th',
 '17',
 '1726',
 '1755',
 '178',
 '17a',
 '17th',
 '17β',
 '18',
 '184',
 '186',
 '18α',
 '18β',
 '19',
 '1918',
 '193',
 '1977',
 '1984',
 '1985',
 '1987',
 '1993',
 '1994',
 '1995',
 '1997',
 '1999',
 '1a',
 '1b',
 '1f6',
 '1j',
 '1α',
 '20',
 '2000',
 '2001',
 '2002',
 '2003',
 '2005',
 '2007',
 '2008',
 '2009',
 '2009pdm',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2020_166',
 '2050',
 '20th',
 '21',
 '21st',
 '22',
 '221',
 '223',
 '226v',
 '229e',
 '22bp',
 '23',
 '24',
 '247',
 '25',
 '26',
 '263',
 '26th',
 '27',
 '27th',
 '28',
 '28th',
 '29',
 '29th',
 '2a',
 '2b',
 '2c',
 '2nd',
 '2α',
 '3

In [33]:
x1 = vectorizer.transform(["Amantadine and rimantadine for inﬂuenza A in children and the elderly"]); x2 = vectorizer.transform(["Amantadine and rimantadine children and the elderly"])
similarity = cosine_similarity(x1,x2)

In [34]:
similarity.flatten()[0]

0.8944271909999159

In [35]:
mapping = {}
for i in range(len(queries)):
    mapping.update({int(i+1):[]})
    for title in research_titles:
        query_vector = vectorizer.transform([queries[i]]); title_vector = vectorizer.transform([title])
        similarity = cosine_similarity(query_vector,title_vector).flatten()[0]
        if similarity != 0.0:
            mapping[int(i+1)].append( [round(similarity,2), title] )

In [36]:
mapping

{1: [[0.13,
   'Surfactant therapy for acute respiratory failure in children: a systematic review and meta-analysis'],
  [0.17, 'DNA Vaccines: Developing New Strategies against Cancer'],
  [0.15,
   'Confronting Potential Influenza A (H5N1) Pandemic with Better Vaccines'],
  [0.11,
   'Development of a Symptom Score for Clinical Studies to Identify Children With a Documented Viral Upper Respiratory Tract Infection'],
  [0.2, 'Interstitial lung diseases in children'],
  [0.13,
   'Noninvasive positive pressure ventilation for acute respiratory failure in children: a concise review'],
  [0.11,
   'A Pilot Study of Host Genetic Variants Associated with Influenza-associated Deaths among Children and Young Adults'],
  [0.18, 'Vaccines for the future: learning from human immunology'],
  [0.1,
   'Relative Efficacy of AS03-Adjuvanted Pandemic Influenza A(H1N1) Vaccine in Children: Results of a Controlled, Randomized Efficacy Trial'],
  [0.09,
   'Prevalence and Incidence of Respiratory Syncyt