In [2]:
import spacy 
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Medical newsgroup posts: the 'sci.med' category of the 20 Newsgroups dataset.
# The loader is asked to strip headers, footers and quoted lines from each post.
corpus = fetch_20newsgroups(
    subset='train',  # same as the loader's default, spelled out so the split in use is visible
    categories=['sci.med'],
    remove=('headers', 'footers', 'quotes'),
)

# Sanity check: number of posts loaded, then the text of the first one.
print(len(corpus.data))
print(corpus.data[0])

594
[reply to keith@actrix.gen.nz (Keith Stewart)]
 
 
It would help if you (and anyone else asking for medical information on
some subject) could ask specific questions, as no one is likely to type
in a textbook chapter covering all aspects of the subject.  If you are
looking for a comprehensive review, ask your local hospital librarian.
Most are happy to help with a request of this sort.
 
Briefly, this is a condition in which patients who have significant
residual weakness from childhood polio notice progression of the
weakness as they get older.  One theory is that the remaining motor
neurons have to work harder and so die sooner.


In [9]:
nlp = spacy.load('en_core_web_sm')


def custom_tokenizer(document):
    """Turn a text into a list of lemmas.

    The text is run through the spaCy pipeline held in ``nlp``. A token's
    lemma is kept only when the token is purely alphabetic and is not a stop
    word, whitespace or punctuation.
    """
    lemmas = []
    for token in nlp(document):
        # Drop stop words, whitespace and punctuation first ...
        if token.is_stop or token.is_space or token.is_punct:
            continue
        # ... then anything that is not made up of letters only.
        if not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# token_pattern=None: custom_tokenizer replaces the built-in regex tokenizer, and
# leaving the default pattern in place makes scikit-learn warn that it will be ignored.
tfidf = TfidfVectorizer(tokenizer=custom_tokenizer, token_pattern=None)

# Fit on the corpus and build the document-term TF-IDF matrix in one step.
features = tfidf.fit_transform(corpus.data)
features



<594x9343 sparse matrix of type '<class 'numpy.float64'>'
	with 36610 stored elements in Compressed Sparse Row format>

In [10]:
# Number of feature names, then the shape of the TF-IDF matrix, one per line.
print(len(tfidf.get_feature_names_out()), features.shape, sep="\n")

9343
(594, 9343)


In [14]:
# Peek at the first 50 feature names produced by the fitted vectorizer.
tfidf.get_feature_names_out()[:50]

array(['aaai', 'aanerud', 'aaron', 'aaronson', 'ab', 'abandon', 'abate',
       'abbott', 'abbreviation', 'abdoman', 'abdominal', 'abduction',
       'abeit', 'aberdeen', 'aberrant', 'aberration', 'abhin', 'ability',
       'abington', 'able', 'abner', 'abnormal', 'abnormality',
       'abnormally', 'abreast', 'abscess', 'absence', 'absolute',
       'absolutely', 'absorb', 'absorbed', 'absorbtion', 'absorption',
       'abstraction', 'absurd', 'absurdum', 'abundant', 'abuse', 'ac',
       'acad', 'academia', 'academic', 'academy', 'accentuate', 'accept',
       'acceptable', 'acceptance', 'accepted', 'access', 'accessible'],
      dtype=object)

In [15]:
# Printing the sparse matrix lists its stored entries as "(row, column)  weight".
print(features)

  (0, 7758)	0.08284201027681323
  (0, 2217)	0.12597510255397745
  (0, 3590)	0.11161414829062062
  (0, 9237)	0.08221171145463559
  (0, 5552)	0.14887426068161608
  (0, 5367)	0.16050196279367784
  (0, 6971)	0.13832885463553046
  (0, 8402)	0.11340707220042541
  (0, 5788)	0.10469823739884833
  (0, 6570)	0.1657304786600589
  (0, 5671)	0.11736887051651362
  (0, 6347)	0.1657304786600589
  (0, 1355)	0.15608131561699154
  (0, 9116)	0.3210039255873557
  (0, 7061)	0.17212966490573958
  (0, 7613)	0.12324185040268239
  (0, 6075)	0.08845546402994495
  (0, 1662)	0.11161414829062062
  (0, 1030)	0.1657304786600589
  (0, 7767)	0.10911888457553465
  (0, 7040)	0.14887426068161608
  (0, 3588)	0.15608131561699154
  (0, 4716)	0.19200735295878033
  (0, 3806)	0.1143474004419157
  (0, 4801)	0.11957591630829678
  :	:
  (593, 7601)	0.2539857068716477
  (593, 889)	0.2539857068716477
  (593, 5951)	0.2539857068716477
  (593, 1495)	0.2539857068716477
  (593, 2519)	0.20646304771957705
  (593, 5232)	0.2386046805999144
 

In [32]:
# Vectorize the query with the vectorizer already fitted on the corpus.
query = ['health']
query_tfidf = tfidf.transform(raw_documents=query)

# Compare every document row against the query and flatten the result to 1-D.
cosine_similarities = cosine_similarity(X=features, Y=query_tfidf).flatten()

In [33]:
# Document indices ordered by ascending similarity, so the best matches sit at the end.
np.argsort(cosine_similarities)

array([  0, 386, 387, 388, 389, 390, 391, 385, 392, 395, 396, 397, 398,
       399, 400, 393, 401, 384, 382, 366, 367, 368, 369, 370, 372, 383,
       373, 375, 377, 378, 379, 380, 381, 374, 364, 402, 404, 423, 424,
       425, 426, 427, 428, 422, 429, 431, 432, 433, 434, 435, 436, 430,
       403, 421, 419, 405, 406, 407, 408, 409, 410, 420, 411, 413, 414,
       415, 416, 417, 418, 412, 438, 363, 361, 308, 309, 310, 311, 312,
       313, 307, 314, 316, 317, 318, 319, 321, 322, 315, 323, 306, 304,
       290, 291, 292, 293, 294, 295, 305, 592, 298, 299, 300, 301, 302,
       303, 297, 362, 324, 326, 345, 346, 347, 348, 349, 351, 344, 352,
       354, 355, 356, 358, 359, 360, 353, 325, 343, 341, 327, 328, 329,
       330, 331, 332, 342, 333, 335, 336, 337, 338, 339, 340, 334, 289,
       439, 441, 535, 536, 537, 538, 539, 540, 534, 541, 543, 544, 545,
       546, 547, 549, 542, 550, 533, 531, 517, 518, 519, 520, 521, 522,
       532, 523, 525, 526, 527, 528, 529, 530, 524, 516, 552, 55

In [36]:
def top_k(arr, k):
    """Return the indices of the ``k`` largest entries of ``arr``, largest first.

    If ``k`` exceeds the number of entries, every index is returned.
    """
    # argsort is ascending, so reverse it and keep the leading k positions.
    descending = np.argsort(arr)[::-1]
    return descending[:k]

# Indices of the ten highest-scoring documents for the query, best match first.
top_related_indices = top_k(arr=cosine_similarities, k=10)
top_related_indices

array([ 73,  70, 570, 115, 175,  97, 462, 209,  83, 279], dtype=int64)

In [37]:
# Similarity scores of the selected documents, in the same order as the indices.
print(cosine_similarities[top_related_indices])

[0.2703794  0.25179011 0.2427181  0.17531343 0.17021368 0.16459357
 0.14024906 0.13939235 0.13470176 0.12875607]


In [41]:
import pandas as pd
def top_5_similarities(corpus, cosine_similarities, n):
    """Print the ``n`` documents with the highest similarity scores, best first.

    Despite the name, the number of documents is set by ``n``, not fixed at 5.
    Each document is followed by a separator line of 100 ``$`` characters.

    Args:
        corpus: object whose ``data`` attribute is an indexable sequence of
            document texts.
        cosine_similarities: similarity scores; position ``i`` is used to
            index ``corpus.data[i]``.
        n: how many documents to print. Values of zero or less print nothing;
            if ``n`` exceeds the number of scores, every document is printed once.
    """
    # Rank once up front: the ordering is the same on every iteration, and
    # looping over the indices actually returned avoids an IndexError when n
    # is larger than the number of documents. max(n, 0) keeps "print nothing"
    # for negative n, where a negative k would otherwise select most documents.
    for doc_index in top_k(cosine_similarities, max(n, 0)):
        print(corpus.data[doc_index])
        print("$"*100)

# Remove pandas' column-width limit for displayed frames.
# NOTE(review): the call below prints plain strings, which this option does not
# affect — confirm whether it is still needed.
pd.options.display.max_colwidth = None

# Print only the single best-matching document for the query.
top_5_similarities(corpus=corpus, cosine_similarities=cosine_similarities, n=1)

------------- cut here -----------------
Volume  6, Number 10                                           April 20, 1993

              +------------------------------------------------+
              !                                                !
              !              Health Info-Com Network           !
              !                Medical Newsletter              !
              +------------------------------------------------+
                         Editor: David Dodell, D.M.D.
    10250 North 92nd Street, Suite 210, Scottsdale, Arizona 85258-4599 USA
                          Telephone +1 (602) 860-1121
                              FAX +1 (602) 451-1165

Compilation Copyright 1993 by David Dodell,  D.M.D.  All  rights  Reserved.  
License  is  hereby  granted  to republish on electronic media for which no 
fees are charged,  so long as the text of this copyright notice and license 
are attached intact to any and all republished portion or portions.  

The Health Info-