# Imports

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity


# Action story

In [2]:
corpus_1 =["Hercule Poirot has retired to the village of King’s Abbot to cultivate marrows.",
"But when wealthy Roger Ackroyd is found stabbed in his study, he agrees to investigate.",
"A typical village murder mystery; or so it seems until the last chapter with its stunning revelation.",
"This title would still be discussed today even if Christie had never written another book.",
"An unmissable, and still controversial, milestone of detective fiction",
"Sixteen years ago, Caroline Crale died in prison while serving a life sentence for poisoning her husband.",
"Her daughter asks Poirot to investigate a possible miscarriage of justice and he approaches the other five suspects.",
"This sublime novel is a subtle and ingenious detective story \
an elegiac love story and a masterful example of storytelling technique, \
with five separate accounts of one devastating event.",
"Christie’s greatest achievement.",
"In the village of Chipping Cleghorn,",
"a murder is announced in the local paper’s small ads.",
"As Miss Blacklock’s friends gather for what they fondly imagine will be a parlour game",
"an elaborate murder plot is set in motion. This was Christie’s 50th title and remains Miss Marple’s finest hour.",
"Notable also for its setting in post-war Britain\
(a factor vital to the plot) this is arguably the last of the ingeniously clued and perfectly paced Christies."]

# heart disease

In [3]:

corpus_2=[ "Causes of congenital heart defects",
           "Congenital heart defects usually develop while a baby is in the womb. Heart defects can develop as the heart develops, about a month after conception, changing the flow of blood in the heart.",
           "Some medical conditions, medications and genes may play a role in causing heart defects.",
           "Heart defects can also develop in adults. As you age, your heart's structure can change, causing a heart defect.",
           "Causes of cardiomyopathy",
           "The cause of cardiomyopathy, a thickening or enlarging of the heart muscle, may depend on the type:",
           "Dilated cardiomyopathy. The cause of this most common type of cardiomyopathy often is unknown. The condition usually causes the left ventricle to widen. ",
           "Dilated cardiomyopathy may be caused by reduced blood flow to the heart (ischemic heart disease) resulting from damage after a heart attack, infections, toxins and certain drugs, including those used to treat cancer.",
           "It may also be inherited from a parent.",
           "Hypertrophic cardiomyopathy. This type usually is passed down through families (inherited).", 
           "It can also develop over time because of high blood pressure or aging.",
           "Restrictive cardiomyopathy. This least common type of cardiomyopathy, which causes the heart muscle to become rigid and less elastic, can occur for no known reason. ",
           "Or it may be caused by diseases, such as connective tissue disorders or the buildup of abnormal proteins (amyloidosis)." ]



In [4]:
print(len(corpus_1))
print(len(corpus_2))

14
13


# tokenization & preprocessing

In [5]:
def tokenize(doc):
    return doc.split(" ")


def tokenize_corpus(corpus):
    vv=[]
    for doc in corpus:
        
        tokens = [x.lower() for x in tokenize(doc)  if x.isalpha()]
   
        vv.append(tokens)
    return vv

In [6]:
tokenized_corpus_1 = tokenize_corpus(corpus_1)
print(tokenized_corpus_1)

tokenized_corpus_2 = tokenize_corpus(corpus_2)
print(tokenized_corpus_2)

[['hercule', 'poirot', 'has', 'retired', 'to', 'the', 'village', 'of', 'abbot', 'to', 'cultivate'], ['but', 'when', 'wealthy', 'roger', 'ackroyd', 'is', 'found', 'stabbed', 'in', 'his', 'he', 'agrees', 'to'], ['a', 'typical', 'village', 'murder', 'or', 'so', 'it', 'seems', 'until', 'the', 'last', 'chapter', 'with', 'its', 'stunning'], ['this', 'title', 'would', 'still', 'be', 'discussed', 'today', 'even', 'if', 'christie', 'had', 'never', 'written', 'another'], ['an', 'and', 'still', 'milestone', 'of', 'detective', 'fiction'], ['sixteen', 'years', 'caroline', 'crale', 'died', 'in', 'prison', 'while', 'serving', 'a', 'life', 'sentence', 'for', 'poisoning', 'her'], ['her', 'daughter', 'asks', 'poirot', 'to', 'investigate', 'a', 'possible', 'miscarriage', 'of', 'justice', 'and', 'he', 'approaches', 'the', 'other', 'five'], ['this', 'sublime', 'novel', 'is', 'a', 'subtle', 'and', 'ingenious', 'detective', 'story', 'an', 'elegiac', 'love', 'story', 'and', 'a', 'masterful', 'example', 'of', 

In [7]:

def vocabulary_fun (tokenized_corpus):

    vocabulary = []
    for sentence in tokenized_corpus:
        for token in sentence:
            if token not in vocabulary:
                vocabulary.append(token)

    word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
    idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}
    return (vocabulary)

vocabulary_1=vocabulary_fun(tokenized_corpus_1)
print(len(vocabulary_1))
print(vocabulary_1)

vocabulary_2=vocabulary_fun(tokenized_corpus_2)
print(len(vocabulary_2))
print(vocabulary_2)

122
['hercule', 'poirot', 'has', 'retired', 'to', 'the', 'village', 'of', 'abbot', 'cultivate', 'but', 'when', 'wealthy', 'roger', 'ackroyd', 'is', 'found', 'stabbed', 'in', 'his', 'he', 'agrees', 'a', 'typical', 'murder', 'or', 'so', 'it', 'seems', 'until', 'last', 'chapter', 'with', 'its', 'stunning', 'this', 'title', 'would', 'still', 'be', 'discussed', 'today', 'even', 'if', 'christie', 'had', 'never', 'written', 'another', 'an', 'and', 'milestone', 'detective', 'fiction', 'sixteen', 'years', 'caroline', 'crale', 'died', 'prison', 'while', 'serving', 'life', 'sentence', 'for', 'poisoning', 'her', 'daughter', 'asks', 'investigate', 'possible', 'miscarriage', 'justice', 'approaches', 'other', 'five', 'sublime', 'novel', 'subtle', 'ingenious', 'story', 'elegiac', 'love', 'masterful', 'example', 'storytelling', 'separate', 'accounts', 'one', 'devastating', 'greatest', 'chipping', 'announced', 'local', 'small', 'as', 'miss', 'friends', 'gather', 'what', 'they', 'fondly', 'imagine', 'wil

# **TF_IDF**

In [8]:

def TF_IDF_FUN (tokenized_corpus_1 , vocabulary):
  tf_matrix=[]
  idf_matrix=[]
  TF_IDF=[]
  for sent in tokenized_corpus_1:
      tf =[ sent.count(word) / len(sent) for word in vocabulary]
      tf_matrix.append(tf)

  xx=(len(tokenized_corpus_1)+1)

  for word in vocabulary:
    idf=(np.log10(xx / (1+ sum([1 for doc in tokenized_corpus_1 if word in doc]))))
    idf_matrix.append(idf)

  for sent in tf_matrix:
    TF_IDF.append(list(np.multiply( sent,idf_matrix) ))
  return TF_IDF



# Story TF_IDF  

In [9]:
TF_IDF_doc_1= pd.DataFrame(TF_IDF_FUN (tokenized_corpus_1 , vocabulary_1))
  

TF_IDF_doc_1_df = pd.DataFrame(TF_IDF_doc_1)
TF_IDF_doc_1_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,112,113,114,115,116,117,118,119,120,121
0,0.079551,0.063543,0.079551,0.079551,0.086749,0.03009,0.052185,0.03009,0.079551,0.079551,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.036702,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.022066,0.038269,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047285,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.041116,0.0,0.0,0.028066,0.01947,0.0,0.01947,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024518,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.066199,0.114806,0.066199,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Medical TF_IDF 

In [10]:

TF_IDF_doc_2 = TF_IDF_FUN (tokenized_corpus_2 , vocabulary_2)

TF_IDF_doc_2_df = pd.DataFrame(TF_IDF_doc_2)
TF_IDF_doc_2_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84,85,86,87,88,89,90,91,92,93
0,0.089432,0.038377,0.133801,0.048608,0.108814,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.006617,0.023069,0.025142,0.037522,0.018761,0.037522,0.029141,0.020761,0.029141,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.020253,0.0,0.0,0.0,0.0,0.025086,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.03472,0.038862,0.0,0.038862,0.0,0.021502,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.149053,0.063962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.027412,0.0,0.01736,0.0,0.0,0.0,0.0,0.021502,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.022358,0.019189,0.0,0.0,0.0,0.027203,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.027004,0.0,0.0,0.0,0.0,0.011149,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043004,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.060452,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Cosine similarity of Story 

In [11]:


# compute and print the cosine similarity matrix
cosine_sim_doc_1 = cosine_similarity(TF_IDF_doc_1, TF_IDF_doc_1)

sim_sum_doc_1= np.triu(cosine_sim_doc_1).sum()-np.trace(cosine_sim_doc_1)
print('sim_sum_doc_1 : ',sim_sum_doc_1)

count_doc_1=len(cosine_sim_doc_1)

total_doc_1=(count_doc_1*count_doc_1-count_doc_1)/2
print('total_doc_1 : ' ,total_doc_1)

centroid_doc_1 =sim_sum_doc_1/total_doc_1
print('centroid_doc_1 : ',centroid_doc_1)

cosine_sim_doc_1_df = pd.DataFrame(cosine_sim_doc_1)
cosine_sim_doc_1_df





sim_sum_doc_1 :  3.758127531812036
total_doc_1 :  91.0
centroid_doc_1 :  0.04129810474518721


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.067501,0.062772,0.0,0.025865,0.0,0.163164,0.022303,0.0,0.191295,0.025833,0.0,0.0,0.112394
1,0.067501,1.0,0.0,0.0,0.0,0.012692,0.086056,0.013805,0.0,0.032715,0.054101,0.0,0.036384,0.053385
2,0.062772,0.0,1.0,0.0,0.0,0.012239,0.025388,0.059488,0.0,0.126432,0.106832,0.012497,0.043152,0.135647
3,0.0,0.0,0.0,1.0,0.090656,0.0,0.0,0.018212,0.0,0.0,0.0,0.053041,0.089265,0.022506
4,0.025865,0.0,0.0,0.090656,1.0,0.0,0.05126,0.187938,0.0,0.052095,0.0,0.0,0.105503,0.045958
5,0.0,0.012692,0.012239,0.0,0.0,1.0,0.065573,0.017428,0.0,0.02985,0.040372,0.047391,0.013575,0.043157
6,0.163164,0.086056,0.025388,0.0,0.05126,0.065573,1.0,0.10258,0.0,0.061918,0.041871,0.012264,0.020351,0.084024
7,0.022303,0.013805,0.059488,0.018212,0.187938,0.017428,0.10258,1.0,0.0,0.044922,0.052333,0.017796,0.096243,0.068179
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,0.191295,0.032715,0.126432,0.0,0.052095,0.02985,0.061918,0.044922,0.0,1.0,0.10406,0.0,0.034991,0.138783


# Cosine similarity of Medical Doc

In [12]:
cosine_sim_doc_2 = cosine_similarity(TF_IDF_doc_2, TF_IDF_doc_2)

sim_sum_doc_2= np.triu(cosine_sim_doc_2).sum()-np.trace(cosine_sim_doc_2)
print('sim_sum_doc_2 : ',sim_sum_doc_2)

count_doc_2=len(cosine_sim_doc_2)

total_doc_2=(count_doc_1*count_doc_1-count_doc_1)/2
print('total_doc_2 :',total_doc_2)

centroid_doc_2 =sim_sum_doc_2/total_doc_2
print('centroid_doc_2 : ',centroid_doc_2)

cosine_sim_doc_2_df = pd.DataFrame(cosine_sim_doc_2)
cosine_sim_doc_2_df

sim_sum_doc_2 :  6.580324883097099
total_doc_2 : 91.0
centroid_doc_2 :  0.07231126245161647


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.351597,0.024461,0.178191,0.318169,0.059116,0.095764,0.0495,0.0,0.0,0.015706,0.091423,0.012589
1,0.351597,1.0,0.114746,0.377248,0.014424,0.197327,0.178886,0.184567,0.034352,0.077631,0.139867,0.06995,0.069306
2,0.024461,0.114746,1.0,0.176377,0.0,0.054666,0.0,0.084077,0.063258,0.0,0.0,0.047233,0.019929
3,0.178191,0.377248,0.176377,1.0,0.0,0.041588,0.0,0.055596,0.112423,0.0,0.189221,0.071595,0.045264
4,0.318169,0.014424,0.0,0.0,1.0,0.045819,0.278478,0.115494,0.0,0.0,0.021937,0.1022,0.017584
5,0.059116,0.197327,0.054666,0.041588,0.045819,1.0,0.228586,0.087316,0.068045,0.0,0.073006,0.057888,0.16986
6,0.095764,0.178886,0.0,0.0,0.278478,0.228586,1.0,0.163366,0.0,0.19377,0.011428,0.211491,0.042979
7,0.0495,0.184567,0.084077,0.055596,0.115494,0.087316,0.163366,1.0,0.183044,0.0,0.03666,0.103711,0.140691
8,0.0,0.034352,0.063258,0.112423,0.0,0.068045,0.0,0.183044,1.0,0.0,0.17066,0.0,0.168088
9,0.0,0.077631,0.0,0.0,0.0,0.0,0.19377,0.0,0.0,1.0,0.0,0.085538,0.0


# Test

In [20]:
# respiratory system diseases ---> Medical Doc

test_doc=["Pneumonia. An infection of your alveoli, usually by bacteria or viruses, including the coronavirus that causes COVID-19.",
          "Tuberculosis Pneumonia that slowly gets worse, caused by the bacteria Mycobacterium tuberculosis.",
          "Emphysema. This happens when the fragile links between alveoli are damaged. Smoking is the usual cause. (Emphysema also limits airflow, affecting your airways.)",
          "Pulmonary edema. Fluid leaks out of the small blood vessels of your lung into the air sacs and the area around them.",
          "One form is caused by heart failure and back pressure in your lungs' blood vessels. In another form, injury to your lung causes the leak of fluid.",
          "Lung cancer. It has many forms and may start in any part of your lungs. It most often happens in the main part of your lung, in or near the air sacs.",
          "Acute respiratory distress syndrome (ARDS). This is a severe, sudden injury to the lungs from a serious illness. COVID-19 is one example.",
          "Many people who have ARDS need help breathing from a machine called a ventilator until their lungs recover.",
          "Pneumoconiosis. This is a category of conditions caused by inhaling something that injures your lungs." ,
          "Examples include black lung disease from coal dust and asbestosis from asbestos dust."]
 

In [14]:
print(len(test_doc))

10


In [15]:
tokenized_corpus_test = tokenize_corpus(test_doc)
print(tokenized_corpus_test)


[['an', 'infection', 'of', 'your', 'usually', 'by', 'bacteria', 'or', 'including', 'the', 'coronavirus', 'that', 'causes'], ['tuberculosis', 'pneumonia', 'that', 'slowly', 'gets', 'caused', 'by', 'the', 'bacteria', 'mycobacterium'], ['this', 'happens', 'when', 'the', 'fragile', 'links', 'between', 'alveoli', 'are', 'smoking', 'is', 'the', 'usual', 'also', 'limits', 'affecting', 'your'], ['pulmonary', 'fluid', 'leaks', 'out', 'of', 'the', 'small', 'blood', 'vessels', 'of', 'your', 'lung', 'into', 'the', 'air', 'sacs', 'and', 'the', 'area', 'around'], ['one', 'form', 'is', 'caused', 'by', 'heart', 'failure', 'and', 'back', 'pressure', 'in', 'your', 'blood', 'in', 'another', 'injury', 'to', 'your', 'lung', 'causes', 'the', 'leak', 'of'], ['lung', 'it', 'has', 'many', 'forms', 'and', 'may', 'start', 'in', 'any', 'part', 'of', 'your', 'it', 'most', 'often', 'happens', 'in', 'the', 'main', 'part', 'of', 'your', 'in', 'or', 'near', 'the', 'air'], ['acute', 'respiratory', 'distress', 'syndrome

In [16]:
vocabulary_test=vocabulary_fun(tokenized_corpus_test)
print(len(vocabulary_test))
print(vocabulary_test)

104
['an', 'infection', 'of', 'your', 'usually', 'by', 'bacteria', 'or', 'including', 'the', 'coronavirus', 'that', 'causes', 'tuberculosis', 'pneumonia', 'slowly', 'gets', 'caused', 'mycobacterium', 'this', 'happens', 'when', 'fragile', 'links', 'between', 'alveoli', 'are', 'smoking', 'is', 'usual', 'also', 'limits', 'affecting', 'pulmonary', 'fluid', 'leaks', 'out', 'small', 'blood', 'vessels', 'lung', 'into', 'air', 'sacs', 'and', 'area', 'around', 'one', 'form', 'heart', 'failure', 'back', 'pressure', 'in', 'another', 'injury', 'to', 'leak', 'it', 'has', 'many', 'forms', 'may', 'start', 'any', 'part', 'most', 'often', 'main', 'near', 'acute', 'respiratory', 'distress', 'syndrome', 'a', 'sudden', 'lungs', 'from', 'serious', 'people', 'who', 'have', 'ards', 'need', 'help', 'breathing', 'machine', 'called', 'ventilator', 'until', 'their', 'category', 'conditions', 'inhaling', 'something', 'injures', 'examples', 'include', 'black', 'disease', 'coal', 'dust', 'asbestosis', 'asbestos']


In [17]:
# TF_IDF

TF_IDF_doc_test= pd.DataFrame(TF_IDF_FUN (tokenized_corpus_test , vocabulary_test))

TF_IDF_test_df = pd.DataFrame(TF_IDF_doc_test)
TF_IDF_test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,0.056951,0.056951,0.020249,0.0151,0.056951,0.02634,0.043405,0.043405,0.056951,0.010639,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.034242,0.056427,0.0,0.0,0.01383,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.011547,0.0,0.0,0.0,0.0,0.0,0.016271,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.026324,0.009815,0.0,0.0,0.0,0.0,0.0,0.020745,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.011445,0.017069,0.0,0.014888,0.0,0.0,0.0,0.006013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.018803,0.014021,0.0,0.0,0.0,0.020153,0.0,0.009879,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.020249,0.0151,0.0,0.02634,0.0,0.0,0.0,0.0,...,0.056951,0.056951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.061697,0.061697,0.061697,0.061697,0.061697,0.061697,0.061697,0.061697


In [18]:
cosine_sim_doc_test = cosine_similarity(TF_IDF_doc_test, TF_IDF_doc_test)

sim_sum_doc_test= np.triu(cosine_sim_doc_test).sum()-np.trace(cosine_sim_doc_test)
print('sim_sum_doc_test : ',sim_sum_doc_test)

count_doc_test=len(cosine_sim_doc_test)

total_doc_test=(count_doc_test*count_doc_test-count_doc_test)/2
print('total_doc_test : ' ,total_doc_test)

centroid_doc_test =sim_sum_doc_test/total_doc_test
print('centroid_doc_test : ',centroid_doc_test)

cosine_sim_doc_test_df = pd.DataFrame(cosine_sim_doc_test)
cosine_sim_doc_test_df

sim_sum_doc_test :  2.474241104166083
total_doc_test :  45.0
centroid_doc_test :  0.05498313564813518


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.168371,0.014513,0.044103,0.107208,0.074705,0.003776,0.0,0.104742,0.0
1,0.168371,1.0,0.007767,0.011593,0.063106,0.005364,0.004056,0.0,0.135444,0.0
2,0.014513,0.007767,1.0,0.022521,0.0324,0.048126,0.071826,0.0,0.068259,0.0
3,0.044103,0.011593,0.022521,1.0,0.114497,0.103688,0.008805,0.0,0.034492,0.038614
4,0.107208,0.063106,0.0324,0.114497,1.0,0.23822,0.175897,0.0,0.105972,0.036617
5,0.074705,0.005364,0.048126,0.103688,0.23822,1.0,0.004074,0.029544,0.029148,0.026801
6,0.003776,0.004056,0.071826,0.008805,0.175897,0.004074,1.0,0.180104,0.16629,0.06672
7,0.0,0.0,0.0,0.0,0.0,0.029544,0.180104,1.0,0.068744,0.058129
8,0.104742,0.135444,0.068259,0.034492,0.105972,0.029148,0.16629,0.068744,1.0,0.0
9,0.0,0.0,0.0,0.038614,0.036617,0.026801,0.06672,0.058129,0.0,1.0


In [19]:
c1= centroid_doc_test + centroid_doc_1
c2= centroid_doc_test + centroid_doc_2

if c1 > c2 :
  print("test doc ----> doc 1 ( Story Doc )")
elif c1 < c2:
  print("test doc ----> doc 2 ( medical Doc )")
else:
  print("test doc ----> doc 1 OR doc 2")

test doc ----> doc 2 ( medical Doc )
