In [1]:
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Toy corpus for the TF-IDF demo: a Python list holding five short documents,
# one string per document.
documents = [
    "Euler is the father of graph theory",
    "Graph theory studies the properties of graphs",
    "Bioinformatics studies the application of efficient algorithms in biological problems",
    "DNA sequences are very complex biological structures",
    "Genes are parts of a DNA sequence",
]

# Vectorizer created with no arguments (library defaults); it is fitted on
# `documents` in a later cell.
tfidf_vectorizer = TfidfVectorizer()

In [3]:
# Fit the vectorizer on the corpus and encode every document as a TF-IDF row.
# The result is a sparse matrix; printing it lists only the non-zero entries
# as "(document_index, term_index)  weight".
tfidf_matrix_sparse = tfidf_vectorizer.fit_transform(documents)
print(tfidf_matrix_sparse)

  (0, 8)	0.4442143564455307
  (0, 14)	0.4442143564455307
  (0, 23)	0.2974955316311179
  (0, 9)	0.4442143564455307
  (0, 15)	0.2502626198745014
  (0, 11)	0.3583893457792226
  (0, 24)	0.3583893457792226
  (1, 23)	0.3083034814908922
  (1, 15)	0.25935460802151405
  (1, 11)	0.3714095550516834
  (1, 24)	0.3714095550516834
  (1, 22)	0.3714095550516834
  (1, 18)	0.4603525702369509
  (1, 12)	0.4603525702369509
  (2, 23)	0.23578260125361059
  (2, 15)	0.1983477573159667
  (2, 22)	0.28404450899176437
  (2, 3)	0.35206584751933884
  (2, 1)	0.35206584751933884
  (2, 7)	0.35206584751933884
  (2, 0)	0.35206584751933884
  (2, 13)	0.35206584751933884
  (2, 4)	0.28404450899176437
  (2, 17)	0.35206584751933884
  (3, 4)	0.33067681238156543
  (3, 6)	0.33067681238156543
  (3, 20)	0.40986538560224284
  (3, 2)	0.33067681238156543
  (3, 25)	0.40986538560224284
  (3, 5)	0.40986538560224284
  (3, 21)	0.40986538560224284
  (4, 15)	0.26213107330293306
  (4, 6)	0.3753856006004972
  (4, 2)	0.3753856006004972
  (4, 10)

In [4]:
# Get the unique terms (vocabulary) of the collection and display them.
# The position of a term in this list is its column index in the TF-IDF matrix.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() is used to keep `terms` a plain list of Python strings (and the
# printed output unchanged). The fallback keeps the cell working on
# scikit-learn < 1.0.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

print("The unique terms of the collection are: ")
print(terms)

# Print matrix dimensionality: (number of documents, number of unique terms).
print("The dimensionality of the tfidf matrix is: ")
print(tfidf_matrix_sparse.shape)

# print matrix contents

The unique terms of the collection are: 
['algorithms', 'application', 'are', 'bioinformatics', 'biological', 'complex', 'dna', 'efficient', 'euler', 'father', 'genes', 'graph', 'graphs', 'in', 'is', 'of', 'parts', 'problems', 'properties', 'sequence', 'sequences', 'structures', 'studies', 'the', 'theory', 'very']
The dimensionality of the tfidf matrix is: 
(5, 26)


In [5]:
# Tabular (dense) form of the TF-IDF matrix: one row per document, one column
# per vocabulary term, zeros where a term does not occur in a document.
#
# toarray() returns a regular numpy.ndarray. The previous todense() call
# returned the legacy numpy.matrix type, which NumPy discourages and which
# recent scikit-learn versions refuse as input (TypeError).
tfidf_matrix_dense = tfidf_matrix_sparse.toarray()
print(tfidf_matrix_dense)

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.44421436 0.44421436 0.         0.35838935
  0.         0.         0.44421436 0.25026262 0.         0.
  0.         0.         0.         0.         0.         0.29749553
  0.35838935 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.37140956
  0.46035257 0.         0.         0.25935461 0.         0.
  0.46035257 0.         0.         0.         0.37140956 0.30830348
  0.37140956 0.        ]
 [0.35206585 0.35206585 0.         0.35206585 0.28404451 0.
  0.         0.35206585 0.         0.         0.         0.
  0.         0.35206585 0.         0.19834776 0.         0.35206585
  0.         0.         0.         0.         0.28404451 0.2357826
  0.         0.        ]
 [0.         0.         0.33067681 0.         0.33067681 0.40986539
  0.33067681 0.         0.         0.         0.         0.
  0.         0.         0.    

In [6]:
# Compute the doc-doc similarity matrix: entry [i, j] is the cosine similarity
# between the TF-IDF vectors of document i and document j.
#
# The sparse TF-IDF matrix is passed directly: cosine_similarity accepts sparse
# input and returns a dense ndarray by default. Feeding it the result of
# .todense() (a numpy.matrix) raises a TypeError on recent scikit-learn.
ddsim_matrix = cosine_similarity(tfidf_matrix_sparse)

print("This is the doc-doc similarity matrix :")
print(ddsim_matrix)
print("Diagonals of ones (since cosine between a document and itself is <X,X>/||X||^2 = 1)")
print("The matrix si simetric since cosine(X,Y) = cosine(Y,X)")

This is the doc-doc similarity matrix :
[[1.         0.42284413 0.1197833  0.         0.06560161]
 [0.42284413 1.         0.22963185 0.         0.0679849 ]
 [0.1197833  0.22963185 1.         0.09392693 0.05199311]
 [0.         0.         0.09392693 1.         0.24826263]
 [0.06560161 0.0679849  0.05199311 0.24826263 1.        ]]
Diagonals of ones (since cosine between a document and itself is <X,X>/||X||^2 = 1)
The matrix si simetric since cosine(X,Y) = cosine(Y,X)


In [7]:
# Display the first row of the similarity matrix: the similarity of the first
# document to every document in the collection (including itself).
print("The first row of the doc-doc similarity matrix: ")
print(ddsim_matrix[:1])

# Read the doc1-doc2 similarity from the matrix instead of hard-coding a value
# copied from an earlier output, so it stays correct if the corpus changes.
# float() converts the NumPy scalar to a plain Python float for printing.
cosine_1_2 = float(ddsim_matrix[0, 1])

# Floating-point rounding can push a cosine marginally outside [-1, 1], which
# would make math.acos raise ValueError; clamp before converting to an angle.
angle_in_radians = math.acos(max(-1.0, min(1.0, cosine_1_2)))
angle_in_degrees = math.degrees(angle_in_radians)
print("The cosine of the angle between doc1 and doc2 is : \t" + str(cosine_1_2))
print("The angle (in radians) between doc1 and doc2 is  : \t"  + str(angle_in_radians))
print("The angle (in degrees) between doc1 and doc2 is  : \t"  + str(angle_in_degrees))

The first row of the doc-doc similarity matrix: 
[[1.         0.42284413 0.1197833  0.         0.06560161]]
The cosine of the angle between doc1 and doc2 is : 	0.42284413
The angle (in radians) between doc1 and doc2 is  : 	1.1342147812610444
The angle (in degrees) between doc1 and doc2 is  : 	64.98572002761169
