In [13]:
# Google Colab helper used to attach Google Drive to this runtime.
from google.colab import drive

# Directory at which Google Drive is mounted; readFile prefixes DATA_PATH with it.
DRIVE_PATH = '/content/drive'
# Mount Drive at DRIVE_PATH so that files stored there can be opened like local files.
drive.mount(DRIVE_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import re 
import math

# Location of the corpus file relative to the Drive mount point
# (readFile opens DRIVE_PATH + DATA_PATH).
DATA_PATH = "/MyDrive/machine_learning/data/vectorspace.txt"


In [15]:
# function for reading file
def readFile(path=None):
    """Read the corpus file and split it into documents and a query.

    Every line is lower-cased and each character that is not an ASCII letter,
    digit or whitespace is replaced by a single space. Line endings are kept
    because they count as whitespace. The first line of the file is the query;
    all following lines are the documents.

    Args:
        path: File to read. When omitted, ``DRIVE_PATH + DATA_PATH`` is used.

    Returns:
        A ``(doc, query)`` tuple: ``doc`` is the list of cleaned document
        lines and ``query`` is a one-element list holding the cleaned first
        line.

    Raises:
        IndexError: If the file contains no lines.
    """
    if path is None:
        path = DRIVE_PATH + DATA_PATH

    # The context manager closes the file even if reading or decoding fails.
    with open(path, "r", encoding='UTF8') as handle:
        lines = [re.sub(r'[^a-zA-Z0-9\s]', ' ', raw.lower()) for raw in handle]

    query = [lines[0]]
    doc = lines[1:]
    return doc, query

In [16]:
# function calculating length
def calcLength(t):
    """Return the Euclidean length of the vector ``t``.

    The squares of the components are added up one at a time, in order, and
    the square root of that total is returned. An empty ``t`` yields 0.0.
    """
    squared_total = 0
    for component in t:
        squared_total += math.pow(component, 2)

    return math.sqrt(squared_total)


In [17]:
# function for calculating cosine similarity
def getCosSim(table, query):
    """Compute the cosine similarity between every document row and the query.

    Args:
        table: Iterable of document vectors (for example the rows of a 2-D
            array). Each row is flattened to 1-D before use.
        query: Query vector; flattened to 1-D before use.

    Returns:
        A list of ``[doc_number, similarity]`` pairs in row order, where
        ``doc_number`` counts from 1. The similarity is
        ``dot(d, q) / (|d| * |q|)``. It is 0.0 when the document or the query
        has zero length, so the result never contains NaN.
    """
    q = np.asarray(query, dtype=float).ravel()
    queryLength = float(np.linalg.norm(q))
    cosSim = []

    for number, row in enumerate(table, start=1):
        vec = np.asarray(row, dtype=float).ravel()
        # The denominator is the PRODUCT of both lengths; writing
        # `dot / docLength * queryLength` would multiply by the query length.
        denominator = float(np.linalg.norm(vec)) * queryLength
        # A zero vector has no direction; score it 0.0 instead of dividing by 0.
        score = float(np.dot(vec, q)) / denominator if denominator else 0.0
        cosSim.append([number, score])

    return cosSim


In [18]:
# Load the corpus: `query` holds the first line of the file, `doc` the rest.
doc, query = readFile()

# Learn the TF-IDF vocabulary and weights from the documents only.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(doc)

# Project documents and query into the same TF-IDF space as dense arrays.
Table = vectorizer.transform(doc).toarray()
Query = vectorizer.transform(query).toarray()

cosSim = getCosSim(Table, Query)

print('Document & Cosine similarity')
print(cosSim)

# Order the [doc_number, score] pairs in place, highest score first.
cosSim.sort(key=lambda pair: pair[1], reverse=True)
print('Ranking:')
for rank, entry in enumerate(cosSim, start=1):
  print('Rank', rank, '= Doc', entry[0], ', Similarity score=', entry[1])



Document & Cosine similarity
[[1, 0.0], [2, 0.22971152878078144], [3, 0.1611912403016746], [4, 0.15204332272810034], [5, 0.0533597269966836]]
Ranking:
Rank 1 = Doc 2 , Similarity score= 0.22971152878078144
Rank 2 = Doc 3 , Similarity score= 0.1611912403016746
Rank 3 = Doc 4 , Similarity score= 0.15204332272810034
Rank 4 = Doc 5 , Similarity score= 0.0533597269966836
Rank 5 = Doc 1 , Similarity score= 0.0
