# Importing libraries and defining text files

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Defining files as variables and splitting the terms into bags of words

In [5]:
file1 = open("Text Files/abstract_1_test")
file2 = open("Text Files/abstract_2_test")
file3 = open("Text Files/abstract_3_test")
file1_data = file1.read()
file2_data = file2.read()
file3_data = file3.read()

bagOfWords1 = file1_data.split(' ')
bagOfWords2 = file2_data.split(' ')
bagOfWords3 = file3_data.split(' ')

We combine the common words using the union method to eliminate repetitions and consider them as unique words

In [6]:
uniqueWords = set(bagOfWords1).union(set(bagOfWords2)).union(set(bagOfWords3))
print(uniqueWords)

{'of', 'system', 'research', 'B.', 'optimization', 'through', 'Stochastic', '6m.', 'adopted', '(SBC).', 'integrated', 'between', 'quickly', 'trap', 'weight', 'rates.', 'by', 'Descent', 'purposes.', '320x240', "ANN's", 'DBN-Adam', 'neural', '100', 'each', '(ANN),', 'raspberry', 'be', '1,618', 'can', 'forecast', 'as', 'MAE', 'structure', 'Mixture', 'carried', 'maintaining', 'test', 'level', 'DBN-SGD', 'parameter.', 'RMSE', 'connections,', 'while', 'have', 'point', 'response', 'determining', 'determined', 'pi', 'on', '(SGD)', 'Computer', 'pixel', 'development,', 'camera', 'online', '3', 'show', 'approach', '/', 'compared', 'observe', 'reach', 'improve', '59.0635004,', 'able', 'detect', 'On', 'k', 'room', 'networks', 'evaluation', 'evaluate', 'and', 'four', 'life', 'Camera', 'exchange', 'far', 'DBN', '(DBN)', 'proposed', '46.406739,', 'Board', 'distance', '=', 'local', 'artificial', 'out', '0.34652.', 'seconds.', 'language', 'Network', 'long', 'Adam', 'its', 'such', 'process', 'second', 'r

For each unique words, we traverse through the bag of words to find the frequency of the terms between the two documents

In [48]:
numOfWords1 = dict.fromkeys(uniqueWords, 0)
for word in bagOfWords1:
    numOfWords1[word] += 1
numOfWords2 = dict.fromkeys(uniqueWords, 0)
for word in bagOfWords2:
    numOfWords2[word] += 1
numOfWords3 = dict.fromkeys(uniqueWords, 0)
for word in bagOfWords3:
    numOfWords3[word] += 1

Printing the frequency of the terms

In [49]:
print(numOfWords1)
print(numOfWords2)
print(numOfWords3)

{'DBN-Adam': 0, 'stored': 0, 'admin': 0, '3': 1, 'by': 1, 'exchange': 0, 'permissions': 0, 'room': 1, 'experiments.': 0, 'rule-base': 0, 'developed': 0, 'best': 2, 'used': 6, 'developing': 0, 'this': 2, 'Therefore,': 0, 'networks': 0, 'OpenCV': 1, 'uploading': 1, 'quickly,': 0, 'questions': 0, 'RMSE': 0, 'program': 0, 'videos,': 1, 'MAE': 0, 'development': 2, 'also': 0, 'programming': 1, 'objects': 1, 'specific': 0, 'USB': 2, 'research': 1, 'question': 0, 'requirements.': 0, 'notifications': 0, 'to': 7, 'camera': 4, 'problem': 0, 'connect': 0, '(SGD)': 0, 'out': 0, 'USD': 0, 'its': 1, 'resolution.': 1, 'one': 0, 'natural': 0, 'about': 0, 'was': 1, 'integrated': 1, 'initial': 0, 'provides': 0, 'computer': 0, 'many': 0, 'network': 0, 'detection': 2, 'the': 10, 'conclusions': 0, 'use': 1, 'purposes.': 1, 'raspberry': 1, '320x240': 1, '95%.': 0, 'condition.': 0, 'but': 1, 'compared': 0, 'server': 0, 'focuses': 0, 'methods': 0, 'on': 1, 'This': 0, 'show': 2, 'model': 0, 'accuracy': 1, 'into

Importing NLTK libraries to eliminate stopwords (Examples shown below)

In [50]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Term frequency calculation

To calculate the frequency of a term, we fetch the number of times the word appear in a document divided by the number of words in the document.

In [10]:
def tf(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [None]:
tf1 = tf(numOfWords1, bagOfWords1)
tf2 = tf(numOfWords2, bagOfWords2)
tf3 = tf(numOfWords3, bagOfWords3)

In [51]:
print(tf1)
print(tf2)
print(tf3)

{'DBN-Adam': 0.0, 'stored': 0.0, 'admin': 0.0, '3': 0.005050505050505051, 'by': 0.005050505050505051, 'exchange': 0.0, 'permissions': 0.0, 'room': 0.005050505050505051, 'experiments.': 0.0, 'rule-base': 0.0, 'developed': 0.0, 'best': 0.010101010101010102, 'used': 0.030303030303030304, 'developing': 0.0, 'this': 0.010101010101010102, 'Therefore,': 0.0, 'networks': 0.0, 'OpenCV': 0.005050505050505051, 'uploading': 0.005050505050505051, 'quickly,': 0.0, 'questions': 0.0, 'RMSE': 0.0, 'program': 0.0, 'videos,': 0.005050505050505051, 'MAE': 0.0, 'development': 0.010101010101010102, 'also': 0.0, 'programming': 0.005050505050505051, 'objects': 0.005050505050505051, 'specific': 0.0, 'USB': 0.010101010101010102, 'research': 0.005050505050505051, 'question': 0.0, 'requirements.': 0.0, 'notifications': 0.0, 'to': 0.03535353535353535, 'camera': 0.020202020202020204, 'problem': 0.0, 'connect': 0.0, '(SGD)': 0.0, 'out': 0.0, 'USD': 0.0, 'its': 0.005050505050505051, 'resolution.': 0.00505050505050505

# Inverse Data Frequency

The log of the number of documents divided by the number of documents that contain the specified word. This inverse data frequency determines the weight of rare words across all documents in the file.

In [13]:
def idf(docs):
    import math
    N = len(docs)

    idfDict = dict.fromkeys(docs[0].keys(), 0)
    for doc in docs:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1

    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [23]:
idfs = idf([numOfWords1, numOfWords2, numOfWords3])
print(idfs)

{'DBN-Adam': 1.0986122886681098, 'stored': 1.0986122886681098, 'admin': 1.0986122886681098, '3': 1.0986122886681098, 'by': 0.0, 'exchange': 1.0986122886681098, 'permissions': 1.0986122886681098, 'room': 1.0986122886681098, 'experiments.': 1.0986122886681098, 'rule-base': 1.0986122886681098, 'developed': 1.0986122886681098, 'best': 1.0986122886681098, 'used': 0.0, 'developing': 1.0986122886681098, 'this': 0.4054651081081644, 'Therefore,': 1.0986122886681098, 'networks': 1.0986122886681098, 'OpenCV': 1.0986122886681098, 'uploading': 1.0986122886681098, 'quickly,': 1.0986122886681098, 'questions': 1.0986122886681098, 'RMSE': 1.0986122886681098, 'program': 1.0986122886681098, 'videos,': 1.0986122886681098, 'MAE': 1.0986122886681098, 'development': 1.0986122886681098, 'also': 1.0986122886681098, 'programming': 1.0986122886681098, 'objects': 1.0986122886681098, 'specific': 1.0986122886681098, 'USB': 1.0986122886681098, 'research': 0.4054651081081644, 'question': 1.0986122886681098, 'requirem

# Multiplying the Term Frequency and Inverse Data frequency

By multiplying the term frequency and the inverse data frequency, we get the weight of each term

In [24]:
def tfidf(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [25]:
tfidf1 = tfidf(tf1, idfs)
tfidf2 = tfidf(tf2, idfs)
tfidf3 = tfidf(tf3, idfs)
df = pd.DataFrame([tfidf1, tfidf2, tfidf3])

In [27]:
df

Unnamed: 0,DBN-Adam,stored,admin,3,by,exchange,permissions,room,experiments.,rule-base,...,life,second,ontology.,improve,forecasting,that,Boltzmann,trap,maintaining,(RBM)
0,0.0,0.0,0.0,0.005549,0.0,0.0,0.0,0.005549,0.0,0.0,...,0.005549,0.005549,0.0,0.0,0.0,0.0,0.0,0.005549,0.0,0.0
1,0.013317,0.0,0.0,0.0,0.0,0.013317,0.0,0.0,0.006658,0.0,...,0.0,0.0,0.0,0.006658,0.006658,0.002457,0.006658,0.0,0.006658,0.006658
2,0.0,0.006242,0.012484,0.0,0.0,0.0,0.006242,0.0,0.0,0.006242,...,0.0,0.0,0.006242,0.0,0.0,0.009215,0.0,0.0,0.0,0.0


In [28]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([file1_data, file2_data, file3_data])
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
print(vectors.shape)

(3, 269)


df

In [31]:
query = denselist
query_vec = vectorizer.transform([query])
results = cosine_similarity(vectors, query_vec).reshape((-1))

AttributeError: 'list' object has no attribute 'lower'