# Importing libraries and defining text files

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Defining files as variables and splitting the terms into bags of words

In [2]:
file1 = open("text_files/abstract_1.txt")
file2 = open("text_files/abstract_2.txt")
file3 = open("text_files/abstract_3.txt")
file1_data = file1.read()
file2_data = file2.read()
file3_data = file3.read()

bagOfWords1 = file1_data.split(' ')
bagOfWords2 = file2_data.split(' ')
bagOfWords3 = file3_data.split(' ')

We combine the common words using the union method to eliminate repetitions and consider them as unique words

In [3]:
uniqueWords = set(bagOfWords1).union(set(bagOfWords2)).union(set(bagOfWords3))
print(uniqueWords)

{'castle', 'love,', 'scientists', 'regularly.', 'hearts', 'them', 'research', 'or', 'action', 'pathways,', 'companionship,', 'world', 'masterful', 'technology,', 'analyzing', 'helps', 'remind', 'strength-based', 'legendary,', 'tirelessly', 'wide', 'agility', 'ability', 'ashen', 'protect', 'embody.\n', 'those', 'communication', 'focus', 'rewarding,', 'characters.', 'defense,', 'where', 'identifying', 'adaptability,', 'Its', 'join', 'atmosphere', 'day', 'it.', 'inhabitants', 'owners', 'vulnerability', 'have', 'detect', '(SIEM)', 'resonate', 'armored', 'availability', 'reports,', 'earned', 'involve', 'keeps', 'instinct', 'threat.', 'these', 'adding', 'senses', 'multifaceted', 'able', 'charm.', 'maintain', 'changing,', 'need', 'just', 'strengthen', 'demanding,', 'feel', 'resilience,', 'art,', 'gentle', 'Rather', 'perseverance.', 'growth', 'bring', 'story', 'emerging', 'continue', 'lowly', 'significant', 'thick', 'eyes,', 'III,', 'presence.', 'narrative,', 'achievement,', 'audiences,', 'occ

For each unique words, we traverse through the bag of words to find the frequency of the terms between the two documents

In [4]:
numOfWords1 = dict.fromkeys(uniqueWords, 0)
for word in bagOfWords1:
    numOfWords1[word] += 1
numOfWords2 = dict.fromkeys(uniqueWords, 0)
for word in bagOfWords2:
    numOfWords2[word] += 1
numOfWords3 = dict.fromkeys(uniqueWords, 0)
for word in bagOfWords3:
    numOfWords3[word] += 1

Printing the frequency of the terms

In [5]:
print(numOfWords1)
print(numOfWords2)
print(numOfWords3)

{'castle': 0, 'love,': 0, 'scientists': 0, 'regularly.': 1, 'hearts': 0, 'them': 0, 'research': 1, 'or': 0, 'action': 0, 'pathways,': 0, 'companionship,': 0, 'world': 0, 'masterful': 0, 'technology,': 1, 'analyzing': 1, 'helps': 0, 'remind': 0, 'strength-based': 0, 'legendary,': 0, 'tirelessly': 1, 'wide': 0, 'agility': 0, 'ability': 0, 'ashen': 0, 'protect': 3, 'embody.\n': 0, 'those': 0, 'communication': 1, 'focus': 0, 'rewarding,': 0, 'characters.': 0, 'defense,': 1, 'where': 0, 'identifying': 2, 'adaptability,': 0, 'Its': 0, 'join': 0, 'atmosphere': 0, 'day': 0, 'it.': 1, 'inhabitants': 0, 'owners': 0, 'vulnerability': 1, 'have': 0, 'detect': 1, '(SIEM)': 1, 'resonate': 0, 'armored': 0, 'availability': 1, 'reports,': 1, 'earned': 0, 'involve': 1, 'keeps': 0, 'instinct': 0, 'threat.': 1, 'these': 1, 'adding': 0, 'senses': 0, 'multifaceted': 1, 'able': 1, 'charm.': 0, 'maintain': 1, 'changing,': 1, 'need': 2, 'just': 0, 'strengthen': 1, 'demanding,': 0, 'feel': 0, 'resilience,': 0, '

Importing NLTK libraries to eliminate stopwords (Examples shown below)

In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mark\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

# Term frequency calculation

To calculate the frequency of a term, we fetch the number of times the word appear in a document divided by the number of words in the document.

In [10]:
def tf(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [11]:
tf1 = tf(numOfWords1, bagOfWords1)
tf2 = tf(numOfWords2, bagOfWords2)
tf3 = tf(numOfWords3, bagOfWords3)

In [12]:
print(tf1)
print(tf2)
print(tf3)

{'castle': 0.0, 'love,': 0.0, 'scientists': 0.0, 'regularly.': 0.0030211480362537764, 'hearts': 0.0, 'them': 0.0, 'research': 0.0030211480362537764, 'or': 0.0, 'action': 0.0, 'pathways,': 0.0, 'companionship,': 0.0, 'world': 0.0, 'masterful': 0.0, 'technology,': 0.0030211480362537764, 'analyzing': 0.0030211480362537764, 'helps': 0.0, 'remind': 0.0, 'strength-based': 0.0, 'legendary,': 0.0, 'tirelessly': 0.0030211480362537764, 'wide': 0.0, 'agility': 0.0, 'ability': 0.0, 'ashen': 0.0, 'protect': 0.00906344410876133, 'embody.\n': 0.0, 'those': 0.0, 'communication': 0.0030211480362537764, 'focus': 0.0, 'rewarding,': 0.0, 'characters.': 0.0, 'defense,': 0.0030211480362537764, 'where': 0.0, 'identifying': 0.006042296072507553, 'adaptability,': 0.0, 'Its': 0.0, 'join': 0.0, 'atmosphere': 0.0, 'day': 0.0, 'it.': 0.0030211480362537764, 'inhabitants': 0.0, 'owners': 0.0, 'vulnerability': 0.0030211480362537764, 'have': 0.0, 'detect': 0.0030211480362537764, '(SIEM)': 0.0030211480362537764, 'reson

# Inverse Data Frequency

The log of the number of documents divided by the number of documents that contain the specified word. This inverse data frequency determines the weight of rare words across all documents in the file.

In [13]:
def idf(docs):
    import math
    N = len(docs)

    idfDict = dict.fromkeys(docs[0].keys(), 0)
    for doc in docs:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1

    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [14]:
idfs = idf([numOfWords1, numOfWords2, numOfWords3])
print(idfs)

{'castle': 1.0986122886681098, 'love,': 1.0986122886681098, 'scientists': 1.0986122886681098, 'regularly.': 1.0986122886681098, 'hearts': 1.0986122886681098, 'them': 1.0986122886681098, 'research': 1.0986122886681098, 'or': 1.0986122886681098, 'action': 1.0986122886681098, 'pathways,': 1.0986122886681098, 'companionship,': 1.0986122886681098, 'world': 0.4054651081081644, 'masterful': 1.0986122886681098, 'technology,': 1.0986122886681098, 'analyzing': 1.0986122886681098, 'helps': 1.0986122886681098, 'remind': 1.0986122886681098, 'strength-based': 1.0986122886681098, 'legendary,': 1.0986122886681098, 'tirelessly': 1.0986122886681098, 'wide': 1.0986122886681098, 'agility': 1.0986122886681098, 'ability': 1.0986122886681098, 'ashen': 1.0986122886681098, 'protect': 1.0986122886681098, 'embody.\n': 1.0986122886681098, 'those': 1.0986122886681098, 'communication': 1.0986122886681098, 'focus': 1.0986122886681098, 'rewarding,': 1.0986122886681098, 'characters.': 1.0986122886681098, 'defense,': 1

# Multiplying the Term Frequency and Inverse Data frequency

By multiplying the term frequency and the inverse data frequency, we get the weight of each term

In [15]:
def tfidf(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [16]:
tfidf1 = tfidf(tf1, idfs)
tfidf2 = tfidf(tf2, idfs)
tfidf3 = tfidf(tf3, idfs)
df = pd.DataFrame([tfidf1, tfidf2, tfidf3])

In [27]:
df

Unnamed: 0,DBN-Adam,stored,admin,3,by,exchange,permissions,room,experiments.,rule-base,...,life,second,ontology.,improve,forecasting,that,Boltzmann,trap,maintaining,(RBM)
0,0.0,0.0,0.0,0.005549,0.0,0.0,0.0,0.005549,0.0,0.0,...,0.005549,0.005549,0.0,0.0,0.0,0.0,0.0,0.005549,0.0,0.0
1,0.013317,0.0,0.0,0.0,0.0,0.013317,0.0,0.0,0.006658,0.0,...,0.0,0.0,0.0,0.006658,0.006658,0.002457,0.006658,0.0,0.006658,0.006658
2,0.0,0.006242,0.012484,0.0,0.0,0.0,0.006242,0.0,0.0,0.006242,...,0.0,0.0,0.006242,0.0,0.0,0.009215,0.0,0.0,0.0,0.0


In [17]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([file1_data, file2_data, file3_data])
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
print(vectors.shape)

(3, 603)


df

In [19]:
query = " ".join(map(str, denselist[0]))
query_vec = vectorizer.transform([query])
results = cosine_similarity(vectors, query_vec).reshape((-1))

In [None]:
tf_df = pd.DataFrame([tf1, tf2, tf3], index=["Document 1", "Document 2", "Document 3"])

# Display the DataFrame
print("\nTerm Frequencies as DataFrame:")
print(tf_df)

# Export to JSON
tf_json = tf_df.to_json(indent=2)
print("\nTerm Frequencies as JSON:")
print(tf_json)

# Export to CSV
tf_df.to_csv("output/term_frequencies.csv")
print("\nTerm frequencies have been exported to 'term_frequencies.csv'")


Term Frequencies as DataFrame:
              castle     love,  scientists  regularly.    hearts     them  \
Document 1  0.000000  0.000000    0.000000    0.003021  0.000000  0.00000   
Document 2  0.000000  0.002445    0.002445    0.000000  0.002445  0.00978   
Document 3  0.002558  0.000000    0.000000    0.000000  0.000000  0.00000   

            research        or    action  pathways,  ...  player's  \
Document 1  0.003021  0.000000  0.000000   0.000000  ...  0.000000   
Document 2  0.000000  0.002445  0.000000   0.000000  ...  0.000000   
Document 3  0.000000  0.000000  0.002558   0.002558  ...  0.005115   

            worldwide.      when      fur,       yet  testing,  experience  \
Document 1    0.000000  0.000000  0.000000  0.000000  0.003021    0.000000   
Document 2    0.002445  0.002445  0.002445  0.000000  0.000000    0.000000   
Document 3    0.000000  0.000000  0.000000  0.002558  0.000000    0.002558   

            distinct     offer   assets,  
Document 1  0.000000  