In [17]:
import os
import numpy as np
import pandas as pd
import nltk
import math
import re # Import Regex
# Import NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# End Import NLTK
import math # Import Math

In [18]:
# Ensure necessary NLTK downloads
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prihandana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prihandana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [85]:
# Ambil list Stopwords dari file txt
f = open("stopword/stopwords-en.txt", "r")
stopword_list = []
for line in f:
    stripped_line = line.strip()
    line_list = stripped_line.split()
    stopword_list.append(line_list[0])
f.close()

# print(stopword_list)
len(stopword_list)

1298

In [86]:
documents_list = {}
document_path = "docs"

# os.listdir returns a list containing all files under the given path
list_doc = os.listdir(document_path)


for doc_name in list_doc:
  if doc_name.endswith('.txt'):
    try:
      filepath = os.path.join(document_path, doc_name)
      if os.path.isfile(filepath):
        f = open(filepath, "r")
        documents_list[doc_name] = f.read()
        f.close()

    except Exception as e:
      print(e)

document_count = len(documents_list)

documents_list

{'Doc5.txt': 'Training head cost factor on evidence evidence have',
 'Doc4.txt': 'Arrive fire pattern each',
 'Doc1.txt': 'My favorite favorite food documents is chocolate',
 'Doc3.txt': 'My favorite food is chicken nugget',
 'Doc2.txt': 'My favorite food is chocolate'}

In [87]:
def tokenize(text):
  # Convert text to lowercase (case folding)
  text = text.lower()

  # Remove Punctuation
  text = re.sub(r'[^\w\s]', '', text)

  # Break Word
  words = re.findall(r'\b\w+\b', text)

  return words

In [88]:
# Tokenize each document and store the tokenized content
for doc_name, content in documents_list.items():
    documents_list[doc_name] = {
        'original': content,
        'content': tokenize(content)
    }

# Display the tokenized documents
for doc_name, doc in documents_list.items():
    print(f"Tokenized: {doc['content']}\n")

Tokenized: ['training', 'head', 'cost', 'factor', 'on', 'evidence', 'evidence', 'have']

Tokenized: ['arrive', 'fire', 'pattern', 'each']

Tokenized: ['my', 'favorite', 'favorite', 'food', 'documents', 'is', 'chocolate']

Tokenized: ['my', 'favorite', 'food', 'is', 'chicken', 'nugget']

Tokenized: ['my', 'favorite', 'food', 'is', 'chocolate']



In [89]:
def remove_stopwords(tokens):
    # Remove stopwords from the list of tokens
    filtered_tokens = [token for token in tokens if token not in stopword_list]

    return filtered_tokens

In [90]:
# Remove stopwords from the tokenized content
for doc_name, doc in documents_list.items():
    documents_list[doc_name]['content'] = remove_stopwords(doc['content'])

# Display the documents after stopwords removal
for doc_name, doc in documents_list.items():
    print(f"Stopwords: {doc['content']}\n")

Stopwords: ['training', 'head', 'cost', 'factor', 'evidence', 'evidence']

Stopwords: ['arrive', 'pattern']

Stopwords: ['favorite', 'favorite', 'food', 'documents', 'chocolate']

Stopwords: ['favorite', 'food', 'chicken', 'nugget']

Stopwords: ['favorite', 'food', 'chocolate']



In [91]:
# Initialize Porter Stemmer
stemmer = PorterStemmer()

def stem_tokens(tokens):
    # Perform stemming on each token
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    return stemmed_tokens

In [92]:
# Stemming dokumen
for doc_name, doc in documents_list.items():
    documents_list[doc_name]['content'] = stem_tokens(doc['content'])

# Display the documents after stemming
for doc_name, doc in documents_list.items():
    print(f"Stopwords: {doc['content']}\n")

Stopwords: ['train', 'head', 'cost', 'factor', 'evid', 'evid']

Stopwords: ['arriv', 'pattern']

Stopwords: ['favorit', 'favorit', 'food', 'document', 'chocol']

Stopwords: ['favorit', 'food', 'chicken', 'nugget']

Stopwords: ['favorit', 'food', 'chocol']



In [93]:
documents_list['Doc1.txt']

{'original': 'My favorite favorite food documents is chocolate',
 'content': ['favorit', 'favorit', 'food', 'document', 'chocol']}

In [94]:
# Build the vocabulary
vocab = set()
for doc in documents_list.values():
    vocab.update(doc['content'])
vocab = list(vocab)

In [95]:
print(vocab)

['chicken', 'pattern', 'evid', 'nugget', 'chocol', 'train', 'factor', 'cost', 'food', 'favorit', 'head', 'arriv', 'document']


In [96]:
# Vectorize documents
document_vectors = {}
for filename, doc in documents_list.items():
    vector = [doc['content'].count(word) for word in vocab]
    document_vectors[filename] = vector

In [97]:
print(document_vectors)

{'Doc5.txt': [0, 0, 2, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0], 'Doc4.txt': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], 'Doc1.txt': [0, 0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1], 'Doc3.txt': [1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0], 'Doc2.txt': [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0]}


In [99]:
def preprocess(text):
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = stem_tokens(tokens)
    return tokens

In [100]:
# Preprocess and vectorize the query
query = "find documents about chicken nugget"
query_tokens = preprocess(query)
query_vector = [query_tokens.count(word) for word in vocab]

print("Query Vector:", query_vector)

Query Vector: [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [101]:
# Constants for Rocchio Algorithm
alpha = 1
beta = 0.75
gamma = 0.25

In [102]:
# Manually mark some documents as relevant and non-relevant
relevant_docs = ['Doc1.txt']  # Replace with actual relevant filenames

In [103]:
non_relevant_docs = ['Doc2.txt', 'Doc3.txt']  # Replace with actual non-relevant filenames

In [104]:
# Calculate centroids for relevant and non-relevant documents
relevant_vectors = [document_vectors[doc] for doc in relevant_docs if doc in document_vectors]
non_relevant_vectors = [document_vectors[doc] for doc in non_relevant_docs if doc in document_vectors]

In [113]:
# Calculate centroids for relevant and non-relevant documents
relevant_centroid = np.mean(relevant_vectors, axis=0)
non_relevant_centroid = np.mean(non_relevant_vectors, axis=0)

In [129]:
# Apply Rocchio Algorithm
adjusted_query = alpha * np.array(query_vector) + beta * relevant_centroid - gamma * non_relevant_centroid

In [130]:
print(adjusted_query)

[0.875 0.    0.    0.875 0.625 0.    0.    0.    0.5   1.25  0.    0.
 1.75 ]


In [123]:
# Calculate cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = sum(p * q for p, q in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum([val ** 2 for val in vec1]))
    magnitude2 = math.sqrt(sum([val ** 2 for val in vec2]))
    if not magnitude1 or not magnitude2:
        return 0
    else:
        return dot_product / (magnitude1 * magnitude2)

In [124]:
similarities = {}
for filename, vector in document_vectors.items():
    similarity = cosine_similarity(adjusted_query, vector)
    similarities[filename] = similarity

In [131]:
similarities

{'Doc5.txt': np.float64(0.0),
 'Doc4.txt': np.float64(0.0),
 'Doc1.txt': np.float64(0.7792464045147061),
 'Doc3.txt': np.float64(0.6712486220795378),
 'Doc2.txt': np.float64(0.5259547057403532)}

In [125]:
# Sort documents by similarity score in descending order
sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)

In [126]:
# Determine the most relevant document based on the adjusted query
most_relevant = max(similarities, key=similarities.get)
print("Most relevant document after Rocchio adjustment:", most_relevant)
print("Similarity score:", similarities[most_relevant])

Most relevant document after Rocchio adjustment: Doc1.txt
Similarity score: 0.7792464045147061


In [127]:
# Display the top 5 most relevant documents
top_documents = sorted_similarities[:5]
print("Top 5 most relevant documents:")
for doc, score in top_documents:
    print(f"{doc}: {score:.4f}")

Top 5 most relevant documents:
Doc1.txt: 0.7792
Doc3.txt: 0.6712
Doc2.txt: 0.5260
Doc5.txt: 0.0000
Doc4.txt: 0.0000
