## Import Library

In [1]:
import numpy as np
import pandas as pd
import os
import re # Import Regex
# Import NLTK
import nltk
from nltk.stem import PorterStemmer
# End Import NLTK
from collections import Counter # Import Counter
import math # Import Math

In [2]:
import warnings

In [3]:
# Ambil list Stopwords dari file txt
f = open("stopword/stopwords-en.txt", "r")
stopword_list = []
for line in f:
    stripped_line = line.strip()
    line_list = stripped_line.split()
    stopword_list.append(line_list[0])
f.close()

# print(stopword_list)
len(stopword_list)

1298

## Preparation

### Defining Function

In [4]:
documents_list = []
document_path = "docs"

# os.listdir returns a list containing all files under the given path
list_doc = os.listdir(document_path)


for doc_name in list_doc:
  if doc_name.endswith('.txt'):
    try:
      filepath = os.path.join(document_path, doc_name)
      if os.path.isfile(filepath):
        f = open(filepath, "r")
        documents_list.append(
          {'title':doc_name,
          'teks':f.read()}
          )
        f.close()

    except Exception as e:
      print(e)

document_count = len(documents_list)

documents_list

[{'title': 'Doc5.txt',
  'teks': 'Training head cost factor on evidence evidence have'},
 {'title': 'Doc4.txt', 'teks': 'Arrive fire pattern each'},
 {'title': 'Doc1.txt',
  'teks': 'My favorite favorite food documents is chocolate'},
 {'title': 'Doc3.txt', 'teks': 'My favorite food is chicken nugget'},
 {'title': 'Doc2.txt', 'teks': 'My favorite food is chocolate'}]

### Start TF-IDF - Preprocessing

In [5]:
def tokenize(text):
  # Convert text ke lowercase
  text = text.lower()

  # Hapus tanda baca
  text = re.sub(r'[^\w\s]', '', text)

  # Pecah kata
  words = re.findall(r'\b\w+\b', text)

  return words

In [6]:
# Tokenize masing-masing document
for doc in documents_list:
    doc['content'] = tokenize(doc['teks'])

for doc in documents_list:
    print(doc)

{'title': 'Doc5.txt', 'teks': 'Training head cost factor on evidence evidence have', 'content': ['training', 'head', 'cost', 'factor', 'on', 'evidence', 'evidence', 'have']}
{'title': 'Doc4.txt', 'teks': 'Arrive fire pattern each', 'content': ['arrive', 'fire', 'pattern', 'each']}
{'title': 'Doc1.txt', 'teks': 'My favorite favorite food documents is chocolate', 'content': ['my', 'favorite', 'favorite', 'food', 'documents', 'is', 'chocolate']}
{'title': 'Doc3.txt', 'teks': 'My favorite food is chicken nugget', 'content': ['my', 'favorite', 'food', 'is', 'chicken', 'nugget']}
{'title': 'Doc2.txt', 'teks': 'My favorite food is chocolate', 'content': ['my', 'favorite', 'food', 'is', 'chocolate']}


In [7]:
def remove_stopwords(tokens):
    # Hilangkan stopword dari teks
    filtered_tokens = [token for token in tokens if token not in stopword_list]

    return filtered_tokens

In [8]:
for doc in documents_list:
  doc['content'] = remove_stopwords(doc['content'])

for doc in documents_list:
    print(doc)

{'title': 'Doc5.txt', 'teks': 'Training head cost factor on evidence evidence have', 'content': ['training', 'head', 'cost', 'factor', 'evidence', 'evidence']}
{'title': 'Doc4.txt', 'teks': 'Arrive fire pattern each', 'content': ['arrive', 'pattern']}
{'title': 'Doc1.txt', 'teks': 'My favorite favorite food documents is chocolate', 'content': ['favorite', 'favorite', 'food', 'documents', 'chocolate']}
{'title': 'Doc3.txt', 'teks': 'My favorite food is chicken nugget', 'content': ['favorite', 'food', 'chicken', 'nugget']}
{'title': 'Doc2.txt', 'teks': 'My favorite food is chocolate', 'content': ['favorite', 'food', 'chocolate']}


In [9]:
# Initialize Porter Stemmer
stemmer = PorterStemmer()

def stem_tokens(tokens):
    # Perform stemming on each token
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    return stemmed_tokens

In [10]:
# Lakukan Stemming pada document
for doc in documents_list:
  doc['content'] = stem_tokens(doc['content'])

for doc in documents_list:
    print(doc)

{'title': 'Doc5.txt', 'teks': 'Training head cost factor on evidence evidence have', 'content': ['train', 'head', 'cost', 'factor', 'evid', 'evid']}
{'title': 'Doc4.txt', 'teks': 'Arrive fire pattern each', 'content': ['arriv', 'pattern']}
{'title': 'Doc1.txt', 'teks': 'My favorite favorite food documents is chocolate', 'content': ['favorit', 'favorit', 'food', 'document', 'chocol']}
{'title': 'Doc3.txt', 'teks': 'My favorite food is chicken nugget', 'content': ['favorit', 'food', 'chicken', 'nugget']}
{'title': 'Doc2.txt', 'teks': 'My favorite food is chocolate', 'content': ['favorit', 'food', 'chocol']}


In [11]:
# Build the vocabulary
vocab = set()
for tokens in documents_list:
    vocab.update(tokens['content'])
vocab = list(vocab)

print(vocab)

['train', 'evid', 'chicken', 'document', 'food', 'pattern', 'arriv', 'favorit', 'cost', 'factor', 'chocol', 'head', 'nugget']


## Query Operation

In [12]:
query = "find documents about chicken nugget"
query

'find documents about chicken nugget'

In [13]:
query_tokens = tokenize(query)
print(query_tokens)

['find', 'documents', 'about', 'chicken', 'nugget']


In [14]:
query_tokens = remove_stopwords(query_tokens)
print(query_tokens)

['documents', 'chicken', 'nugget']


In [15]:
query_tokens = stem_tokens(query_tokens)
query_tokens

['document', 'chicken', 'nugget']

In [16]:
def find_count(query_tokens):
    query_count = {token: query_tokens.count(token) for token in query_tokens}
    
    return query_count

query_count = find_count(query_tokens)
query_count

{'document': 1, 'chicken': 1, 'nugget': 1}

### Cari Nilai Term Frequency masing masing kata di masing-masing dokumen

In [17]:
word_list = set(word for doc in documents_list for word in doc['content'])
word_list = list(word_list)

custom_index_name = 'Q'
doc_titles = [doc['title'] for doc in documents_list]

tf_count = pd.DataFrame(columns=[custom_index_name] + sorted(doc_titles), index=word_list)

for doc in documents_list:
    word_count = {word: doc['content'].count(word) for word in word_list}
    tf_count[doc['title']] = tf_count.index.map(word_count.get)

tf_count['Q'] = tf_count.index.map(query_count.get)   
tf_count = tf_count.fillna(0).astype(int)

tf_count

Unnamed: 0,Q,Doc1.txt,Doc2.txt,Doc3.txt,Doc4.txt,Doc5.txt
train,0,0,0,0,0,1
evid,0,0,0,0,0,2
chicken,1,0,0,1,0,0
document,1,1,0,0,0,0
food,0,1,1,1,0,0
pattern,0,0,0,0,1,0
arriv,0,0,0,0,1,0
favorit,0,2,1,1,0,0
cost,0,0,0,0,0,1
factor,0,0,0,0,0,1


### Cari Nilai DF di Masing-Masing

In [18]:
df_count = tf_count.copy()
column_df = 0
if 'df' not in df_count.columns:
    column_df = df_count.drop(columns='Q').astype(bool).sum(axis=1)
    df_count['df'] = column_df

df_count

Unnamed: 0,Q,Doc1.txt,Doc2.txt,Doc3.txt,Doc4.txt,Doc5.txt,df
train,0,0,0,0,0,1,1
evid,0,0,0,0,0,2,1
chicken,1,0,0,1,0,0,1
document,1,1,0,0,0,0,1
food,0,1,1,1,0,0,3
pattern,0,0,0,0,1,0,1
arriv,0,0,0,0,1,0,1
favorit,0,2,1,1,0,0,3
cost,0,0,0,0,0,1,1
factor,0,0,0,0,0,1,1


### Cari Nilai Inverse Document Frequency

In [19]:
# Buat copy baru dari dataframe df_count
idf_df = df_count.copy()

# Hitung nilai idf dan simpan kedalam variabel idf_values
idf_values = (np.log10(document_count/column_df)) 

# Buat sebuah pandas series dari idf_values
idf_values = pd.Series(idf_values)

# Tambahkan kolom idf dengan nilai dari idf_values
idf_df['IDF'] = idf_values.loc[word_list].values

idf_df

Unnamed: 0,Q,Doc1.txt,Doc2.txt,Doc3.txt,Doc4.txt,Doc5.txt,df,IDF
train,0,0,0,0,0,1,1,0.69897
evid,0,0,0,0,0,2,1,0.69897
chicken,1,0,0,1,0,0,1,0.69897
document,1,1,0,0,0,0,1,0.69897
food,0,1,1,1,0,0,3,0.221849
pattern,0,0,0,0,1,0,1,0.69897
arriv,0,0,0,0,1,0,1,0.69897
favorit,0,2,1,1,0,0,3,0.221849
cost,0,0,0,0,0,1,1,0.69897
factor,0,0,0,0,0,1,1,0.69897


### Finding TF-IDF

In [20]:
# Buat copy baru dari dataframe idf_df
df_w = idf_df.copy()

# Ambil nilai idf dan simpan ke variabel idf_row
idf_row = df_w['IDF']

# Ambil data terkecuali 2 kolom akhir dari df, yaitu kolom df dan idf
tf_df = df_w.iloc[:, :-2]

# Hitung nilai tf-idf dengan mengkalikan nilai tf dengan idf
tfidf_df = tf_df.mul(idf_row, axis=0)

# Tambahkan awalan W_ pada kolom Q
new_columns = ['W_' + col if 'Q' in col else col for col in tfidf_df.columns]
tfidf_df.columns = new_columns

tfidf_df

Unnamed: 0,W_Q,Doc1.txt,Doc2.txt,Doc3.txt,Doc4.txt,Doc5.txt
train,0.0,0.0,0.0,0.0,0.0,0.69897
evid,0.0,0.0,0.0,0.0,0.0,1.39794
chicken,0.69897,0.0,0.0,0.69897,0.0,0.0
document,0.69897,0.69897,0.0,0.0,0.0,0.0
food,0.0,0.221849,0.221849,0.221849,0.0,0.0
pattern,0.0,0.0,0.0,0.0,0.69897,0.0
arriv,0.0,0.0,0.0,0.0,0.69897,0.0
favorit,0.0,0.443697,0.221849,0.221849,0.0,0.0
cost,0.0,0.0,0.0,0.0,0.0,0.69897
factor,0.0,0.0,0.0,0.0,0.0,0.69897


In [21]:
result = pd.concat([idf_df, tfidf_df], axis=1)
result

Unnamed: 0,Q,Doc1.txt,Doc2.txt,Doc3.txt,Doc4.txt,Doc5.txt,df,IDF,W_Q,Doc1.txt.1,Doc2.txt.1,Doc3.txt.1,Doc4.txt.1,Doc5.txt.1
train,0,0,0,0,0,1,1,0.69897,0.0,0.0,0.0,0.0,0.0,0.69897
evid,0,0,0,0,0,2,1,0.69897,0.0,0.0,0.0,0.0,0.0,1.39794
chicken,1,0,0,1,0,0,1,0.69897,0.69897,0.0,0.0,0.69897,0.0,0.0
document,1,1,0,0,0,0,1,0.69897,0.69897,0.69897,0.0,0.0,0.0,0.0
food,0,1,1,1,0,0,3,0.221849,0.0,0.221849,0.221849,0.221849,0.0,0.0
pattern,0,0,0,0,1,0,1,0.69897,0.0,0.0,0.0,0.0,0.69897,0.0
arriv,0,0,0,0,1,0,1,0.69897,0.0,0.0,0.0,0.0,0.69897,0.0
favorit,0,2,1,1,0,0,3,0.221849,0.0,0.443697,0.221849,0.221849,0.0,0.0
cost,0,0,0,0,0,1,1,0.69897,0.0,0.0,0.0,0.0,0.0,0.69897
factor,0,0,0,0,0,1,1,0.69897,0.0,0.0,0.0,0.0,0.0,0.69897


### Finding Weight

In [22]:
# Transpose matriks tfidf_df untuk merubah posisi kolom dan row
weight_df = tfidf_df.T

In [23]:
# Filter kolom berdasarkan nilai W_Q dimana nilai yang != 0 dihilangkan
weight_df_filtered_columns = weight_df.loc[:, weight_df.loc['W_Q'] != 0.0]

weight_df_filtered_columns

Unnamed: 0,chicken,document,nugget
W_Q,0.69897,0.69897,0.69897
Doc1.txt,0.0,0.69897,0.0
Doc2.txt,0.0,0.0,0.0
Doc3.txt,0.69897,0.0,0.69897
Doc4.txt,0.0,0.0,0.0
Doc5.txt,0.0,0.0,0.0


In [205]:
# Hitung hasil penjumlahan masing-masing row
if 'Weight' not in weight_df_filtered_columns.columns:
    row_sums = weight_df_filtered_columns.sum(axis=1)
    weight_df_filtered_columns.loc[:, 'Weight'] = row_sums

# Hapus row W_Q
if 'W_Q' in weight_df_filtered_columns.columns:
    weight_df_filtered_columns = weight_df_filtered_columns.drop(columns=["W_Q"])

### Ranked Weight

In [206]:
weight_df_filtered_columns_ranked = weight_df_filtered_columns.sort_values(by='Weight', ascending=False)
weight_df_filtered_columns_ranked

Unnamed: 0,document,chicken,nugget,Weight
Doc3.txt,0.0,0.69897,0.69897,1.39794
Doc1.txt,0.69897,0.0,0.0,0.69897
Doc2.txt,0.0,0.0,0.0,0.0
Doc4.txt,0.0,0.0,0.0,0.0
Doc5.txt,0.0,0.0,0.0,0.0


In [213]:
# Find the most relevant document
most_relevant_document_title = weight_df_filtered_columns_ranked.index[0]

# Print the most relevant document title and its original text
for doc in documents_list:
    if doc['title'] == most_relevant_document_title:
        print(f"Dokumen yang paling relevan terhadap query merupakan dokumen: {doc['title']}")
        print(f"Teks: {doc['teks']}")
        break

Dokumen yang paling relevan terhadap query merupakan dokumen: Doc3.txt
Teks: My favorite food is chicken nugget
