## TF-IDF (Term Frequency–Inverse Document Frequency) Based Vectorizer

In [1]:
import json
import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the whole JSON dataset into memory; later cells read the "answer"
# field of each entry in `data`.
with open("output/pandawa_dataset_v2.json", "r", encoding="utf-8") as f:
    data = json.loads(f.read())

In [3]:
# Corpus = the "answer" text of every dataset entry, kept in dataset order.
korpus = pd.Series([item["answer"] for item in data])

In [4]:
# Hand-picked stopwords (common Indonesian function words). Tokens matching
# one of these exactly are dropped by pra_proses before stemming.
stopword_id = {
    "adalah",
    "yang",
    "dan",
    "di",
    "dari",
    "ke",
    "untuk",
    "pada",
    "sebagai",
    "oleh",
    "dengan",
    "dalam",
}

In [5]:
# Build the Sastrawi stemmer once; pra_proses calls stemmer.stem() on every
# token that survives stopword filtering.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


In [6]:
def pra_proses(korpus, stopwords=None, stem_fn=None):
    """Normalize, stopword-filter and stem every document in ``korpus``.

    Each document is lower-cased, every character that is not an ASCII
    letter, digit or whitespace is replaced by a space, tokens present in
    the stopword collection are dropped, and each remaining token is stemmed.
    Stopwords are matched *before* stemming, so only the exact surface forms
    listed in the collection are removed.

    Args:
        korpus: Iterable of documents (for example a list or ``pd.Series``
            of ``str``). ``None`` and float NaN entries are treated as empty
            documents so the result stays row-aligned with the input; any
            other non-string value is converted with ``str()``.
        stopwords: Optional container of lower-case tokens to discard.
            Defaults to the notebook-level ``stopword_id``.
        stem_fn: Optional callable mapping a single token to its stem.
            Defaults to ``stemmer.stem`` of the notebook-level stemmer.

    Returns:
        list[str]: One string per input document, in input order, holding
        the stemmed tokens joined by single spaces.
    """
    # Resolve the defaults at call time so the notebook globals are only
    # needed when the caller does not supply their own.
    if stopwords is None:
        stopwords = stopword_id
    if stem_fn is None:
        stem_fn = stemmer.stem

    hasil = []
    for teks in korpus:
        # Missing entries (JSON null -> None, pandas NaN) become empty
        # documents instead of raising AttributeError on .lower().
        if teks is None or (isinstance(teks, float) and teks != teks):
            teks = ""
        elif not isinstance(teks, str):
            teks = str(teks)

        teks = re.sub(r'[^a-zA-Z0-9\s]', ' ', teks.lower())
        hasil.append(
            " ".join(
                stem_fn(kata) for kata in teks.split() if kata not in stopwords
            )
        )
    return hasil


In [7]:
# Run the cleaning / stopword-removal / stemming pipeline over every document;
# both vectorizers below are fitted on this pre-processed corpus.
pra_korpus = pra_proses(korpus)

In [8]:
# Fit a TF-IDF vectorizer with scikit-learn's default settings on the
# pre-processed corpus. tfidf_matrix has one row per document and one column
# per vocabulary term (see the printed shape further down).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(pra_korpus)

In [9]:
# Show the learned vocabulary (one entry per matrix column).
print("\nFitur TF-IDF:", vectorizer.get_feature_names_out(), sep="\n")


Fitur TF-IDF:
['01' '04' '05' ... 'yudana' 'zaman' 'ziarah']


In [10]:
# Densify the TF-IDF matrix for display; numpy abbreviates the printed array.
print("\nMatriks TF-IDF:", tfidf_matrix.toarray(), sep="\n")


Matriks TF-IDF:
[[0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.0105145 0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]]


In [11]:
# Report the matrix dimensions: (number of documents, vocabulary size).
print("\nUkuran Matriks:", tfidf_matrix.shape)


Ukuran Matriks: (33, 2902)


In [12]:
# Second vectorizer: consider unigrams, bigrams and trigrams, but keep only
# the 15 features scikit-learn ranks highest via max_features.
vectorizer_ngram = TfidfVectorizer(
    max_features=15,
    ngram_range=(1, 3),
)
tfidf_ngram = vectorizer_ngram.fit_transform(pra_korpus)

In [13]:
# Show the retained n-gram features followed by the dense TF-IDF matrix
# (one row per document, one column per feature).
print(
    "\nFitur N-Gram:",
    vectorizer_ngram.get_feature_names_out(),
    tfidf_ngram.toarray(),
    sep="\n",
)


Fitur N-Gram:
['arya' 'bambang' 'batara' 'dewi' 'gatutkaca' 'ia' 'itu' 'kresna' 'mereka'
 'prabu' 'prabu kresna' 'pun' 'raden' 'raja' 'tidak']
[[0.05391724 0.         0.21305764 0.17583495 0.10110948 0.27407928
  0.16624481 0.19538532 0.13479309 0.53467925 0.19538532 0.17972412
  0.54366546 0.28306549 0.13928619]
 [0.08267943 0.01114772 0.         0.23841344 0.13023899 0.26044021
  0.29351198 0.13153776 0.24803829 0.43406701 0.13153776 0.16535886
  0.63663162 0.15295695 0.15709092]
 [0.20853744 0.00540716 0.         0.05782078 0.31134743 0.19650643
  0.16041342 0.08932267 0.11228939 0.22056845 0.08932267 0.1924961
  0.81409808 0.03609302 0.11629973]
 [0.20329154 0.0063744  0.07005701 0.03408193 0.06915279 0.35930598
  0.21274696 0.12034382 0.08037108 0.1843807  0.12034382 0.19856383
  0.77534449 0.16074215 0.19383612]
 [0.03136041 0.         0.         0.27855829 0.22494558 0.27048356
  0.16072211 0.03326163 0.04704062 0.28616377 0.03326163 0.16464217
  0.79185042 0.07448098 0.1607221