In [15]:
!pip install pymupdf Sastrawi nltk scikit-learn



In [16]:
import fitz  # PyMuPDF untuk membaca PDF
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from google.colab import files

In [17]:
# Download the NLTK data packages used by this notebook:
#   stopwords         -> read by stopwords.words("indonesian") in preprocess_text
#   punkt / punkt_tab -> tokenizer models needed by word_tokenize
#   wordnet           -> NOTE(review): no visible cell uses WordNet (stemming is
#                        done with Sastrawi); confirm this download is still needed
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Extract the plain text of every page of a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of all pages in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The text of each page joined by a single space, with leading and
        trailing whitespace stripped. An empty string when no page yields text.
    """
    # The context manager closes the document (and its underlying file handle)
    # even if text extraction raises part-way through; the previous version
    # never closed it.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    # join() builds the result once instead of re-copying the accumulated
    # string for every page.
    return " ".join(page_texts).strip()

# Text preprocessing: case folding, tokenisation, filtering and stemming

# The stopword set and the Sastrawi stemmer do not depend on the input text,
# so they are built on first use and then shared by every call instead of
# being rebuilt for each document.
_PREPROCESS_RESOURCES = {}


def _indonesian_stop_words():
    """Return NLTK's Indonesian stopword list as a set, loading it only once."""
    if "stop_words" not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stop_words"] = set(stopwords.words("indonesian"))
    return _PREPROCESS_RESOURCES["stop_words"]


def _sastrawi_stemmer():
    """Return a shared Sastrawi stemmer, creating it only once."""
    if "stemmer" not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stemmer"] = StemmerFactory().create_stemmer()
    return _PREPROCESS_RESOURCES["stemmer"]


def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Case folding: lower-case the whole text.
      2. Tokenise with ``word_tokenize``.
      3. Keep only tokens for which ``str.isalnum()`` is true (this drops
         punctuation, and also any token containing a non-alphanumeric
         character such as a hyphen).
      4. Drop tokens found in the Indonesian stopword set.
      5. Stem each remaining token with the Sastrawi stemmer.

    Args:
        text: The raw text to clean.

    Returns:
        str: The stemmed tokens joined by single spaces; an empty string when
        no token survives the filters.
    """
    stop_words = _indonesian_stop_words()
    stemmer = _sastrawi_stemmer()

    tokens = word_tokenize(text.lower())
    stems = [
        stemmer.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return " ".join(stems)

# TF-IDF vectorisation of a collection of texts
def tfidf_vectorize(texts):
    """Fit a default ``TfidfVectorizer`` on ``texts`` and transform them.

    Args:
        texts: The documents to vectorise, passed unchanged to
            ``TfidfVectorizer.fit_transform``.

    Returns:
        tuple: ``(matrix, model)`` where ``matrix`` is the result of
        ``fit_transform`` and ``model`` is the fitted vectorizer.
    """
    model = TfidfVectorizer()
    return model.fit_transform(texts), model

In [19]:
# Upload PDF file(s) to Google Colab through the file-upload widget
print("Silakan upload file PDF")
uploaded = files.upload()

# Extract and preprocess every uploaded file; pdf_texts collects one cleaned
# string per file, in the iteration order of `uploaded`.
pdf_texts = []
for filename in uploaded.keys():
    print(f"Memproses: {filename}")
    # The file is re-opened from disk by name rather than read from the
    # uploaded value — assumes Colab saved it under this name in the working
    # directory (the "Saving ... to ..." output suggests so).
    text = extract_text_from_pdf(filename)
    processed_text = preprocess_text(text)
    pdf_texts.append(processed_text)

Silakan upload file PDF


Saving TensorFlow vs PyTorch Mana yang Lebih Baik untuk AI & Machine Learning.pdf to TensorFlow vs PyTorch Mana yang Lebih Baik untuk AI & Machine Learning.pdf
Memproses: TensorFlow vs PyTorch Mana yang Lebih Baik untuk AI & Machine Learning.pdf


In [20]:
# Build the TF-IDF representation of the preprocessed documents; keeps both
# the matrix and the fitted vectorizer for the display cell below.
tfidf_matrix, vectorizer = tfidf_vectorize(pdf_texts)

In [21]:
# Show the TF-IDF weights as a dense array
print("\nTF-IDF Representation (Array):", tfidf_matrix.toarray(), sep="\n")

# Show the feature names (vocabulary terms) reported by the fitted vectorizer
print("\nFeature Names:", vectorizer.get_feature_names_out(), sep="\n")


TF-IDF Representation (Array):
[[0.20526182 0.05864624 0.02932312 0.02932312 0.02932312 0.02932312
  0.02932312 0.02932312 0.08796935 0.14661559 0.02932312 0.02932312
  0.02932312 0.02932312 0.05864624 0.02932312 0.11729247 0.02932312
  0.14661559 0.05864624 0.05864624 0.02932312 0.08796935 0.05864624
  0.11729247 0.23458494 0.08796935 0.02932312 0.02932312 0.08796935
  0.02932312 0.02932312 0.05864624 0.08796935 0.08796935 0.02932312
  0.02932312 0.20526182 0.02932312 0.02932312 0.02932312 0.02932312
  0.20526182 0.05864624 0.02932312 0.02932312 0.02932312 0.02932312
  0.05864624 0.02932312 0.02932312 0.02932312 0.05864624 0.02932312
  0.02932312 0.20526182 0.05864624 0.05864624 0.02932312 0.02932312
  0.02932312 0.08796935 0.05864624 0.05864624 0.02932312 0.20526182
  0.11729247 0.02932312 0.05864624 0.08796935 0.02932312 0.02932312
  0.08796935 0.05864624 0.05864624 0.02932312 0.02932312 0.08796935
  0.02932312 0.38120053 0.11729247 0.02932312 0.05864624 0.02932312
  0.02932312 0.1