In [1]:
!pip install pymupdf Sastrawi nltk scikit-learn google-generativeai

Collecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi, pymupdf
Successfully installed Sastrawi-1.0.1 pymupdf-1.25.4


In [2]:
import functools
import getpass
import os
import string

import fitz  # PyMuPDF, used for reading PDFs
import google.generativeai as genai
import nltk
from google.colab import files
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Download the NLTK resources used by this notebook.
# - 'stopwords': preprocess_text reads its "indonesian" word list.
# - 'punkt' / 'punkt_tab': tokenizer data, presumably required by word_tokenize — TODO confirm.
# - 'wordnet': NOTE(review): not referenced by any code visible in this notebook —
#   confirm before removing.
# The last call is the cell's final expression, so its return value is what the
# cell displays.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Gemini API key configuration.
# The key is read from the GEMINI_API_KEY environment variable, falling back to a
# hidden interactive prompt, so the secret is never stored in the notebook itself.
API_KEY = os.environ.get("GEMINI_API_KEY") or getpass.getpass("Gemini API key: ")
genai.configure(api_key=API_KEY)

In [5]:
# Extract the text of a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages joined with a single space, with leading and
        trailing whitespace stripped. A document without pages yields ``""``.
    """
    # Open the document as a context manager so its handle is released even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids the repeated string re-allocation of `text += ...`.
        return " ".join(page.get_text("text") for page in doc).strip()

# Text preprocessing helpers
@functools.lru_cache(maxsize=1)
def _indonesian_stop_words():
    """Return NLTK's "indonesian" stop-word list as a frozenset (built once, then cached)."""
    return frozenset(stopwords.words("indonesian"))


@functools.lru_cache(maxsize=1)
def _sastrawi_stemmer():
    """Return a shared Sastrawi stemmer (built once, then cached)."""
    return StemmerFactory().create_stemmer()


def preprocess_text(text):
    """Normalise ``text`` into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, tokenize it with ``word_tokenize``,
    keep only alphanumeric tokens, drop Indonesian stop words, and stem each
    remaining token with the Sastrawi stemmer.

    Args:
        text: Raw text to normalise.

    Returns:
        The surviving stemmed tokens joined by single spaces (``""`` when no
        token survives).
    """
    # The stop-word set and the stemmer do not depend on `text`; they come from
    # cached helpers so they are not rebuilt for every document.
    stop_words = _indonesian_stop_words()
    stemmer = _sastrawi_stemmer()

    tokens = word_tokenize(text.lower())
    kept = (tok for tok in tokens if tok.isalnum() and tok not in stop_words)
    return " ".join(stemmer.stem(tok) for tok in kept)

# TF-IDF vectorisation helper
def tfidf_vectorize(texts):
    """Fit a default ``TfidfVectorizer`` on ``texts``.

    Returns:
        A ``(matrix, vectorizer)`` pair: the result of ``fit_transform(texts)``
        and the vectorizer instance that produced it.
    """
    fitted = TfidfVectorizer()
    return fitted.fit_transform(texts), fitted

# Gemini-backed helpers: summary, entity extraction and document classification.
# All three share one request helper and one configurable model name.
DEFAULT_GEMINI_MODEL = "gemini-1.5-pro"


def _ask_gemini(prompt, model_name=DEFAULT_GEMINI_MODEL):
    """Send ``prompt`` to the Gemini model ``model_name`` and return the reply text.

    NOTE(review): the SDK's ``.text`` accessor may raise when the response holds
    no text (e.g. a blocked prompt) — confirm and handle at the call site if needed.
    """
    model = genai.GenerativeModel(model_name)
    return model.generate_content(prompt).text


def summarize_text(text, model_name=DEFAULT_GEMINI_MODEL):
    """Ask Gemini to summarise ``text`` and return the model's reply as a string.

    Args:
        text: Document text embedded verbatim in the prompt.
        model_name: Gemini model identifier; defaults to ``DEFAULT_GEMINI_MODEL``.
    """
    return _ask_gemini(f"Ringkas teks berikut:\n\n{text}", model_name)


def extract_entities(text, model_name=DEFAULT_GEMINI_MODEL):
    """Ask Gemini for the main entities in ``text`` and return the model's reply as a string.

    Args:
        text: Document text embedded verbatim in the prompt.
        model_name: Gemini model identifier; defaults to ``DEFAULT_GEMINI_MODEL``.
    """
    return _ask_gemini(f"Temukan entitas utama dalam teks ini:\n\n{text}", model_name)


def classify_document(text, model_name=DEFAULT_GEMINI_MODEL):
    """Ask Gemini to classify ``text`` by its content and return the model's reply as a string.

    Args:
        text: Document text embedded verbatim in the prompt.
        model_name: Gemini model identifier; defaults to ``DEFAULT_GEMINI_MODEL``.
    """
    return _ask_gemini(f"Klasifikasikan dokumen ini berdasarkan isi:\n\n{text}", model_name)

In [6]:
# Upload PDF file(s) through the Google Colab upload widget.
# `uploaded` is iterated by its keys in later cells, where each key is used as
# the file name/path of an uploaded PDF.
print("Silakan upload file PDF")
uploaded = files.upload()

Silakan upload file PDF


Saving TOR Alkademi Talks Maret 2025.pdf to TOR Alkademi Talks Maret 2025.pdf


In [7]:
# Result accumulators for the uploaded PDFs: preprocessed text, Gemini summary,
# Gemini entity listing and Gemini category, one entry per processed file.
pdf_texts, summaries, entities, categories = [], [], [], []

In [8]:
# Start from empty accumulators so that re-running this cell does not append
# duplicate entries to results left over from a previous run.
pdf_texts, summaries, entities, categories = [], [], [], []

for filename in uploaded.keys():
    print(f"Memproses: {filename}")
    text = extract_text_from_pdf(filename)
    processed_text = preprocess_text(text)

    # Use Gemini for the higher-level analysis (run on the raw, unprocessed text)
    summary = summarize_text(text)
    entity_info = extract_entities(text)
    category = classify_document(text)

    # Append only after every step succeeded, so the four lists always stay
    # index-aligned even if a Gemini call fails part-way through a file.
    pdf_texts.append(processed_text)
    summaries.append(summary)
    entities.append(entity_info)
    categories.append(category)

# Fail with a clear message instead of the vectorizer's empty-input error
if not pdf_texts:
    raise ValueError("Tidak ada file PDF yang diunggah; unggah minimal satu file terlebih dahulu.")

# TF-IDF representation of the preprocessed texts
tfidf_matrix, vectorizer = tfidf_vectorize(pdf_texts)

Memproses: TOR Alkademi Talks Maret 2025.pdf


In [9]:
# Show the TF-IDF matrix as a dense array
print("\nTF-IDF Representation (Array):")
print(tfidf_matrix.toarray())

# Show the feature names (vocabulary terms) learned by the TF-IDF vectorizer
print("\nFeature Names:")
print(vectorizer.get_feature_names_out())

# Show the Gemini analysis for every uploaded file; each section is looked up
# by the file's position in `uploaded`.
gemini_sections = (
    ("Ringkasan", summaries),
    ("Entitas Utama", entities),
    ("Kategori Dokumen", categories),
)
for i, filename in enumerate(uploaded.keys()):
    print(f"\n📄 **Analisis untuk {filename}:**")
    for section_title, section_results in gemini_sections:
        print(f"🔹 **{section_title}:** {section_results[i]}")


TF-IDF Representation (Array):
[[0.03123475 0.03123475 0.03123475 0.40605178 0.03123475 0.03123475
  0.03123475 0.03123475 0.03123475 0.03123475 0.03123475 0.03123475
  0.0624695  0.03123475 0.53099079 0.03123475 0.03123475 0.43728653
  0.03123475 0.03123475 0.03123475 0.03123475 0.03123475 0.0624695
  0.0624695  0.03123475 0.03123475 0.0624695  0.03123475 0.03123475
  0.03123475 0.03123475 0.03123475 0.0624695  0.03123475 0.03123475
  0.03123475 0.0624695  0.03123475 0.03123475 0.0624695  0.03123475
  0.03123475 0.03123475 0.03123475 0.03123475 0.03123475 0.03123475
  0.03123475 0.03123475 0.03123475 0.03123475 0.03123475 0.03123475
  0.03123475 0.03123475 0.03123475 0.0624695  0.09370426 0.03123475
  0.03123475 0.03123475 0.03123475 0.03123475 0.03123475 0.03123475
  0.03123475 0.03123475 0.0624695  0.03123475 0.03123475 0.03123475
  0.03123475 0.03123475 0.03123475 0.03123475 0.03123475 0.03123475
  0.03123475 0.03123475 0.09370426 0.03123475 0.03123475 0.03123475
  0.03123475 0.03