In [None]:
!pip install pymupdf Sastrawi nltk scikit-learn google-generativeai



In [None]:
import functools
import getpass
import os
import string

import fitz  # PyMuPDF, used to read PDF files
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
import google.generativeai as genai
from google.colab import files

In [None]:
# Fetch the NLTK resources used by this notebook:
#   - "stopwords": read through stopwords.words("indonesian") during preprocessing
#   - "punkt" / "punkt_tab": presumably the tokenizer data needed by word_tokenize
for resource_name in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource_name)

# NOTE(review): no visible cell uses "wordnet" -- confirm whether it is still needed.
# Kept as the final expression so the cell still displays the call's return value.
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Gemini API key configuration.
# Never hardcode the key in the notebook: read it from the GEMINI_API_KEY
# environment variable, and fall back to a hidden interactive prompt so the
# secret is neither stored in the file nor echoed into the cell output.
API_KEY = os.environ.get("GEMINI_API_KEY") or getpass.getpass("Masukkan API key Gemini: ")
genai.configure(api_key=API_KEY)

In [None]:
# Extract the text of a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in the PDF at ``pdf_path``.

    Pages are read in document order with ``page.get_text("text")`` and
    separated by a single space; leading and trailing whitespace is stripped
    from the final result.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The concatenated page text (empty string if no page yields text).
    """
    # The context manager closes the document even if reading a page fails;
    # previously the handle was left open.
    with fitz.open(pdf_path) as doc:
        return " ".join(page.get_text("text") for page in doc).strip()

# Text preprocessing
@functools.lru_cache(maxsize=1)
def _indonesian_stopwords():
    """Build the NLTK Indonesian stop-word set once and reuse it across calls."""
    return frozenset(stopwords.words("indonesian"))


@functools.lru_cache(maxsize=1)
def _indonesian_stemmer():
    """Create the Sastrawi stemmer once and reuse it across calls."""
    return StemmerFactory().create_stemmer()


def preprocess_text(text):
    """Normalize ``text`` into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text,
      2. tokenize it with ``word_tokenize``,
      3. keep only tokens for which ``str.isalnum()`` is true,
      4. drop tokens found in NLTK's "indonesian" stop-word list,
      5. stem each remaining token with the Sastrawi stemmer.

    Args:
        text: Raw document text.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    # The stop-word set and the stemmer do not depend on the input, so they are
    # built once (cached helpers) instead of on every call.
    stop_words = _indonesian_stopwords()
    stemmer = _indonesian_stemmer()

    stemmed_tokens = (
        stemmer.stem(word)
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    )
    return " ".join(stemmed_tokens)

# TF-IDF vectorization
def tfidf_vectorize(texts):
    """Fit a default ``TfidfVectorizer`` on ``texts`` and transform them.

    Args:
        texts: Iterable of document strings, passed to ``fit_transform``.

    Returns:
        tuple: ``(tfidf_matrix, vectorizer)`` -- the result of ``fit_transform``
        and the fitted vectorizer that produced it.
    """
    fitted_vectorizer = TfidfVectorizer()
    return fitted_vectorizer.fit_transform(texts), fitted_vectorizer

# Get a summary from the Gemini API
def summarize_text(text, model_name="gemini-2.0-flash"):
    """Ask a Gemini model to summarize ``text``.

    The prompt is in Indonesian ("Ringkas teks berikut" = "Summarize the
    following text") followed by the full text.

    Args:
        text: Document text to summarize.
        model_name: Name of the Gemini model to use. Defaults to the model that
            was previously hardcoded, so existing callers are unaffected.

    Returns:
        The ``text`` attribute of the generation response.
    """
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(f"Ringkas teks berikut:\n\n{text}")
    return response.text

# Get the main entities from the Gemini API
def extract_entities(text, model_name="gemini-2.0-flash"):
    """Ask a Gemini model to list the main entities found in ``text``.

    The prompt is in Indonesian ("Temukan entitas utama dalam teks ini" =
    "Find the main entities in this text") followed by the full text.

    Args:
        text: Document text to analyze.
        model_name: Name of the Gemini model to use. Defaults to the model that
            was previously hardcoded, so existing callers are unaffected.

    Returns:
        The ``text`` attribute of the generation response.
    """
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(f"Temukan entitas utama dalam teks ini:\n\n{text}")
    return response.text

# Classify a document with the Gemini API
def classify_document(text, model_name="gemini-2.0-flash"):
    """Ask a Gemini model to classify the document based on its content.

    The prompt is in Indonesian ("Klasifikasikan dokumen ini berdasarkan isi" =
    "Classify this document based on its content") followed by the full text.

    Args:
        text: Document text to classify.
        model_name: Name of the Gemini model to use. Defaults to the model that
            was previously hardcoded, so existing callers are unaffected.

    Returns:
        The ``text`` attribute of the generation response.
    """
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(f"Klasifikasikan dokumen ini berdasarkan isi:\n\n{text}")
    return response.text

In [None]:
# Upload PDF file(s) into the Google Colab runtime.
# The result is kept in `uploaded`; later cells iterate over `uploaded.keys()`
# and pass each key to extract_text_from_pdf as the file path.
print("Silakan upload file PDF")
uploaded = files.upload()

Silakan upload file PDF


Saving 242011017_Muhammad Ikhwan Fathulloh_SYSTEM ENTERPRISE MUTAKHIR.pdf to 242011017_Muhammad Ikhwan Fathulloh_SYSTEM ENTERPRISE MUTAKHIR.pdf


In [None]:
# Per-file result accumulators for the uploaded PDFs, filled in upload order:
# preprocessed text, Gemini summary, Gemini entity list, Gemini category.
# The generator yields a distinct list for each name (no shared aliasing).
pdf_texts, summaries, entities, categories = ([] for _ in range(4))

In [None]:
# Reset the accumulators in this cell so that re-running it does not append
# duplicate entries on top of the results of a previous run.
pdf_texts, summaries, entities, categories = [], [], [], []

# Fail early with a clear message instead of a vectorizer error further down.
if not uploaded:
    raise ValueError("Tidak ada file PDF yang diunggah.")

for filename in uploaded.keys():
    print(f"Memproses: {filename}")
    text = extract_text_from_pdf(filename)
    processed_text = preprocess_text(text)
    pdf_texts.append(processed_text)

    # Use Gemini for further analysis; it receives the raw extracted text,
    # not the stemmed/stop-word-filtered version.
    summary = summarize_text(text)
    entity_info = extract_entities(text)
    category = classify_document(text)

    summaries.append(summary)
    entities.append(entity_info)
    categories.append(category)

# TF-IDF representation of the preprocessed texts
tfidf_matrix, vectorizer = tfidf_vectorize(pdf_texts)

Memproses: 242011017_Muhammad Ikhwan Fathulloh_SYSTEM ENTERPRISE MUTAKHIR.pdf


In [None]:
# Show the TF-IDF matrix as a dense array
print("\nTF-IDF Representation (Array):")
dense_tfidf = tfidf_matrix.toarray()
print(dense_tfidf)

# Show the vocabulary terms selected by the TF-IDF vectorizer
print("\nFeature Names:")
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

# Show the Gemini analysis for each uploaded file; the result lists are
# indexed by the file's position in `uploaded`.
for position, filename in enumerate(uploaded):
    print(f"\n📄 **Analisis untuk {filename}:**")
    print(f"🔹 **Ringkasan:** {summaries[position]}")
    print(f"🔹 **Entitas Utama:** {entities[position]}")
    print(f"🔹 **Kategori Dokumen:** {categories[position]}")


TF-IDF Representation (Array):
[[0.14605156 0.02921031 0.02921031 0.02921031 0.02921031 0.02921031
  0.02921031 0.02921031 0.11684125 0.02921031 0.02921031 0.02921031
  0.02921031 0.02921031 0.02921031 0.02921031 0.02921031 0.02921031
  0.11684125 0.11684125 0.05842062 0.05842062 0.02921031 0.02921031
  0.02921031 0.02921031 0.02921031 0.02921031 0.02921031 0.05842062
  0.02921031 0.02921031 0.02921031 0.02921031 0.17526187 0.02921031
  0.02921031 0.11684125 0.02921031 0.02921031 0.02921031 0.11684125
  0.08763094 0.02921031 0.02921031 0.02921031 0.14605156 0.02921031
  0.02921031 0.02921031 0.02921031 0.05842062 0.05842062 0.02921031
  0.05842062 0.02921031 0.02921031 0.02921031 0.05842062 0.02921031
  0.05842062 0.14605156 0.02921031 0.02921031 0.02921031 0.05842062
  0.02921031 0.02921031 0.05842062 0.02921031 0.02921031 0.02921031
  0.02921031 0.02921031 0.17526187 0.02921031 0.02921031 0.02921031
  0.02921031 0.02921031 0.02921031 0.02921031 0.02921031 0.02921031
  0.02921031 0.0