In [1]:
pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import string
from docx import Document
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import ipywidgets as widgets
from IPython.display import display

# Read the text of a DOCX file
def read_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file as one string.

    Args:
        file_path: Location of the file, handed unchanged to ``Document``.

    Returns:
        The text of each paragraph, in document order, each followed by a
        newline character. A document without paragraphs gives ``""``.
    """
    document = Document(file_path)
    return "".join(para.text + "\n" for para in document.paragraphs)

# Preprocessing: case folding, punctuation removal, tokenization,
# stop-word filtering and stemming.
#
# The stop-word set and the stemmer do not depend on the input text, so they
# are created on first use and reused by every later call instead of being
# rebuilt for each document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, stemmer)``, creating both on the first call."""
    if not _PREPROCESS_RESOURCES:
        stop_words = frozenset(stopwords.words('indonesian'))
        stemmer = StemmerFactory().create_stemmer()
        # Stored only after both were created, so a failed load is retried
        # on the next call instead of leaving a half-filled cache.
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, stemmer=stemmer)
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["stemmer"]


def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop tokens found
    in the NLTK ``'indonesian'`` stop-word list, stem each remaining token,
    and keep only stemmed tokens for which ``str.isalnum()`` is true (which
    also discards empty strings).

    Args:
        text: The raw document text.

    Returns:
        The surviving tokens joined by single spaces; ``""`` if none survive.
    """
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)

    stop_words, stemmer = _get_preprocess_resources()
    stemmed_tokens = (
        stemmer.stem(token) for token in tokens if token not in stop_words
    )
    return ' '.join(token for token in stemmed_tokens if token.isalnum())

# Button callback: show word frequencies and TF-IDF weights for each file
def display_results(b):
    """Print word counts and TF-IDF weights for the DOCX files of the selected folder.

    The folder is ``folder_selector.value`` under ``base_directory``. Each
    ``.docx`` file in it is read with ``read_text_from_docx`` and normalized
    with ``preprocess_text``; the results are printed into ``results_output``.

    Only names ending in ``.docx`` are processed, and names starting with
    ``~$`` are skipped, so other files in the folder are never opened as DOCX.
    Files are handled in sorted name order.

    TF-IDF is fitted once over all files of the folder, so the IDF part
    reflects how many of those files contain a term. For each file only the
    terms with a non-zero weight are listed.

    Args:
        b: The clicked button as passed by ``on_click``; not used.
    """
    results_output.clear_output()
    folder_path = os.path.join(base_directory, folder_selector.value)

    # All work happens inside the output widget context so that everything
    # printed here, including the problem messages below, lands in the widget.
    with results_output:
        if not os.path.isdir(folder_path):
            print(f'Folder tidak ditemukan: {folder_path}')
            return

        docx_files = sorted(
            name for name in os.listdir(folder_path)
            if name.lower().endswith('.docx') and not name.startswith('~$')
        )
        if not docx_files:
            print(f'Tidak ada berkas .docx di folder: {folder_path}')
            return

        preprocessed_texts = [
            preprocess_text(read_text_from_docx(os.path.join(folder_path, name)))
            for name in docx_files
        ]

        # One vectorizer for the whole folder: row i belongs to docx_files[i].
        tfidf_vectorizer = TfidfVectorizer()
        terms = ()
        try:
            tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_texts)
        except ValueError as error:
            # Raised e.g. when no file yields any term; word counts are
            # still shown, the TF-IDF section stays empty.
            tfidf_matrix = None
            print(f'TF-IDF tidak dapat dihitung: {error}')
        else:
            terms = tfidf_vectorizer.get_feature_names_out()

        for row, (filename, preprocessed_text) in enumerate(
            zip(docx_files, preprocessed_texts)
        ):
            # Word frequencies
            word_frequency = Counter(word_tokenize(preprocessed_text))
            word_freq_output = "".join(
                f'{word} {freq}\n' for word, freq in word_frequency.items()
            )

            # TF-IDF weights of this file (terms absent from it have weight 0)
            tfidf_output = ""
            if tfidf_matrix is not None:
                weights = tfidf_matrix[row].toarray().ravel()
                tfidf_output = "".join(
                    f'{term} : {weight}\n'
                    for term, weight in zip(terms, weights)
                    if weight > 0
                )

            print(f'Nama Berkas: {filename}')
            print('Hasil Frekuensi Kata:')
            print(word_freq_output)
            print('Hasil TF-IDF:')
            print(tfidf_output)
            print("--------------------------------------------------")

# Source and result directories.
# Raw strings keep the Windows backslashes literal; in a normal string,
# sequences such as backslash-P or backslash-S are invalid escape sequences.
base_directory = r"D:\PENS22\SEMESTER 3\TEXT MINING\Source"
hasil_directory = r"D:\PENS22\SEMESTER 3\TEXT MINING\Destinasi"

# Build the three widgets: folder picker, trigger button, and the output
# area that display_results clears and prints into.
folders_to_check = ['Kesehatan', 'Teknologi', 'Politik', 'Olahraga', 'Kriminal']
folder_selector = widgets.Dropdown(
    description='Pilih Folder:',
    options=folders_to_check,
)
show_results_button = widgets.Button(description="Tampilkan Hasil")
results_output = widgets.Output()

# Clicking the button runs display_results
show_results_button.on_click(display_results)

# Show them in order: dropdown, button, output area
display(folder_selector, show_results_button, results_output)


Dropdown(description='Pilih Folder:', options=('Kesehatan', 'Teknologi', 'Politik', 'Olahraga', 'Kriminal'), v…

Button(description='Tampilkan Hasil', style=ButtonStyle())

Output()