## SOAL 1

In [None]:
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# 1. Setup - create the text file and read it back
# The sample paragraph deliberately contains digits and stray symbols
# ("12 % $ 34 ; &*") so the cleaning step below has something to remove.
paragraph = """
Pemerintah Indonesia terus berupaya meningkatkan kesejahteraan rakyat melalui berbagai program sosial. 12 % $ 34 ; &* Salah satu fokus utama adalah mengurangi kesenjangan ekonomi antara wilayah perkotaan dan pedesaan. Program seperti bantuan sosial, kartu prakerja, dan subsidi pendidikan diharapkan dapat membantu masyarakat kurang mampu.
"""

# Save the paragraph to a text file
with open('artikel.txt', 'w', encoding='utf-8') as file:
    file.write(paragraph)

# Read the text file back; `text` is the input to the preprocessing pipeline
with open('artikel.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# 2. Preprocessing teks
# Case folding
def case_folding(text):
    """Normalize raw text before tokenization.

    The text is lowercased, digits and punctuation are deleted (not replaced
    by a space), and every run of whitespace is collapsed to a single space
    with leading/trailing whitespace removed.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; an empty string if nothing but digits,
        punctuation and whitespace was supplied.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # drop digits
    # `\w` also matches "_", so `[^\w\s]` alone would let underscores through;
    # remove them explicitly together with the rest of the punctuation.
    text = re.sub(r'[^\w\s]|_', '', text)
    # split()/join trims both ends and collapses internal whitespace runs
    return ' '.join(text.split())

# Tokenisasi
def tokenize(text):
    """Split text into a list of tokens on runs of whitespace.

    Args:
        text: The string to split.

    Returns:
        The list of whitespace-separated tokens (empty for blank input).
    """
    tokens = text.split()
    return tokens

# Menghapus stopwords
def remove_stopwords(tokens):
    """Filter out tokens that appear in Sastrawi's stop-word list.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list with the tokens that are not stop words, in their
        original order.
    """
    # Fetch the list once (not once per token) and keep it in a set so each
    # membership test is O(1).
    stop_words = set(StopWordRemoverFactory().get_stop_words())
    return [word for word in tokens if word not in stop_words]

# Stemming
def stem_words(tokens):
    """Stem every token with a stemmer created by Sastrawi's StemmerFactory.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list with the stemmed form of each token, in the same order.
    """
    stemmer = StemmerFactory().create_stemmer()
    return list(map(stemmer.stem, tokens))

# Pipeline preprocessing
def preprocess(text):
    """Run the full preprocessing pipeline on raw text.

    The stages are applied in order: case_folding -> tokenize ->
    remove_stopwords -> stem_words.

    Args:
        text: The raw input string.

    Returns:
        The list of tokens produced by the final stemming stage.
    """
    cleaned = case_folding(text)
    return stem_words(remove_stopwords(tokenize(cleaned)))

# Run the preprocessing pipeline on the text read from artikel.txt and show the tokens
preprocessed_text = preprocess(text)
print("Hasil Preprocessing:")
print(preprocessed_text)

Hasil Preprocessing:
['perintah', 'indonesia', 'terus', 'upaya', 'tingkat', 'sejahtera', 'rakyat', 'lalu', 'bagai', 'program', 'sosial', 'salah', 'satu', 'fokus', 'utama', 'kurang', 'senjang', 'ekonomi', 'wilayah', 'kota', 'desa', 'program', 'bantu', 'sosial', 'kartu', 'prakerja', 'subsidi', 'didik', 'harap', 'bantu', 'masyarakat', 'kurang', 'mampu']


## SOAL 3


In [None]:
import pandas as pd
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The document collection and the query; terms are separated by single spaces
doc1 = "adik belajar baca kakak olahraga"
doc2 = "murid belajar tulis baca"
doc3 = "kakak belajar komputer"
query = "belajar baca"

# 1. Membuat tabel frekuensi
def create_freq_table(docs):
    """Build a document-term frequency table.

    Args:
        docs: Iterable of documents, each a whitespace-separated string.

    Returns:
        A pandas DataFrame with one row per document (labelled 'Doc 1',
        'Doc 2', ...) and one column per distinct term in sorted order. Each
        cell holds the number of times the term occurs in that document.
    """
    from collections import Counter

    # Tokenize and count each document exactly once; the counters feed both
    # the vocabulary and the table rows.
    counts_per_doc = [Counter(doc.split()) for doc in docs]
    vocab = sorted(set().union(*counts_per_doc))

    # A Counter yields 0 for absent terms, so every row spans the full vocabulary.
    rows = [{term: counts[term] for term in vocab} for counts in counts_per_doc]
    labels = [f'Doc {i}' for i in range(1, len(rows) + 1)]
    return pd.DataFrame(rows, index=labels)

# Build and print the term-frequency table for the three documents
docs = [doc1, doc2, doc3]
freq_table = create_freq_table(docs)
print("\nTabel Frekuensi:")
print(freq_table)

# 2. Menghitung TF-IDF
def calculate_tfidf(freq_table):
    """Weight a term-frequency table by inverse document frequency.

    TF is the raw count taken from ``freq_table``. IDF for a term is
    ``ln(N / df)``, where ``N`` is the number of rows and ``df`` the number of
    rows in which the term's count is positive; a term with ``df == 0`` gets
    weight 0.

    Args:
        freq_table: DataFrame with one row per document and one column per term.

    Returns:
        A new DataFrame of the same shape holding ``tf * idf``; the input is
        not modified.
    """
    n_docs = len(freq_table)

    # Inverse document frequency per term
    weights = {}
    for term in freq_table.columns:
        doc_count = (freq_table[term] > 0).sum()
        weights[term] = 0 if doc_count == 0 else math.log(n_docs / doc_count)

    # Scale each term column of a copy by its IDF weight
    weighted = freq_table.copy()
    for term, weight in weights.items():
        weighted[term] = weighted[term] * weight

    return weighted

# Compute and print the manual TF-IDF weights from the frequency table
tfidf_table = calculate_tfidf(freq_table)
print("\nHasil TF-IDF:")
print(tfidf_table)

# 3. Compute cosine similarity between the query and each document
# NOTE(review): these scores are computed from scikit-learn's TfidfVectorizer,
# not from the manual tfidf_table above, and the two weightings disagree: the
# manual table gives "belajar" a weight of 0.0 in every document, yet Doc 3,
# which shares only "belajar" with the query, still gets a non-zero score here.
# Per the scikit-learn docs the vectorizer defaults to a smoothed IDF and L2
# normalization - confirm which weighting the exercise expects.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)
# The query is projected onto the vocabulary learned from the documents
query_vec = vectorizer.transform([query])

cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
print("\nCosine Similarity dengan query 'belajar baca':")
for i, score in enumerate(cosine_similarities, 1):
    print(f"Doc {i}: {score:.4f}")


Tabel Frekuensi:
       adik  baca  belajar  kakak  komputer  murid  olahraga  tulis
Doc 1     1     1        1      1         0      0         1      0
Doc 2     0     1        1      0         0      1         0      1
Doc 3     0     0        1      1         1      0         0      0

Hasil TF-IDF:
           adik      baca  belajar     kakak  komputer     murid  olahraga  \
Doc 1  1.098612  0.405465      0.0  0.405465  0.000000  0.000000  1.098612   
Doc 2  0.000000  0.405465      0.0  0.000000  0.000000  1.098612  0.000000   
Doc 3  0.000000  0.000000      0.0  0.405465  1.098612  0.000000  0.000000   

          tulis  
Doc 1  0.000000  
Doc 2  1.098612  
Doc 3  0.000000  

Cosine Similarity dengan query 'belajar baca':
Doc 1: 0.5143
Doc 2: 0.5628
Doc 3: 0.2609
