In [None]:
# !pip install pyspellchecker

In [None]:
# !pip install transformers torch

In [None]:
# Environment setup: only the transformers upgrade at the bottom of this cell
# actually runs; the other install commands are commented out.
# !pip install --upgrade sqlalchemy

# !pip install transformers dataset
# # !pip install transformers torch
!pip install --upgrade transformers


[31mERROR: Operation cancelled by user[0m[31m
[0m^C


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import difflib
from collections import defaultdict
import re
import pandas as pd

# Preprocessing
def preprocess_text(text):
    """Normalize raw text before correction.

    Removes every character that is neither a word character nor whitespace,
    collapses each whitespace run into one space, then lowercases the result
    and trims leading/trailing whitespace.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    no_punctuation = re.sub(r'[^\w\s]', '', text)  # drop punctuation
    single_spaced = re.sub(r'\s+', ' ', no_punctuation)  # squeeze whitespace runs
    lowered = single_spaced.lower()
    return lowered.strip()

# Split a trailing suffix off a word (e.g., sinyalnya -> sinyal + nya)
def decompose_suffix(word):
    """Split a known suffix off ``word`` when the remaining stem is a known word.

    The suffixes are tried in a fixed order. A split is accepted only when the
    stem keeps at least two characters and is found in the module-level
    ``word_list``.

    Args:
        word: The word to decompose.

    Returns:
        ``(stem, suffix)`` for the first accepted split, otherwise
        ``(word, '')``.
    """
    for ending in ('ku', 'mu', 'nya', 'lah', 'kah', 'tah', 'pun'):
        if not word.endswith(ending):
            continue
        stem = word[:len(word) - len(ending)]  # the word with the suffix removed
        if len(stem) < 2:
            continue
        if stem in word_list:
            return stem, ending
    return word, ''

# Load models and resources
def load_resources():
    """Load the tokenizer, masked-LM model, vocabulary and slang dictionary.

    Reads ``wordlist.txt`` from the working directory and the slang CSV from
    the Hugging Face hub; errors from either source propagate to the caller.

    Returns:
        tuple: ``(tokenizer, model, word_list, slang_map)`` where
            ``word_list`` is a set holding every non-blank, lowercased line of
            ``wordlist.txt`` plus each of those entries with 'ku', 'mu' and
            'nya' appended, and ``slang_map`` is a ``defaultdict(list)``
            mapping a slang term to its formal replacements in CSV row order.
    """
    # NOTE(review): the saved output of this notebook shows transformers warning
    # that the `cls.predictions.*` weights are newly initialized for this
    # checkpoint, i.e. the masked-LM head is untrained. Confirm the MLM scores
    # are meaningful, or switch to a checkpoint that ships the head.
    tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
    model = AutoModelForMaskedLM.from_pretrained("indobenchmark/indobert-base-p1")

    def load_words(filepath):
        """Return the lowercased, stripped, non-blank lines of ``filepath`` as a set."""
        with open(filepath, 'r', encoding='utf-8') as file:
            lines = file.read().lower().splitlines()
        # Blank lines would otherwise put '' in the vocabulary (and therefore
        # bare 'ku'/'mu'/'nya' once suffixes are appended below).
        return {entry for entry in (line.strip() for line in lines) if entry}

    base_words = load_words('wordlist.txt')
    # Accept each base word with the possessive suffixes attached as well.
    word_list = base_words | {
        word + suf for word in base_words for suf in ('ku', 'mu', 'nya')
    }

    df = pd.read_csv("hf://datasets/theonlydo/indonesia-slang/slang-indo.csv")
    slang_map = defaultdict(list)
    # Drop rows with a missing cell first: str(NaN) is the truthy string 'nan',
    # which would otherwise be stored as a slang term or as a correction.
    pairs = df[['slang', 'formal']].dropna()
    for slang_raw, formal_raw in zip(pairs['slang'], pairs['formal']):
        slang = str(slang_raw).lower().strip()
        correction = str(formal_raw).lower().strip()
        if slang and correction:
            slang_map[slang].append(correction)

    return tokenizer, model, word_list, slang_map

# Load once at module level. decompose_suffix and correct_text_pipeline read
# tokenizer, model, word_list and slang_map as globals, so this line must run
# before either of them is called.
tokenizer, model, word_list, slang_map = load_resources()

# Get candidate corrections by string similarity (difflib close matches)
def get_candidates(word, word_list, n=5, cutoff=0.6):
    """Return up to ``n`` vocabulary entries that resemble ``word``.

    A word already present in ``word_list`` is returned unchanged. Otherwise
    difflib close matches are searched at progressively looser thresholds:
    ``cutoff`` first, then ``cutoff - 0.1`` (only when ``cutoff`` exceeds 0.5),
    then 0.4 (only for words of four or more characters). The first threshold
    that yields any match wins.

    Args:
        word: The word to look up.
        word_list: Collection of valid words.
        n: Maximum number of matches to return.
        cutoff: Initial similarity threshold passed to difflib.

    Returns:
        A list of matches, or ``[word]`` when nothing is similar enough.
    """
    if word in word_list:
        return [word]

    thresholds = [cutoff]
    if cutoff > 0.5:
        thresholds.append(cutoff - 0.1)
    if len(word) >= 4:
        thresholds.append(0.4)

    for threshold in thresholds:
        matches = difflib.get_close_matches(word, word_list, n=n, cutoff=threshold)
        if matches:
            return matches
    return [word]

# Highlight difference
def highlight_diff(original, corrected):
    """Render ``corrected`` with every changed word run wrapped in brackets.

    Both strings are split on whitespace and compared word by word. Unchanged
    words are copied from ``corrected``; each replaced, inserted or deleted
    run becomes one ``[...]`` group holding the corrected words for that run
    (empty brackets for a pure deletion).

    Args:
        original: The text before correction.
        corrected: The text after correction.

    Returns:
        The annotated string, words joined by single spaces.
    """
    source_tokens = original.split()
    target_tokens = corrected.split()
    opcodes = difflib.SequenceMatcher(None, source_tokens, target_tokens).get_opcodes()

    pieces = []
    for op, _src_start, _src_end, dst_start, dst_end in opcodes:
        segment = target_tokens[dst_start:dst_end]
        if op == 'equal':
            pieces += segment
        elif op in ('replace', 'delete', 'insert'):
            pieces.append("[" + ' '.join(segment) + "]")
    return ' '.join(pieces)

# Correction pipeline
def _rank_candidates_with_mlm(context_words, candidates):
    """Score ``candidates`` for the single mask token inside ``context_words``.

    Uses the module-level ``tokenizer`` and ``model``. Each candidate's score
    is the mean of the mask-position logits of its token ids.

    Returns:
        ``(candidate, score)`` pairs sorted best-first, or an empty list when
        the mask token is absent from the encoded input or no candidate
        encodes to at least one token id.
    """
    masked_text = ' '.join(context_words)
    # Truncate to the model's position limit so over-long input degrades to the
    # caller's similarity fallback instead of failing inside the model.
    inputs = tokenizer(
        masked_text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Column indices of the mask token in the encoded sequence.
    mask_positions = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
    if mask_positions.size(0) == 0:
        return []

    with torch.no_grad():  # inference only, no gradient bookkeeping
        logits = model(**inputs).logits
    predictions = logits[0, mask_positions[0]]

    scored = []
    for candidate in candidates:
        candidate_ids = tokenizer.encode(candidate, add_special_tokens=False)
        if candidate_ids:
            mean_logit = sum(predictions[token_id].item() for token_id in candidate_ids) / len(candidate_ids)
            scored.append((candidate, mean_logit))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored


def correct_text_pipeline(text, top_k=5, verbose=True):
    """Correct slang and misspellings in ``text`` word by word.

    The text is normalized with ``preprocess_text`` and each word goes through
    these steps, stopping at the first that applies:

    1. Slang lookup: replaced by the first formal form in ``slang_map``.
    2. Vocabulary lookup: kept when it is in ``word_list``.
    3. Suffix split: kept when ``decompose_suffix`` finds a known stem.
    4. Similarity: ``get_candidates`` proposes replacements; a single
       candidate is taken as is.
    5. Masked LM: several candidates are ranked by the model, with the word
       masked inside the sentence. If ranking yields nothing, the first
       similarity candidate is used.

    Relies on the module-level ``tokenizer``, ``model``, ``word_list`` and
    ``slang_map``.

    Args:
        text: The sentence to correct.
        top_k: Maximum number of similarity candidates per word.
        verbose: When true (the default), print the ranked MLM candidates for
            every word that reaches step 5.

    Returns:
        The corrected sentence, words joined by single spaces.
    """
    words = preprocess_text(text.strip()).split()
    corrected_words = []

    for i, word in enumerate(words):
        # Step 1: slang
        if word in slang_map:
            corrected_words.append(slang_map[word][0])
            continue

        # Step 2: already a valid word
        if word in word_list:
            corrected_words.append(word)
            continue

        # Step 2.5: valid stem + known suffix
        base_word, suffix = decompose_suffix(word)
        if base_word in word_list:
            corrected_words.append(base_word + suffix)
            continue

        # Step 3: similarity candidates (never empty; falls back to [word])
        candidates = get_candidates(word, word_list, n=top_k)
        if len(candidates) == 1:
            corrected_words.append(candidates[0])
            continue

        # Step 4: masked language model. The left context uses the words that
        # were already corrected, so earlier slang/typos do not pollute the
        # sentence the model sees; the right context is still raw.
        context = corrected_words + [tokenizer.mask_token] + words[i + 1:]
        candidate_scores = _rank_candidates_with_mlm(context, candidates)

        if candidate_scores:
            if verbose:
                print(f"\n[Info] Kata salah: '{word}' → Kandidat MLM:")
                for cand, score in candidate_scores:
                    print(f"  - {cand:<15} | skor: {score:.4f}")
            corrected_words.append(candidate_scores[0][0])
        else:
            corrected_words.append(candidates[0])

    return ' '.join(corrected_words)

# Interactive CLI
def interactive_correction():
    """Run a prompt loop that corrects each sentence the user types.

    For every non-empty line the original text, the corrected text and a
    bracketed highlight of the changes are printed. The loop ends when the
    user enters 'stop', 'exit' or 'quit' (case-insensitive), or when input is
    closed or interrupted.
    """
    print("Indonesian Text Correction System")
    print("Type 'stop' to exit\n")

    while True:
        try:
            user_input = input("Masukkan kalimat: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin closed or the prompt was interrupted: leave without a traceback
            print()
            break
        if user_input.lower() in ('stop', 'exit', 'quit'):
            break
        if not user_input:
            continue

        corrected = correct_text_pipeline(user_input)
        print("\nOriginal:", user_input)
        print("Corrected:", corrected)
        # Diff against the normalized input: the pipeline lowercases and strips
        # punctuation, so comparing with the raw text would bracket words that
        # were only normalized, not corrected.
        print("Highlight:", highlight_diff(preprocess_text(user_input), corrected))
        print()

# Run the interactive prompt when this code executes as the main module.
if __name__ == "__main__":
    interactive_correction()


Some weights of BertForMaskedLM were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Indonesian Text Correction System
Type 'stop' to exit

Masukkan kalimat: kmu jgn jalan kejauhan

Original: kmu jgn jalan kejauhan
Corrected: kamu jangan jalan kejauhan
Highlight: [kamu jangan] jalan kejauhan

Masukkan kalimat: stop
