# Menggunakan Library Google Translate

In [None]:
pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googl

In [None]:
from googletrans import Translator

def translate_text(text, src_lang='id', dest_lang='en', translator=None):
    """Translate ``text`` with googletrans and return the translated string.

    Args:
        text: Text to translate.
        src_lang: Source language code, forwarded to the translator as ``src``.
        dest_lang: Target language code, forwarded to the translator as ``dest``.
        translator: Optional object exposing ``translate(text, src=..., dest=...)``
            whose result has a ``.text`` attribute. Pass one to reuse a single
            client across many calls. When omitted, a new ``Translator()`` is
            created for this call, as before.

    Returns:
        The ``.text`` of the translation result, or ``""`` when ``text`` is
        ``None``, empty or whitespace-only (no request is made in that case).
    """
    # Nothing to translate: skip building a client and issuing a request.
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""
    if translator is None:
        translator = Translator()
    translation = translator.translate(text, src=src_lang, dest=dest_lang)
    return translation.text

# Example usage
# The sentence is split with implicit string concatenation. A backslash
# continuation inside the literal would glue "besok" and "untuk" together
# with no space between them.
teks = (
    "Selamat siang rekan rekan, menginformasikan perkuliahan besok "
    "untuk kelas CVL akan bertukar dengan PBAL."
)
hasil = translate_text(teks, src_lang='id', dest_lang='en')
print(hasil)


Good afternoon colleagues, informing the lecture tomorrow for the CVL class will exchange with Pbal.


# Menggunakan Dataset OPUS

In [None]:
import requests
import zipfile
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# === 1. Download and Extract Dataset ===
# Archive location: the URL path names the OPUS tldr-pages corpus (en-id pair).
url = "https://object.pouta.csc.fi/OPUS-tldr-pages/v2023-08-29/moses/en-id.txt.zip"
# Local file name the downloaded archive is written to.
output_zip = "en-id.txt.zip"
# Directory the archive is extracted into.
# NOTE(review): the folder is called "ccmatrix" although the URL above points at
# tldr-pages; it is only a local directory label.
output_dir = "ccmatrix"

def download_and_extract(url, output_zip, output_dir, timeout=(10, 60), chunk_size=1024 * 1024):
    """Download a zip archive from ``url`` and extract it into ``output_dir``.

    Args:
        url: Address of the zip archive.
        output_zip: Local path the archive is written to.
        output_dir: Directory the archive contents are extracted into.
        timeout: ``(connect, read)`` timeout in seconds passed to ``requests.get``
            so a stalled server cannot hang the notebook indefinitely.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: On connection errors or timeouts.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    print("Downloading dataset...")
    # Using the response as a context manager releases the streamed connection
    # even when the status check or the file write fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception(f"Failed to download dataset. Status code: {response.status_code}")
        with open(output_zip, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    print("Download completed.")

    print("Extracting dataset...")
    with zipfile.ZipFile(output_zip, "r") as zip_ref:
        zip_ref.extractall(output_dir)
    print("Extraction completed.")

download_and_extract(url, output_zip, output_dir)

Downloading dataset...
Download completed.
Extracting dataset...
Extraction completed.


In [None]:
# === 2. Load Dataset ===
def load_dataset(data_dir, max_samples=10000):
    """Load the first ``max_samples`` lines of the Indonesian/English corpus files.

    The files ``tldr-pages.en-id.id`` and ``tldr-pages.en-id.en`` are read from
    ``data_dir`` as UTF-8. Only the requested number of lines is read from each
    file rather than loading both files completely and slicing afterwards.

    Args:
        data_dir: Directory containing the two corpus files.
        max_samples: Maximum number of lines to take from each file. ``None``
            reads everything; a negative value keeps list-slice semantics
            (drops that many lines from the end).

    Returns:
        A tuple ``(indonesian_sentences, english_sentences)`` of stripped lines.
        No check is made that the two lists have the same length.
    """
    from itertools import islice

    def _read_head(path):
        """Return the stripped leading lines of ``path`` per ``max_samples``."""
        with open(path, "r", encoding="utf-8") as handle:
            if max_samples is not None and max_samples < 0:
                # A negative bound needs the full line list to slice from the end.
                selected = handle.readlines()[:max_samples]
            else:
                selected = islice(handle, max_samples)
            # Consume the iterator while the file is still open.
            return [line.strip() for line in selected]

    # File names are fixed relative to the data directory.
    id_file = os.path.join(data_dir, "tldr-pages.en-id.id")
    en_file = os.path.join(data_dir, "tldr-pages.en-id.en")

    return _read_head(id_file), _read_head(en_file)

# Memuat dataset
data_indonesia, data_english = load_dataset(output_dir, max_samples=10000)

In [None]:
# === 3. Preprocess Data ===
def preprocess_texts(texts):
    """Fit a Keras ``Tokenizer`` on ``texts`` and encode them as integer sequences.

    The tokenizer lower-cases its input and applies no character filtering
    (``filters=''``).

    Returns:
        A tuple ``(tokenizer, sequences)`` where ``sequences`` holds one list of
        token ids per input text.
    """
    text_tokenizer = Tokenizer(lower=True, filters='')
    text_tokenizer.fit_on_texts(texts)
    return text_tokenizer, text_tokenizer.texts_to_sequences(texts)

# Wrap every target sentence in explicit <start>/<end> markers before fitting
# the English tokenizer. The decoding loop of this notebook looks up "<start>"
# as the first decoder input and stops on "<end>"; without the markers neither
# token exists in the vocabulary. The tokenizer uses filters='' so the angle
# brackets are kept.
START_TOKEN, END_TOKEN = "<start>", "<end>"
data_english_marked = [f"{START_TOKEN} {sentence} {END_TOKEN}" for sentence in data_english]

tokenizer_ind, sequences_ind = preprocess_texts(data_indonesia)
tokenizer_eng, sequences_eng = preprocess_texts(data_english_marked)

# Padding sequences: pad each language to the length of its longest sequence.
max_len_ind = max(len(seq) for seq in sequences_ind)
max_len_eng = max(len(seq) for seq in sequences_eng)

padded_ind = pad_sequences(sequences_ind, maxlen=max_len_ind, padding="post")
padded_eng = pad_sequences(sequences_eng, maxlen=max_len_eng, padding="post")

# Train-test split (fixed seed so a rerun yields the same partition).
X_train, X_test, y_train, y_test = train_test_split(
    padded_ind, padded_eng, test_size=0.2, random_state=42
)

In [None]:
# === 4. Build Seq2Seq Model ===
embedding_dim = 256
units = 512
# +1 because Keras Tokenizer word indices start at 1; index 0 is the padding value.
vocab_size_ind = len(tokenizer_ind.word_index) + 1
vocab_size_eng = len(tokenizer_eng.word_index) + 1

# Encoder: embeds the source tokens and keeps only the final LSTM states.
# mask_zero=True marks padding (index 0) as masked, so trailing pad steps do not
# update the LSTM state that is handed to the decoder.
encoder_inputs = tf.keras.layers.Input(shape=(max_len_ind,))
encoder_embedding = tf.keras.layers.Embedding(
    vocab_size_ind, embedding_dim, mask_zero=True
)(encoder_inputs)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(units, return_state=True)(encoder_embedding)

# Decoder: starts from the encoder states and predicts one token per time step.
# Masking the padded decoder inputs keeps pad positions out of the loss and the
# reported accuracy, which padding would otherwise dominate.
decoder_inputs = tf.keras.layers.Input(shape=(max_len_eng,))
decoder_embedding = tf.keras.layers.Embedding(
    vocab_size_eng, embedding_dim, mask_zero=True
)(decoder_inputs)
decoder_lstm, _, _ = tf.keras.layers.LSTM(units, return_sequences=True, return_state=True)(
    decoder_embedding, initial_state=[state_h, state_c]
)
decoder_outputs = tf.keras.layers.Dense(vocab_size_eng, activation="softmax")(decoder_lstm)

# Build Model
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
# === 5. Train Model ===
# Teacher forcing: the decoder input is the target without its last column and
# the expected output is the target without its first column. Both are padded
# back to max_len_eng so they match the decoder input width.
y_train_input = pad_sequences(y_train[:, :-1], padding="post", maxlen=max_len_eng)
y_train_shifted = pad_sequences(y_train[:, 1:], padding="post", maxlen=max_len_eng)

model.fit(
    x=[X_train, y_train_input],
    y=y_train_shifted,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 4s/step - accuracy: 0.6729 - loss: 4.4640 - val_accuracy: 0.8517 - val_loss: 1.0405
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 2s/step - accuracy: 0.8538 - loss: 0.9628 - val_accuracy: 0.8574 - val_loss: 0.9773
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2s/step - accuracy: 0.8607 - loss: 0.8533 - val_accuracy: 0.8548 - val_loss: 0.9560
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2s/step - accuracy: 0.8563 - loss: 0.8492 - val_accuracy: 0.8553 - val_loss: 0.9461
Epoch 5/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2s/step - accuracy: 0.8598 - loss: 0.8116 - val_accuracy: 0.8597 - val_loss: 0.9379
Epoch 6/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 3s/step - accuracy: 0.8611 - loss: 0.7977 - val_accuracy: 0.8703 - val_loss: 0.9256
Epoch 7/10
[1m15/15[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7e9379362c50>

In [None]:
# Reuse the decoder layers that were trained as part of `model`, so the inference
# decoder shares their weights. Creating fresh Embedding/LSTM/Dense layers here
# would leave the inference decoder randomly initialised.
def _find_trained_decoder_layers(seq2seq_model, embedding_output):
    """Locate the trained decoder layers inside the seq2seq training model.

    Args:
        seq2seq_model: The trained encoder-decoder Keras model.
        embedding_output: Symbolic output tensor of the decoder embedding in the
            training graph; used to tell the decoder embedding apart from the
            encoder embedding.

    Returns:
        A tuple ``(embedding_layer, lstm_layer, dense_layer)``.

    Raises:
        RuntimeError: If any of the three layers cannot be identified.
    """
    keras_layers = tf.keras.layers
    embedding = next(
        (
            layer for layer in seq2seq_model.layers
            if isinstance(layer, keras_layers.Embedding) and layer.output is embedding_output
        ),
        None,
    )
    # Only the decoder LSTM is built with return_sequences=True.
    lstm = next(
        (
            layer for layer in seq2seq_model.layers
            if isinstance(layer, keras_layers.LSTM) and layer.return_sequences
        ),
        None,
    )
    dense = next(
        (layer for layer in seq2seq_model.layers if isinstance(layer, keras_layers.Dense)),
        None,
    )
    if embedding is None or lstm is None or dense is None:
        raise RuntimeError(
            "Could not locate the trained decoder layers in the seq2seq model; "
            "run the model-building and training cells first."
        )
    return embedding, lstm, dense


decoder_embedding_layer, decoder_lstm_layer, decoder_dense_layer = _find_trained_decoder_layers(
    model, decoder_embedding
)


In [None]:
# Inputs for the decoder at inference time: the previous LSTM hidden and cell
# states plus a single input token per step.
decoder_state_input_h = tf.keras.layers.Input(shape=(units,))
decoder_state_input_c = tf.keras.layers.Input(shape=(units,))
decoder_inputs_single = tf.keras.layers.Input(shape=(1,))

# Embed the decoder's input token.
decoder_embedded = decoder_embedding_layer(decoder_inputs_single)

# Run the decoder LSTM layer, seeded with the states supplied as inputs.
decoder_lstm_output, state_h2, state_c2 = decoder_lstm_layer(
    decoder_embedded, initial_state=[decoder_state_input_h, decoder_state_input_c]
)

# Apply the dense (softmax) layer to the LSTM output.
# NOTE: this rebinds `decoder_outputs`, the name also used for the training
# graph's output tensor in the model-building cell.
decoder_outputs = decoder_dense_layer(decoder_lstm_output)

# Build the decoder model for inference: it maps (token, h, c) to
# (token distribution, new h, new c).
decoder_model = tf.keras.models.Model(
    [decoder_inputs_single, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs, state_h2, state_c2]
)


In [None]:
# Cek model decoder
decoder_model.summary()

In [None]:
def _get_encoder_model():
    """Return the inference encoder, building it on first use.

    If the notebook namespace already holds an ``encoder_model`` it is returned
    unchanged. Otherwise the model is assembled from the training graph's encoder
    tensors (``encoder_inputs`` -> ``[encoder_outputs, state_h, state_c]``) and
    cached; the cache is rebuilt when ``encoder_inputs`` has been re-created.
    """
    existing = globals().get("encoder_model")
    if existing is not None:
        return existing
    cached = getattr(_get_encoder_model, "_cache", None)
    if cached is None or cached[0] is not encoder_inputs:
        built = tf.keras.models.Model(encoder_inputs, [encoder_outputs, state_h, state_c])
        cached = (encoder_inputs, built)
        _get_encoder_model._cache = cached
    return cached[1]


def translate(input_text):
    """Greedy-decode an English translation of ``input_text`` with the seq2seq model.

    The source text is tokenised and padded to ``max_len_ind`` and encoded once.
    The decoder is then run one token at a time, starting from ``<start>``, until
    it emits ``<end>`` or ``max_len_eng`` steps have been taken. Tokens that were
    already emitted have their probability halved to discourage repetition.

    NOTE: a later cell in this notebook defines another function named
    ``translate``; once that cell has run, it shadows this one.

    Args:
        input_text: Indonesian sentence to translate.

    Returns:
        The decoded words joined by single spaces.
    """
    import numpy as np  # numpy is not imported at notebook level

    input_sequence = tokenizer_ind.texts_to_sequences([input_text])
    input_sequence = pad_sequences(input_sequence, maxlen=max_len_ind, padding="post")

    # The encoder returns [outputs, h, c]; only the two states seed the decoder.
    _, dec_h, dec_c = _get_encoder_model().predict(input_sequence, verbose=0)

    # Falls back to index 1 when "<start>" is not in the vocabulary.
    start_token = tokenizer_eng.word_index.get("<start>", 1)
    target_sequence = np.array([[start_token]])

    decoded_sentence = []
    emitted_ids = set()

    # A bounded loop caps the output at max_len_eng words.
    for _ in range(max_len_eng):
        output_tokens, dec_h, dec_c = decoder_model.predict(
            [target_sequence, dec_h, dec_c], verbose=0
        )

        # Penalise tokens that were already produced by halving their probability.
        probabilities = np.array(output_tokens[0, -1, :], dtype=float)
        if emitted_ids:
            probabilities[list(emitted_ids)] /= 2.0

        sampled_token_index = int(np.argmax(probabilities))
        sampled_word = tokenizer_eng.index_word.get(sampled_token_index, "")

        if sampled_word == "<end>":
            break

        emitted_ids.add(sampled_token_index)
        # Index 0 (padding) has no word; skip it instead of emitting "".
        if sampled_word:
            decoded_sentence.append(sampled_word)

        target_sequence = np.array([[sampled_token_index]])

    return " ".join(decoded_sentence)


In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Load the pretrained model
model_name = "Helsinki-NLP/opus-mt-id-en"
marian_tokenizer = MarianTokenizer.from_pretrained(model_name)
marian_model = MarianMTModel.from_pretrained(model_name)
# Keep the original global names bound exactly as before. The translation
# function below uses the dedicated marian_* names, so it keeps working when
# `model` or `tokenizer` is rebound by another cell.
tokenizer = marian_tokenizer
model = marian_model

# Translate a sentence
def translate_with_pretrained(sentence):
    """Translate ``sentence`` from Indonesian to English with the MarianMT model.

    Args:
        sentence: Text to translate.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    inputs = marian_tokenizer(sentence, return_tensors="pt", padding=True)
    translated = marian_model.generate(**inputs)
    return marian_tokenizer.decode(translated[0], skip_special_tokens=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/796k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/291M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/291M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
# === 8. Test Translation ===
# Translate a short comma-separated list of words with the pretrained model.
input_word = "Pendidikan, makanan, minuman."
output_word = translate_with_pretrained(input_word)
print(output_word)

Education, food, water.


In [None]:
input_sentence = (
    "Pendidikan adalah kunci untuk membuka pintu masa depan yang cerah. "
    "Dengan pendidikan, seseorang dapat meningkatkan kualitas hidupnya dan memberikan kontribusi positif bagi masyarakat."
)
output_sentence = translate_with_pretrained(input_sentence)
print(output_sentence)

Education is the key to opening the door to a bright future, with education, one can improve the quality of life and contribute positively to society.


# Menggunakan Dataset Kaggle

In [None]:
pip install kaggle



In [None]:
!kaggle datasets download -d williammulianto/bilingual-dictionary-iden

Dataset URL: https://www.kaggle.com/datasets/williammulianto/bilingual-dictionary-iden
License(s): unknown
Downloading bilingual-dictionary-iden.zip to /content
  0% 0.00/136k [00:00<?, ?B/s]
100% 136k/136k [00:00<00:00, 101MB/s]


In [None]:
# -o overwrites existing files without asking, so rerunning this cell does not
# stall on unzip's interactive "replace?" prompt.
!unzip -o bilingual-dictionary-iden.zip

Archive:  bilingual-dictionary-iden.zip
  inflating: en-id.0-5000.txt        
  inflating: en-id.5000-6500.txt     
replace id-en.0-5000.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: id-en.0-5000.txt        
  inflating: id-en.5000-6500.txt     


In [None]:
# Drop empty sequences before padding.
# Source and target are filtered as pairs: filtering each list on its own would
# shift the remaining entries and break the source/target alignment whenever
# only one side of a pair is empty.
# NOTE(review): this cell reads sequences_ind / sequences_eng from an earlier
# preprocessing step — presumably the Kaggle preprocessing cell; confirm the
# intended execution order.
paired_sequences = [
    (seq_ind, seq_eng)
    for seq_ind, seq_eng in zip(sequences_ind, sequences_eng)
    if len(seq_ind) > 0 and len(seq_eng) > 0
]

# Validate that something is left.
if not paired_sequences:
    raise ValueError("Sequences kosong setelah preprocessing. Periksa dataset Anda.")

sequences_ind = [pair[0] for pair in paired_sequences]
sequences_eng = [pair[1] for pair in paired_sequences]

# Recalculate max lengths
max_len_ind = max(len(seq) for seq in sequences_ind)
max_len_eng = max(len(seq) for seq in sequences_eng)

# Padding sequences
padded_ind = pad_sequences(sequences_ind, maxlen=max_len_ind, padding="post")
padded_eng = pad_sequences(sequences_eng, maxlen=max_len_eng, padding="post")

# Report the padded shapes; both must have the same number of rows.
print(f"Padded input shape (Indonesia): {padded_ind.shape}")
print(f"Padded target shape (English): {padded_eng.shape}")


Padded input shape (Indonesia): (12128, 1)
Padded target shape (English): (12128, 1)


In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# === 1. Load Dataset ===
def load_multiple_datasets(filepaths):
    """Read tab-separated Indonesian/English pairs from several files.

    Each file is parsed as two string columns (``id``, ``en``) with no header.
    Default NA parsing is disabled so ordinary dictionary words such as "null",
    "nan" or "NA" are kept as text; only empty fields count as missing. Quote
    characters are treated as literal text so a stray ``"`` cannot swallow the
    following lines.

    Args:
        filepaths: Iterable of paths to tab-separated files.

    Returns:
        A tuple ``(sentences_id, sentences_en)`` of equally long string lists,
        in file order, with incomplete rows removed.
    """
    import csv

    sentences_id = []
    sentences_en = []
    for filepath in filepaths:
        # Read the dataset
        data = pd.read_csv(
            filepath,
            sep="\t",
            header=None,
            names=["id", "en"],
            dtype=str,
            keep_default_na=False,
            na_values=[""],
            quoting=csv.QUOTE_NONE,
        )
        data = data.dropna()  # drop rows in which either column is missing
        sentences_id.extend(data["id"].tolist())
        sentences_en.extend(data["en"].tolist())
    return sentences_id, sentences_en

# File paths
filepaths = ["id-en.0-5000.txt", "id-en.5000-6500.txt"]

# Load dataset
data_indonesia, data_english = load_multiple_datasets(filepaths)

# === 2. Preprocess Data ===
def preprocess_data(sentences):
    """Fit a Keras ``Tokenizer`` on ``sentences`` and encode them as id sequences.

    The tokenizer lower-cases its input and applies no character filtering
    (``filters=''``).

    Returns:
        A tuple ``(tokenizer, sequences)`` with one list of token ids per sentence.
    """
    fitted_tokenizer = Tokenizer(lower=True, filters='')
    fitted_tokenizer.fit_on_texts(sentences)
    return fitted_tokenizer, fitted_tokenizer.texts_to_sequences(sentences)

# Fit one tokenizer per language and encode the dictionary entries.
# These assignments replace any tokenizers/sequences of the same name created
# earlier in the notebook.
tokenizer_ind, sequences_ind = preprocess_data(data_indonesia)
tokenizer_eng, sequences_eng = preprocess_data(data_english)

# Padding sequences: pad each language to the length of its longest entry.
max_len_ind = max(len(seq) for seq in sequences_ind)
max_len_eng = max(len(seq) for seq in sequences_eng)

padded_ind = pad_sequences(sequences_ind, maxlen=max_len_ind, padding="post")
padded_eng = pad_sequences(sequences_eng, maxlen=max_len_eng, padding="post")

# Train-test split
# NOTE(review): no random_state is set, so the partition changes between runs;
# the training call in the following cell fits on padded_ind / padded_eng
# directly rather than on X_train / y_train.
X_train, X_test, y_train, y_test = train_test_split(padded_ind, padded_eng, test_size=0.2)

# === 3. Build Simple Embedding Model ===
embedding_dim = 128
# Vocabulary sizes are the number of known words plus one.
vocab_size_ind = len(tokenizer_ind.word_index) + 1
vocab_size_eng = len(tokenizer_eng.word_index) + 1

# Input
# NOTE(review): the input width is hard-coded to 1, so the padded inputs fed to
# this model must have exactly one column (max_len_ind == 1) — confirm for other data.
input_layer = tf.keras.layers.Input(shape=(1,))  # Input is a single token
embedding_layer = tf.keras.layers.Embedding(vocab_size_ind, embedding_dim)(input_layer)
flatten_layer = tf.keras.layers.Flatten()(embedding_layer)

# Output: softmax distribution over the English vocabulary.
output_layer = tf.keras.layers.Dense(vocab_size_eng, activation="softmax")(flatten_layer)

# Build Model
# NOTE: this rebinds the global name `model`.
model = tf.keras.models.Model(input_layer, output_layer)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
# === 4. Train Model ===
model.fit(
    padded_ind,
    padded_eng,
    batch_size=64,
    epochs=20,
    validation_split=0.2
)

# === 5. Translate Function ===
def translate(sentence):
    """Translate a single Indonesian word with the word-level embedding model.

    NOTE: this definition reuses the name ``translate`` of the seq2seq decoding
    function defined earlier in the notebook and shadows it once this cell runs.

    Args:
        sentence: Input text. It is tokenised and cut to one token
            (``maxlen=1``), so only a single token is translated.

    Returns:
        The predicted English word, or ``"<unknown>"`` when the input has no
        token in the Indonesian vocabulary or the predicted index has no word.
    """
    sequence = tokenizer_ind.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=1, padding="post")
    # An input without known tokens is padded to [[0]]. Predicting on the padding
    # index would return an arbitrary word, so report it as unknown instead, as
    # the word-by-word paragraph translator in this notebook does.
    if not padded_sequence.any():
        return "<unknown>"
    prediction = model.predict(padded_sequence, verbose=0)
    predicted_index = int(tf.argmax(prediction[0]).numpy())
    return tokenizer_eng.index_word.get(predicted_index, "<unknown>")

Epoch 1/20
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.0000e+00 - loss: 9.1930 - val_accuracy: 0.0000e+00 - val_loss: 9.2393
Epoch 2/20
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0413 - loss: 9.1226 - val_accuracy: 0.0000e+00 - val_loss: 9.3267
Epoch 3/20
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.2295 - loss: 9.0183 - val_accuracy: 0.0000e+00 - val_loss: 9.4116
Epoch 4/20
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.4013 - loss: 8.8321 - val_accuracy: 0.0000e+00 - val_loss: 9.4915
Epoch 5/20
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.4430 - loss: 8.5201 - val_accuracy: 0.0000e+00 - val_loss: 9.5620
Epoch 6/20
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4560 - loss: 8.0823 - val_accuracy: 0.0000e+00 - val_loss: 9.6214

In [None]:
def translate_paragraph_word_by_word(paragraph):
    """Translate a paragraph one whitespace-separated word at a time.

    Leading and trailing punctuation is stripped from each word before the
    vocabulary lookup, because the tokenizer was fitted without character
    filtering and "cerah." would otherwise never match "cerah". All known words
    are predicted in a single batched ``model.predict`` call instead of one call
    per word.

    Args:
        paragraph: Text to translate.

    Returns:
        The translated words joined by single spaces. Words that are not in the
        Indonesian vocabulary are rendered as ``"<unknown>"``.
    """
    import string

    import numpy as np

    words = paragraph.split()  # split the paragraph into words
    if not words:
        return ""

    cleaned_words = [word.strip(string.punctuation) for word in words]
    sequences = tokenizer_ind.texts_to_sequences(cleaned_words)
    # Keep the last token of each sequence (what pad_sequences(maxlen=1) keeps by
    # default); 0 marks a word with no known token.
    token_ids = [sequence[-1] if sequence else 0 for sequence in sequences]

    translated_words = ["<unknown>"] * len(words)
    known_positions = [position for position, token in enumerate(token_ids) if token != 0]

    if known_positions:
        batch = np.array([[token_ids[position]] for position in known_positions])
        predictions = model.predict(batch, verbose=0)
        predicted_indices = np.argmax(predictions, axis=-1)
        for position, predicted_index in zip(known_positions, predicted_indices):
            translated_words[position] = tokenizer_eng.index_word.get(
                int(predicted_index), "<unknown>"
            )

    # Join the translated words back together.
    return " ".join(translated_words)


In [None]:
# === 6. Test Translation for Paragraph ===
input_paragraph = (
    "Pendidikan adalah kunci untuk membuka pintu masa depan yang cerah. "
    "Dengan pendidikan, seseorang dapat meningkatkan kualitas hidupnya dan memberikan kontribusi positif bagi masyarakat. "
    "Selain itu, pendidikan juga membantu seseorang memahami dunia dengan lebih baik dan menghadapi tantangan hidup dengan bijaksana. "
    "Oleh karena itu, kita harus menghargai dan memanfaatkan setiap kesempatan untuk belajar."
)

output_paragraph = translate_paragraph_word_by_word(input_paragraph)
print(f"Terjemahan:\n{output_paragraph}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28