In [None]:
!pip install tensorflow tensorflow-text datasets nltk rouge pydot graphviz
import nltk
nltk.download('punkt')


Collecting tensorflow-text
  Downloading tensorflow_text-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting tensorflow
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m74.8

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Concatenate, Dropout, Bidirectional, MultiHeadAttention
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np
import os
import pickle
from google.colab import drive
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from nltk.translate import bleu_score

In [None]:
# Mount Google Drive so files under /content/drive persist across Colab sessions
drive.mount('/content/drive')
# Checkpoint location: the training loop saves the best-scoring model to this path
model_path = '/content/drive/My Drive/model_BiLSTM_word_reinforcement_V2.h5'
# NOTE(review): presumably intended for a pickled training history, but no visible
# cell reads or writes this path — confirm it is still needed
history_path = '/content/drive/My Drive/history_BiLSTM_word_reinforcement_V2.pkl'

Mounted at /content/drive


In [None]:
# Training split of the tldr corpus: each row pairs a description ('nl') with a command ('cmd')
dataset = load_dataset("neulab/tldr", split='train')

# Target side: every command is wrapped in explicit <start>/<end> delimiters
cmd_texts = [" ".join(("<start>", row['cmd'], "<end>")) for row in dataset]

# Source side: the natural-language descriptions, taken as-is
nl_texts = [row['nl'] for row in dataset]

# One vocabulary shared by both sides, so the tokenizer is fitted on their union
all_texts = nl_texts + cmd_texts

# No character filtering and whitespace-only splitting, so tokens such as
# "<start>" or "{{file}}" survive intact (text is still lower-cased)
tokenizer = Tokenizer(filters='', lower=True, split=' ')
tokenizer.fit_on_texts(all_texts)
vocab_size = 1 + len(tokenizer.word_index)  # id 0 is kept free for padding

# Integer-encode both sides with the shared vocabulary
nl_sequences, cmd_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (nl_texts, cmd_texts)
)

# A single length for both sides: the longest sequence found anywhere
max_length = max(map(len, nl_sequences + cmd_sequences))

# Right-pad everything with zeros up to that common length
nl_sequences_padded, cmd_sequences_padded = (
    pad_sequences(seqs, maxlen=max_length, padding='post')
    for seqs in (nl_sequences, cmd_sequences)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.76M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/525k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
# Build a right-shifted copy of the targets: column 0 stays 0 and column t holds
# target token t-1 (the usual teacher-forcing decoder input layout).
# NOTE(review): the training step defined later in this notebook does not read
# this array — it feeds an all-zero decoder input instead. Confirm whether this
# array is still needed.
decoder_input_data = np.zeros_like(cmd_sequences_padded)
decoder_input_data[:, 1:] = cmd_sequences_padded[:, :-1]

# RL Training Setup
# NOTE: `optimizer` is assigned again further down in this cell, after the model
# is built; that later instance is the one in scope when training starts.
optimizer = Adam(learning_rate=1e-3)
epochs = 200      # upper bound on training epochs
batch_size = 64   # examples per training step
def build_model(vocab_size, max_length, embedding_dim=256, lstm_units=256, num_heads=4):
    """Build and compile the BiLSTM encoder / LSTM decoder model with attention.

    Architecture, as wired below:
      * encoder: token embedding -> bidirectional LSTM returning the full output
        sequence plus the final forward/backward states;
      * decoder: token embedding -> LSTM of width ``2 * lstm_units`` initialised
        with the concatenated encoder states;
      * multi-head attention with decoder outputs as queries and encoder outputs
        as keys/values;
      * a time-distributed softmax over the vocabulary applied to the
        concatenation of decoder outputs and attention output.

    Args:
        vocab_size: Size of the shared vocabulary; used as the input dimension of
            both embeddings and as the number of output classes.
        max_length: Accepted for call-site compatibility but not used here; both
            inputs are declared with a variable time dimension (``shape=(None,)``).
        embedding_dim: Width of both embedding layers (default 256, as before).
        lstm_units: Units per encoder direction; the decoder uses twice this so
            its state matches the concatenated encoder states (default 256).
        num_heads: Number of attention heads (default 4, as before).

    Returns:
        The compiled Keras ``Model`` taking ``[encoder_inputs, decoder_inputs]``
        and producing per-timestep token probabilities. The model summary is
        printed as a side effect.
    """
    # The decoder state must be as wide as the forward+backward encoder states.
    decoder_units = 2 * lstm_units

    # Encoder
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='encoder_embedding')(encoder_inputs)
    encoder_lstm = Bidirectional(LSTM(lstm_units, return_sequences=True, return_state=True, name='encoder_lstm'), name='bidirectional_encoder')
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)

    # A bidirectional LSTM yields one (h, c) pair per direction; concatenate them
    # to form the decoder's initial state.
    state_h = Concatenate(name='encoder_state_h')([forward_h, backward_h])
    state_c = Concatenate(name='encoder_state_c')([forward_c, backward_c])

    # Decoder
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='decoder_embedding')(decoder_inputs)
    decoder_lstm = LSTM(decoder_units, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

    # Attention: every decoder step queries the whole encoder output sequence
    attention_layer = MultiHeadAttention(num_heads=num_heads, key_dim=decoder_units, name='multihead_attention')
    attention_output = attention_layer(query=decoder_outputs, key=encoder_outputs, value=encoder_outputs)

    # Concatenate the attention context with the decoder LSTM output
    decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_output])

    # Dense layer to predict each word probability (softmax, so outputs are
    # probabilities rather than logits)
    dense_output = TimeDistributed(Dense(vocab_size, activation='softmax'), name='output_dense')(decoder_concat_input)

    # Define the model
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=dense_output)

    # Compile so the saved model carries a loss/metric configuration; a custom
    # training loop that calls an optimizer directly does not use this optimizer.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.summary()  # Show model architecture

    return model

# Instantiate and compile the model (build_model also prints the summary)
model = build_model(vocab_size, max_length)
# Rebinds `optimizer` (it was already created earlier in this cell). This is the
# instance the training loop passes to rl_training_step; the 'adam' optimizer
# given to model.compile() inside build_model is a separate object.
optimizer = Adam(learning_rate=1e-3)
patience = 10  # Define patience for early stopping
best_bleu = 0   # Initialize the best BLEU score tracker




In [None]:
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from nltk.translate import bleu_score

from nltk.translate.bleu_score import SmoothingFunction

def compute_bleu(references, predictions):
    """Average smoothed sentence-level BLEU-4 over paired strings.

    Each reference and prediction is split on whitespace and scored with uniform
    1- to 4-gram weights and NLTK's ``method1`` smoothing. References and
    predictions are paired positionally; surplus items on either side are ignored.

    Args:
        references: Iterable of reference strings.
        predictions: Iterable of predicted strings.

    Returns:
        The mean BLEU score of the pairs, or ``0.0`` when there are no pairs.
    """
    pairs = list(zip(references, predictions))
    if not pairs:
        # np.mean([]) is nan (with a RuntimeWarning); a nan reward would then
        # propagate into anything derived from it, so report "no reward" instead.
        return 0.0

    smoothie = SmoothingFunction().method1  # Using smoothing technique method1
    bleu_scores = [
        sentence_bleu(
            [ref.split()],
            pred.split(),
            weights=(0.25, 0.25, 0.25, 0.25),
            smoothing_function=smoothie,
        )
        for ref, pred in pairs
    ]
    return np.mean(bleu_scores)


def rl_training_step(input_seqs, target_seqs, model, tokenizer, optimizer):
    """Run one optimisation step on a batch and report its BLEU score and loss.

    The weights are updated from the sparse categorical cross-entropy between
    ``target_seqs`` and the model's per-timestep probabilities. The BLEU score is
    computed from the arg-max tokens in plain Python/NumPy, so it carries no
    gradient: it is returned for monitoring and model selection only. (Adding
    ``-log(BLEU)`` to the differentiated loss, as an earlier version did, shifts
    the loss value by a constant and leaves every gradient unchanged, so that
    term is omitted here.)

    The decoder input is an all-zero array with the shape of ``input_seqs``; the
    model therefore never sees target tokens. Inference code must feed the
    decoder the same all-zero input for predictions to match training.

    Args:
        input_seqs: Integer array of padded encoder token ids for the batch.
        target_seqs: Integer array of padded target token ids for the batch.
        model: Model called as ``model([encoder_ids, decoder_ids], training=True)``
            and returning token probabilities (not logits).
        tokenizer: Tokenizer providing ``sequences_to_texts``.
        optimizer: Optimizer providing ``apply_gradients``.

    Returns:
        ``(reward, loss)``: the batch BLEU score and the cross-entropy loss, both
        as NumPy scalars.
    """
    decoder_inputs = np.zeros_like(input_seqs)
    loss_fn = SparseCategoricalCrossentropy(from_logits=False)

    # Only the differentiable part needs to be recorded on the tape.
    with tf.GradientTape() as tape:
        predictions = model([input_seqs, decoder_inputs], training=True)
        loss = loss_fn(target_seqs, predictions)

    # Greedy decode to text and score against the references (no gradient).
    predicted_sequences = tf.argmax(predictions, axis=-1)
    predicted_texts = tokenizer.sequences_to_texts(predicted_sequences.numpy())
    target_texts = tokenizer.sequences_to_texts(target_seqs)
    reward = tf.cast(compute_bleu(target_texts, predicted_texts), dtype=tf.float32)

    gradients = tape.gradient(loss, model.trainable_variables)
    grads_and_vars = [
        (grad, var)
        for grad, var in zip(gradients, model.trainable_variables)
        if grad is not None
    ]
    if len(grads_and_vars) < len(model.trainable_variables):
        # Some variable is disconnected from the loss. Report it, but still
        # update the rest and return the real metrics: returning placeholder
        # zeros would silently drag down the caller's epoch averages.
        print("No valid gradients. Check model and loss configuration.")
    if grads_and_vars:
        optimizer.apply_gradients(grads_and_vars)

    return reward.numpy(), loss.numpy()


# Training loop: shuffle each epoch, train batch by batch, keep the checkpoint
# with the best epoch-average BLEU, and stop after `patience` epochs without
# improvement.
wait = 0  # epochs since the last improvement; must exist before the first `wait += 1`
for epoch in range(epochs):
    indices = np.random.permutation(len(nl_sequences_padded))
    epoch_rewards = []
    epoch_losses = []
    for i in range(0, len(nl_sequences_padded), batch_size):
        batch_indices = indices[i:i + batch_size]
        reward, loss = rl_training_step(
            nl_sequences_padded[batch_indices],
            cmd_sequences_padded[batch_indices],
            model,
            tokenizer,
            optimizer
        )
        epoch_rewards.append(reward)
        epoch_losses.append(loss)

    avg_reward = np.mean(epoch_rewards)
    avg_loss = np.mean(epoch_losses)
    print(f"Epoch {epoch+1}: Average BLEU Reward = {avg_reward}, Loss = {avg_loss}")
    if avg_reward > best_bleu:
        # New best epoch: checkpoint it and reset the patience counter
        best_bleu = avg_reward
        model.save(model_path)
        print(f"New best model saved with BLEU score: {best_bleu}")
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping...")
            break




Epoch 1: Average BLEU Reward = 0.0552549734711647, Loss = 1.427756428718567
New best model saved with BLEU score: 0.0552549734711647




Epoch 2: Average BLEU Reward = 0.06440562754869461, Loss = 0.9528326392173767
New best model saved with BLEU score: 0.06440562754869461




Epoch 3: Average BLEU Reward = 0.06533413380384445, Loss = 0.8930621147155762
New best model saved with BLEU score: 0.06533413380384445




Epoch 4: Average BLEU Reward = 0.06564897298812866, Loss = 0.8492655754089355
New best model saved with BLEU score: 0.06564897298812866




Epoch 5: Average BLEU Reward = 0.06650618463754654, Loss = 0.8080022931098938
New best model saved with BLEU score: 0.06650618463754654




Epoch 6: Average BLEU Reward = 0.06823883205652237, Loss = 0.7547717690467834
New best model saved with BLEU score: 0.06823883205652237




Epoch 7: Average BLEU Reward = 0.07147785276174545, Loss = 0.7086248397827148
New best model saved with BLEU score: 0.07147785276174545




Epoch 8: Average BLEU Reward = 0.07547813653945923, Loss = 0.6655876040458679
New best model saved with BLEU score: 0.07547813653945923




Epoch 9: Average BLEU Reward = 0.08108708262443542, Loss = 0.6227487325668335
New best model saved with BLEU score: 0.08108708262443542




Epoch 10: Average BLEU Reward = 0.0889061912894249, Loss = 0.5720441341400146
New best model saved with BLEU score: 0.0889061912894249




Epoch 11: Average BLEU Reward = 0.10548181086778641, Loss = 0.5127752423286438
New best model saved with BLEU score: 0.10548181086778641




Epoch 12: Average BLEU Reward = 0.1255510300397873, Loss = 0.4538021981716156
New best model saved with BLEU score: 0.1255510300397873




Epoch 13: Average BLEU Reward = 0.15550807118415833, Loss = 0.39738327264785767
New best model saved with BLEU score: 0.15550807118415833




Epoch 14: Average BLEU Reward = 0.20904138684272766, Loss = 0.3415793478488922
New best model saved with BLEU score: 0.20904138684272766




Epoch 15: Average BLEU Reward = 0.2619902789592743, Loss = 0.29559794068336487
New best model saved with BLEU score: 0.2619902789592743




Epoch 16: Average BLEU Reward = 0.3258368968963623, Loss = 0.2578856348991394
New best model saved with BLEU score: 0.3258368968963623




Epoch 17: Average BLEU Reward = 0.3726440370082855, Loss = 0.2267271876335144
New best model saved with BLEU score: 0.3726440370082855




Epoch 18: Average BLEU Reward = 0.4276638627052307, Loss = 0.2013530284166336
New best model saved with BLEU score: 0.4276638627052307




Epoch 19: Average BLEU Reward = 0.4776475429534912, Loss = 0.18129490315914154
New best model saved with BLEU score: 0.4776475429534912




Epoch 20: Average BLEU Reward = 0.5183087587356567, Loss = 0.16399477422237396
New best model saved with BLEU score: 0.5183087587356567




Epoch 21: Average BLEU Reward = 0.5561036467552185, Loss = 0.14777132868766785
New best model saved with BLEU score: 0.5561036467552185




Epoch 22: Average BLEU Reward = 0.5875452160835266, Loss = 0.13590960204601288
New best model saved with BLEU score: 0.5875452160835266




Epoch 23: Average BLEU Reward = 0.6207478642463684, Loss = 0.12290127575397491
New best model saved with BLEU score: 0.6207478642463684




Epoch 24: Average BLEU Reward = 0.6560900807380676, Loss = 0.11188386380672455
New best model saved with BLEU score: 0.6560900807380676




Epoch 25: Average BLEU Reward = 0.6744707226753235, Loss = 0.10494065284729004
New best model saved with BLEU score: 0.6744707226753235
Epoch 26: Average BLEU Reward = 0.663984477519989, Loss = 0.10665435343980789




Epoch 27: Average BLEU Reward = 0.6864967942237854, Loss = 0.10112183541059494
New best model saved with BLEU score: 0.6864967942237854




Epoch 28: Average BLEU Reward = 0.7239920496940613, Loss = 0.09010479599237442
New best model saved with BLEU score: 0.7239920496940613
Epoch 29: Average BLEU Reward = 0.5428977608680725, Loss = 0.16626647114753723
Epoch 30: Average BLEU Reward = 0.6540541648864746, Loss = 0.10525581985712051




Epoch 31: Average BLEU Reward = 0.7322658896446228, Loss = 0.08418123424053192
New best model saved with BLEU score: 0.7322658896446228




Epoch 32: Average BLEU Reward = 0.7750787138938904, Loss = 0.07287289947271347
New best model saved with BLEU score: 0.7750787138938904




Epoch 33: Average BLEU Reward = 0.7972219586372375, Loss = 0.06722596287727356
New best model saved with BLEU score: 0.7972219586372375




Epoch 34: Average BLEU Reward = 0.8155980110168457, Loss = 0.0625862181186676
New best model saved with BLEU score: 0.8155980110168457




Epoch 35: Average BLEU Reward = 0.8302013874053955, Loss = 0.05804962292313576
New best model saved with BLEU score: 0.8302013874053955




Epoch 36: Average BLEU Reward = 0.8391112685203552, Loss = 0.055187858641147614
New best model saved with BLEU score: 0.8391112685203552




Epoch 37: Average BLEU Reward = 0.8443979024887085, Loss = 0.05343588814139366
New best model saved with BLEU score: 0.8443979024887085




Epoch 38: Average BLEU Reward = 0.8485697507858276, Loss = 0.051236122846603394
New best model saved with BLEU score: 0.8485697507858276
Epoch 39: Average BLEU Reward = 0.8472580313682556, Loss = 0.05286308377981186
Epoch 40: Average BLEU Reward = 0.8479263186454773, Loss = 0.05136576294898987




Epoch 41: Average BLEU Reward = 0.8566272854804993, Loss = 0.04817615821957588
New best model saved with BLEU score: 0.8566272854804993




Epoch 42: Average BLEU Reward = 0.8636630177497864, Loss = 0.04572529345750809
New best model saved with BLEU score: 0.8636630177497864
Epoch 43: Average BLEU Reward = 0.8456494212150574, Loss = 0.04879550263285637
Epoch 44: Average BLEU Reward = 0.8394469618797302, Loss = 0.051158588379621506
Epoch 45: Average BLEU Reward = 0.8479113578796387, Loss = 0.048498280346393585
Epoch 46: Average BLEU Reward = 0.8599933385848999, Loss = 0.045452333986759186




Epoch 47: Average BLEU Reward = 0.8739767074584961, Loss = 0.041716400533914566
New best model saved with BLEU score: 0.8739767074584961




Epoch 48: Average BLEU Reward = 0.8785683512687683, Loss = 0.04074179753661156
New best model saved with BLEU score: 0.8785683512687683
Epoch 49: Average BLEU Reward = 0.8785335421562195, Loss = 0.04009300470352173




Epoch 50: Average BLEU Reward = 0.8804525136947632, Loss = 0.04137453809380531
New best model saved with BLEU score: 0.8804525136947632
Epoch 51: Average BLEU Reward = 0.875637412071228, Loss = 0.04088438302278519
Epoch 52: Average BLEU Reward = 0.8716857433319092, Loss = 0.04116284102201462
Epoch 53: Average BLEU Reward = 0.8771958947181702, Loss = 0.040746986865997314
Epoch 54: Average BLEU Reward = 0.8750103712081909, Loss = 0.0400402806699276
Epoch 55: Average BLEU Reward = 0.8758222460746765, Loss = 0.039519235491752625
Epoch 56: Average BLEU Reward = 0.8712751269340515, Loss = 0.040087681263685226
Epoch 57: Average BLEU Reward = 0.8690462112426758, Loss = 0.04060320928692818
Epoch 58: Average BLEU Reward = 0.8779487013816833, Loss = 0.03786556422710419




Epoch 59: Average BLEU Reward = 0.8828919529914856, Loss = 0.03746591880917549
New best model saved with BLEU score: 0.8828919529914856




Epoch 60: Average BLEU Reward = 0.8886860609054565, Loss = 0.0359245240688324
New best model saved with BLEU score: 0.8886860609054565




Epoch 61: Average BLEU Reward = 0.8948726058006287, Loss = 0.03553905710577965
New best model saved with BLEU score: 0.8948726058006287




Epoch 62: Average BLEU Reward = 0.8997297286987305, Loss = 0.032974276691675186
New best model saved with BLEU score: 0.8997297286987305




Epoch 63: Average BLEU Reward = 0.9033153057098389, Loss = 0.03170562908053398
New best model saved with BLEU score: 0.9033153057098389
Epoch 64: Average BLEU Reward = 0.8821078538894653, Loss = 0.037381112575531006
Epoch 65: Average BLEU Reward = 0.8750813603401184, Loss = 0.0380539633333683
Epoch 66: Average BLEU Reward = 0.8895054459571838, Loss = 0.035750482231378555
Epoch 67: Average BLEU Reward = 0.890917956829071, Loss = 0.034358132630586624
Epoch 68: Average BLEU Reward = 0.8904756307601929, Loss = 0.03423405811190605
Epoch 69: Average BLEU Reward = 0.9008812308311462, Loss = 0.032118551433086395
Epoch 70: Average BLEU Reward = 0.8982999920845032, Loss = 0.03218657523393631




Epoch 71: Average BLEU Reward = 0.904762327671051, Loss = 0.030205441638827324
New best model saved with BLEU score: 0.904762327671051




Epoch 72: Average BLEU Reward = 0.9104154109954834, Loss = 0.02951730042695999
New best model saved with BLEU score: 0.9104154109954834
Epoch 73: Average BLEU Reward = 0.9102648496627808, Loss = 0.03051701933145523




Epoch 74: Average BLEU Reward = 0.9109330773353577, Loss = 0.02981886826455593
New best model saved with BLEU score: 0.9109330773353577


Exception ignored in: <function _xla_gc_callback at 0x7b61d0eb0ee0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 98, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
# Load the model checkpoint from Drive. This rebinds the global `model`, so the
# cells below evaluate the saved checkpoint rather than the in-memory network.
# The path literal is the same one the training loop saves to.
model_path = '/content/drive/My Drive/model_BiLSTM_word_reinforcement_V2.h5'
model = load_model(model_path)
def translate(model, tokenizer, text, max_length):
    """Predict the command for a single natural-language description.

    The text is encoded and right-padded to ``max_length``, the model is run
    once, and the arg-max token at every position is mapped back to text.

    Args:
        model: Model accepting ``[encoder_ids, decoder_ids]`` and returning
            per-timestep token probabilities.
        tokenizer: The tokenizer used to prepare the training data.
        text: The natural-language description to translate.
        max_length: Length the encoded input is padded (or truncated) to.

    Returns:
        The predicted command as a space-joined string with the ``<start>`` and
        ``<end>`` marker tokens removed.
    """
    # Tokenize and pad the input text
    sequence = tokenizer.texts_to_sequences([text])
    sequence_padded = pad_sequences(sequence, maxlen=max_length, padding='post')

    # The decoder input must mirror training: rl_training_step feeds the decoder
    # an all-zero array shaped like the encoder input. Feeding anything else
    # (e.g. a shifted copy of the encoder tokens) puts the decoder in a regime
    # it never saw during training.
    decoder_input_data = np.zeros_like(sequence_padded)

    # Predict; verbose=0 avoids one progress bar per translated sentence
    prediction = model.predict([sequence_padded, decoder_input_data], verbose=0)

    # Greedy choice per position, then ids -> text
    predicted_sequence = np.argmax(prediction[0], axis=-1)
    predicted_text = tokenizer.sequences_to_texts([predicted_sequence])[0]

    # Remove the <start> and <end> tokens and join the rest back together
    predicted_tokens = [
        token for token in predicted_text.split()
        if token not in ('<start>', '<end>')
    ]
    return ' '.join(predicted_tokens)



In [None]:
# Take the first 100 training pairs (fewer if the lists are shorter).
# The `first_10_*` names are historical and kept so existing references still work.
first_10_nl_texts = nl_texts[:100]
first_10_cmd_texts = cmd_texts[:100]

# Translate each text and print the input, prediction, and actual command.
# zip() stops at the end of the slices, so a short list cannot raise IndexError.
for input_text, actual_command in zip(first_10_nl_texts, first_10_cmd_texts):
    predicted_command = translate(model, tokenizer, input_text, max_length)

    print(f"Input Text: {input_text}")
    print(f"Predicted Command: {predicted_command}")
    print(f"Actual Command: {actual_command}")
    print("\n---\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Input Text: <start> get the label of a fat32 partition <end>
Predicted Command: decode members generate unused logs
Actual Command: <start> fatlabel {{/dev/sda1}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
Input Text: <start> set the label of a fat32 partition <end>
Predicted Command: ascii framework generate {{file.pdf}} logs {{foo}}
Actual Command: <start> fatlabel {{/dev/sdc3}} "{{new_label}}" <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
Input Text: <start> search for a package in your current sources <end>
Predicted Command: "{{title}}" docker file,
Actual Command: <start> apt-cache search {{query}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step
Input Text: <start> show information about a package <end>
Predicted Command: sudo default
Actual Command: <start> apt-cache show {{package}} <end>

---

In [None]:
# Load the validation split
dataset_val = load_dataset("neulab/tldr", split='validation')

# Wrap the validation commands in <start>/<end>, as was done for training
cmd_text_val = ["<start> " + item['cmd'] + " <end>" for item in dataset_val]

# Natural-language descriptions of the validation split
nl_texts_val = [item['nl'] for item in dataset_val]

# First 100 validation pairs. These must come from the validation lists built
# above; slicing the training lists here would report training-set results.
first_100_nl_texts_val = nl_texts_val[:100]
first_100_cmd_texts_val = cmd_text_val[:100]

# Translate each text and print the input, prediction, and actual command
for input_text, actual_command in zip(first_100_nl_texts_val, first_100_cmd_texts_val):
    predicted_command = translate(model, tokenizer, input_text, max_length)

    print(f"Input Text: {input_text}")
    print(f"Predicted Command: {predicted_command}")
    print(f"Actual Command: {actual_command}")
    print("\n---\n")



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
Input Text: <start> get the label of a fat32 partition <end>
Predicted Command: decode members generate unused logs
Actual Command: <start> fatlabel {{/dev/sda1}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
Input Text: <start> set the label of a fat32 partition <end>
Predicted Command: ascii framework generate {{file.pdf}} logs {{foo}}
Actual Command: <start> fatlabel {{/dev/sdc3}} "{{new_label}}" <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Input Text: <start> search for a package in your current sources <end>
Predicted Command: "{{title}}" docker file,
Actual Command: <start> apt-cache search {{query}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
Input Text: <start> show information about a package <end>
Predicted Command: sudo default
Actual Command: <start> apt-cache show {{package}} <end>

---


In [None]:
# Load the test split
dataset_test = load_dataset("neulab/tldr", split='test')

# Wrap the test commands in <start>/<end>, as was done for training
cmd_text_test = ["<start> " + item['cmd'] + " <end>" for item in dataset_test]

# Natural-language descriptions of the test split
nl_texts_test = [item['nl'] for item in dataset_test]

# First 100 test pairs. These must come from the test lists built above, not
# from the validation or training data.
first_100_nl_texts_test = nl_texts_test[:100]
first_100_cmd_texts_test = cmd_text_test[:100]

# Translate each text and print the input, prediction, and actual command
for input_text, actual_command in zip(first_100_nl_texts_test, first_100_cmd_texts_test):
    predicted_command = translate(model, tokenizer, input_text, max_length)

    print(f"Input Text: {input_text}")
    print(f"Predicted Command: {predicted_command}")
    print(f"Actual Command: {actual_command}")
    print("\n---\n")



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Input Text: <start> get the label of a fat32 partition <end>
Predicted Command: decode members generate unused logs
Actual Command: <start> fatlabel {{/dev/sda1}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
Input Text: <start> set the label of a fat32 partition <end>
Predicted Command: ascii framework generate {{file.pdf}} logs {{foo}}
Actual Command: <start> fatlabel {{/dev/sdc3}} "{{new_label}}" <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
Input Text: <start> search for a package in your current sources <end>
Predicted Command: "{{title}}" docker file,
Actual Command: <start> apt-cache search {{query}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
Input Text: <start> show information about a package <end>
Predicted Command: sudo default
Actual Command: <start> apt-cache show {{package}} <end>

---
