In [1]:
!pip install tensorflow tensorflow-text datasets nltk rouge pydot graphviz
import nltk
nltk.download('punkt')


Collecting tensorflow-text
  Downloading tensorflow_text-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting tensorflow
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m49.9

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Concatenate, Dropout, Bidirectional, MultiHeadAttention
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np
import os
import pickle
from google.colab import drive
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from nltk.translate import bleu_score

In [3]:
# Mount Google Drive
drive.mount('/content/drive')
model_path = '/content/drive/My Drive/model_BiLSTM_word_reinforcement.h5'
history_path = '/content/drive/My Drive/history_BiLSTM_word_reinforcement.pkl'

Mounted at /content/drive


In [4]:
# Load dataset
dataset = load_dataset("neulab/tldr", split='train')

# Append <start> and <end> tokens to cmd_texts
cmd_texts = ["<start> " + item['cmd'] + " <end>" for item in dataset]

# Combine nl_texts and cmd_texts for tokenizer fitting
nl_texts = [item['nl'] for item in dataset]
all_texts = nl_texts + cmd_texts

# Initialize and fit tokenizer
tokenizer = Tokenizer(filters='', lower=True, split=' ')
tokenizer.fit_on_texts(all_texts)
vocab_size = len(tokenizer.word_index) + 1  # +1 for zero padding

# Prepare sequences
nl_sequences = tokenizer.texts_to_sequences(nl_texts)
cmd_sequences = tokenizer.texts_to_sequences(cmd_texts)

# Determine maximum sequence length
max_length = max(max(len(seq) for seq in nl_sequences), max(len(seq) for seq in cmd_sequences))

# Pad sequences
nl_sequences_padded = pad_sequences(nl_sequences, maxlen=max_length, padding='post')
cmd_sequences_padded = pad_sequences(cmd_sequences, maxlen=max_length, padding='post')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.76M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/525k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [20]:
# Prepare decoder input data and target data
decoder_input_data = np.zeros_like(cmd_sequences_padded)
decoder_input_data[:, 1:] = cmd_sequences_padded[:, :-1]

# RL Training Setup
optimizer = Adam(learning_rate=1e-3)
epochs = 200
batch_size = 64
def build_model(vocab_size, max_length):
    # Define the dimensionality of the embedding and LSTM units
    embedding_dim = 256
    lstm_units = 256

    # Encoder
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='encoder_embedding')(encoder_inputs)
    encoder_lstm = Bidirectional(LSTM(lstm_units, return_sequences=True, return_state=True, name='encoder_lstm'), name='bidirectional_encoder')
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)

    # Since we are using a bidirectional LSTM, we need to concatenate the final states
    state_h = Concatenate(name='encoder_state_h')([forward_h, backward_h])
    state_c = Concatenate(name='encoder_state_c')([forward_c, backward_c])

    # Decoder
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='decoder_embedding')(decoder_inputs)
    decoder_lstm = LSTM(2 * lstm_units, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

    # Attention
    attention_layer = MultiHeadAttention(num_heads=4, key_dim=2 * lstm_units, name='multihead_attention')
    attention_output = attention_layer(query=decoder_outputs, key=encoder_outputs, value=encoder_outputs)

    # Concatenate attention input and decoder LSTM output
    decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_output])

    # Dense layer to predict each word probability
    dense_output = TimeDistributed(Dense(vocab_size, activation='softmax'), name='output_dense')(decoder_concat_input)

    # Define the model
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=dense_output)

    # Compile the model with a suitable optimizer, loss, and metric
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.summary()  # Show model architecture

    return model



In [23]:
# Load or create model
if os.path.exists(model_path):
    print("Loading existing model...")
    model = load_model(model_path)
else:
    print("Creating new model...")
    # [Insert model architecture here, similar to what you've defined]
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

# Define the RL training function
import tensorflow as tf
from nltk.translate import bleu_score

def rl_training_step(input_seqs, target_seqs, model, tokenizer, optimizer):
    with tf.GradientTape() as tape:
        predictions = model([input_seqs, np.zeros_like(input_seqs)], training=True)
        loss = tf.keras.losses.sparse_categorical_crossentropy(target_seqs, predictions, from_logits=False)
        avg_loss = tf.reduce_mean(loss)  # Average over batch and sequence

    grads = tape.gradient(avg_loss, model.trainable_variables)
    if not grads or any(g is None for g in grads):
        print("No gradients were calculated. Check the loss computation.")
        return None, None  # Return None for both rewards and loss

    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return tf.reduce_mean(loss).numpy(), avg_loss.numpy()  # Dummy reward, actual loss


# Create and compile model as before
model = build_model(vocab_size, max_length)
optimizer = Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


Creating new model...


In [24]:
# Training loop with early stopping and checkpointing
best_bleu = 0
patience = 5
wait = 0
for epoch in range(epochs):
    indices = np.random.permutation(len(nl_sequences_padded))
    t_rewards = []
    t_losses = []
    for i in range(0, len(nl_sequences_padded), batch_size):
        batch_indices = indices[i:i + batch_size]
        rewards, loss = rl_training_step(
            nl_sequences_padded[batch_indices],
            cmd_sequences_padded[batch_indices],
            model,
            tokenizer,
            optimizer
        )
        if rewards is not None and loss is not None:
            t_rewards.append(rewards)
            t_losses.append(loss)
        else:
            print("Skipping batch due to gradient issues.")

    if t_losses:  # Check if there were any successful batches
        avg_reward = np.mean(t_rewards)
        avg_loss = np.mean(t_losses)
        print(f"Epoch {epoch+1}: Average Reward = {avg_reward}, Loss = {avg_loss}")
        if avg_reward > best_bleu:
            best_bleu = avg_reward
            model.save(model_path)
            print(f"New best model saved with BLEU score: {best_bleu}")
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                print("Early stopping...")
                break



Epoch 1: Average Reward = 1.4233031272888184, Loss = 1.4233031272888184
New best model saved with BLEU score: 1.4233031272888184
Epoch 2: Average Reward = 0.944885790348053, Loss = 0.944885790348053
Epoch 3: Average Reward = 0.887574315071106, Loss = 0.887574315071106
Epoch 4: Average Reward = 0.8452404737472534, Loss = 0.8452404737472534
Epoch 5: Average Reward = 0.8118176460266113, Loss = 0.8118176460266113
Epoch 6: Average Reward = 0.757774293422699, Loss = 0.757774293422699
Early stopping...


In [25]:
def translate(model, tokenizer, text, max_length):
    # Tokenize the input text
    sequence = tokenizer.texts_to_sequences([text])
    sequence_padded = pad_sequences(sequence, maxlen=max_length, padding='post')

    # Prepare the decoder input data
    decoder_input_data = np.zeros_like(sequence_padded)
    decoder_input_data[:, 1:] = sequence_padded[:, :-1]

    # Predict
    prediction = model.predict([sequence_padded, decoder_input_data])

    # Convert prediction to text
    predicted_sequence = np.argmax(prediction[0], axis=-1)
    predicted_text = tokenizer.sequences_to_texts([predicted_sequence])

    # Remove the <start> and <end> tokens
    predicted_text_clean = [token for token in predicted_text[0].split() if token not in ['<start>', '<end>']]

    # Join the tokens back together
    predicted_command = ' '.join(predicted_text_clean)

    return predicted_command

In [28]:
# Get the first 10 rows of the training dataset
first_10_nl_texts = nl_texts[:100]
first_10_cmd_texts = cmd_texts[:100]

# Translate each text and print the input, prediction, and actual command
for i in range(100):
    input_text = first_10_nl_texts[i]
    actual_command = first_10_cmd_texts[i]
    predicted_command = translate(model, tokenizer, input_text, max_length)

    print(f"Input Text: {input_text}")
    print(f"Predicted Command: {predicted_command}")
    print(f"Actual Command: {actual_command}")
    print("\n---\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
Input Text: get the label of a fat32 partition
Predicted Command: lrzip
Actual Command: <start> fatlabel {{/dev/sda1}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
Input Text: set the label of a fat32 partition
Predicted Command: aria2c -o
Actual Command: <start> fatlabel {{/dev/sdc3}} "{{new_label}}" <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
Input Text: search for a package in your current sources
Predicted Command: git status
Actual Command: <start> apt-cache search {{query}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Input Text: show information about a package
Predicted Command: git status
Actual Command: <start> apt-cache show {{package}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
Input Text: show whether a package is installed and up to date
Predict