In [1]:
!pip install tensorflow-text
!pip install datasets gensim rouge pydot graphviz

import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Concatenate, Dropout, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np
import os
from google.colab import drive
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

Collecting tensorflow-text
  Downloading tensorflow_text-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow<2.17,>=2.16.1 (from tensorflow-text)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow<2.17,>=2.16.1->tensorflow-text)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m89.6 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.3.1 (from tensorflow<2.17,>=2.16.1->tensorflow-text)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)

In [6]:

# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
model_path = '/content/drive/My Drive/model_BiLSTM_word_without_attention.h5.keras'
history_path = '/content/drive/My Drive/history_BiLSTM_word_without_attention.pkl'

In [3]:
# Load dataset
dataset = load_dataset("neulab/tldr", split='train')

# Append <start> and <end> tokens to cmd_texts
cmd_texts = ["<start> " + item['cmd'] + " <end>" for item in dataset]

# Combine nl_texts and cmd_texts for tokenizer fitting
nl_texts = [item['nl'] for item in dataset]
all_texts = nl_texts + cmd_texts

# Initialize and fit tokenizer
tokenizer = Tokenizer(filters='', lower=True, split=' ')
tokenizer.fit_on_texts(all_texts)
vocab_size = len(tokenizer.word_index) + 1  # +1 for zero padding

# Prepare sequences
nl_sequences = tokenizer.texts_to_sequences(nl_texts)
cmd_sequences = tokenizer.texts_to_sequences(cmd_texts)

# Determine maximum sequence length
max_length = max(max(len(seq) for seq in nl_sequences), max(len(seq) for seq in cmd_sequences))

# Pad sequences
nl_sequences_padded = pad_sequences(nl_sequences, maxlen=max_length, padding='post')
cmd_sequences_padded = pad_sequences(cmd_sequences, maxlen=max_length, padding='post')

# Prepare decoder input data and target data
decoder_input_data = np.zeros_like(cmd_sequences_padded)
decoder_input_data[:, 1:] = cmd_sequences_padded[:, :-1]
decoder_target_data = to_categorical(cmd_sequences_padded, num_classes=vocab_size)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.76M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/525k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [11]:
from tensorflow.keras.initializers import Orthogonal
from tensorflow.keras.models import load_model
import pickle
# Check if model exists and load or create new
if os.path.exists(model_path):
    print("Loading existing model...")
    model = load_model(model_path)
    # with open(history_path, 'rb') as f:
    #     history = pickle.load(f)
else:
    print("Creating new model...")
    embedding_dim = 128
    units = 128

    encoder_inputs = Input(shape=(None,))
    encoder_emb = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
    encoder_lstm = Bidirectional(LSTM(units, return_sequences=True, return_state=True))
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_emb)
    encoder_states = [Concatenate()([forward_h, backward_h]), Concatenate()([forward_c, backward_c])]

    decoder_inputs = Input(shape=(None,))
    decoder_emb = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
    decoder_lstm = LSTM(units * 2, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)

    decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    checkpoint = ModelCheckpoint(model_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)


    # Now, fit the model
    history = model.fit(
        [nl_sequences_padded, decoder_input_data],  # Encoder and Decoder inputs
        decoder_target_data,                        # Target outputs
        batch_size=64,
        epochs=100,
        validation_split=0.2,
        callbacks=[checkpoint, early_stopping]
    )

    history = history.history

    # Save the model and history
    model.save(model_path)
    with open(history_path, 'wb') as f:
        pickle.dump(history, f)

Loading existing model...


In [12]:
# Calculate BLEU, ROUGE, and perplexity for validation data
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import numpy as np

def evaluate_model(model, tokenizer, nl_sequences_padded, cmd_sequences_padded, max_length, vocab_size, history):
    # Prepare the input and output for validation
    val_indices = int(len(nl_sequences_padded) * 0.8)  # Assuming validation_split=0.2
    val_nl_padded = nl_sequences_padded[val_indices:]
    val_cmd_padded = cmd_sequences_padded[val_indices:]

    # Check if cmd_sequences_padded is one-hot encoded, convert if necessary
    if val_cmd_padded.ndim == 3:
        val_cmd_padded = np.argmax(val_cmd_padded, axis=-1)

    # Initialize val_decoder_input_data correctly using explicit shape and data type
    val_decoder_input_data = np.zeros((val_cmd_padded.shape[0], val_cmd_padded.shape[1]), dtype=int)
    val_decoder_input_data[:, 1:] = val_cmd_padded[:, :-1]  # Shift cmd data for decoder input

    # Ensure it's 2D and print the shape
    print(f"Validation NL input shape: {val_nl_padded.shape}")
    print(f"Validation CMD input shape: {val_decoder_input_data.shape}")
    if val_decoder_input_data.ndim != 2:
        raise ValueError("Decoder input data should be 2D. Check data preparation steps.")

    # Predict the command sequences
    predictions = model.predict([val_nl_padded, val_decoder_input_data])

    # Convert predictions to text
    predicted_texts = []
    for prediction in predictions:
        sequence = np.argmax(prediction, axis=-1)
        text = tokenizer.sequences_to_texts([sequence])
        predicted_texts.append(text[0])

    # Convert actual command sequences to text
    actual_texts = tokenizer.sequences_to_texts(val_cmd_padded)

    # Compute BLEU and ROUGE scores
    bleu_scores = [sentence_bleu([act.split()], pred.split()) for act, pred in zip(actual_texts, predicted_texts)]
    rouge = Rouge()
    rouge_scores = [rouge.get_scores(pred, act)[0] for pred, act in zip(predicted_texts, actual_texts)]

    # Calculate mean BLEU and ROUGE scores
    mean_bleu = np.mean(bleu_scores)
    rouge_f = np.mean([score['rouge-l']['f'] for score in rouge_scores])

    # Calculate perplexity on validation loss
    val_loss = history['val_loss'][-1]
    perplexity = np.exp(val_loss)

    print(f"Average BLEU Score on Validation: {mean_bleu}")
    print(f"Average ROUGE-L F-Score on Validation: {rouge_f}")
    print(f"Perplexity on Validation: {perplexity}")

# Ensure history is correctly passed to the function after the model has been trained
evaluate_model(model, tokenizer, nl_sequences_padded, cmd_sequences_padded, max_length, vocab_size, history)


Validation NL input shape: (1283, 34)
Validation CMD input shape: (1283, 34)
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 128ms/step


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score on Validation: 2.2714398556136707e-156
Average ROUGE-L F-Score on Validation: 0.4556140490831066
Perplexity on Validation: 3.0343414368756076


In [13]:
def translate(model, tokenizer, text, max_length):
    # Tokenize the input text
    sequence = tokenizer.texts_to_sequences([text])
    sequence_padded = pad_sequences(sequence, maxlen=max_length, padding='post')

    # Prepare the decoder input data
    decoder_input_data = np.zeros_like(sequence_padded)
    decoder_input_data[:, 1:] = sequence_padded[:, :-1]

    # Predict
    prediction = model.predict([sequence_padded, decoder_input_data])

    # Convert prediction to text
    predicted_sequence = np.argmax(prediction[0], axis=-1)
    predicted_text = tokenizer.sequences_to_texts([predicted_sequence])

    # Remove the <start> and <end> tokens
    predicted_text_clean = [token for token in predicted_text[0].split() if token not in ['<start>', '<end>']]

    # Join the tokens back together
    predicted_command = ' '.join(predicted_text_clean)

    return predicted_command

In [14]:
# Get the first 10 rows of the training dataset
first_10_nl_texts = nl_texts[:100]
first_10_cmd_texts = cmd_texts[:100]

# Translate each text and print the input, prediction, and actual command
for i in range(100):
    input_text = first_10_nl_texts[i]
    actual_command = first_10_cmd_texts[i]
    predicted_command = translate(model, tokenizer, input_text, max_length)

    print(f"Input Text: {input_text}")
    print(f"Predicted Command: {predicted_command}")
    print(f"Actual Command: {actual_command}")
    print("\n---\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 783ms/step
Input Text: get the label of a fat32 partition
Predicted Command: sudo
Actual Command: <start> fatlabel {{/dev/sda1}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Input Text: set the label of a fat32 partition
Predicted Command: sudo
Actual Command: <start> fatlabel {{/dev/sdc3}} "{{new_label}}" <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Input Text: search for a package in your current sources
Predicted Command: sudo
Actual Command: <start> apt-cache search {{query}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Input Text: show information about a package
Predicted Command: sudo
Actual Command: <start> apt-cache show {{package}} <end>

---

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Input Text: show whether a package is installed and up to date
Predicted Command: sudo
