# Loading the Dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# used to create dataset from uploaded json file
import pandas as pd
import json

def create_dataset(json_path, csv_path):
    # Load data from JSON file
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Create DataFrame
    df = pd.DataFrame(data)

    # Write DataFrame to CSV
    df.to_csv(csv_path, index=False)

    return df

# json_path = 'data.json'
# csv_path = 'train_data.csv'
# data = create_dataset(json_path, csv_path)

file_path = '/content/drive/MyDrive/kubesense/dataset/updted_data_with_class.json'
data = create_dataset(file_path, 'data.csv')

In [None]:
data.head()

["Please create a namespace named 'production'.", "Could you set up a namespace called 'testing'?", "Let's create a namespace for our development environment. Name it 'dev'.", "I need a namespace for our frontend services. Let's call it 'frontend'.", "We require a namespace named 'backend' for our backend services. Can you create it?", "Create a namespace named 'staging' for our staging environment.", "Let's organize our monitoring resources into a namespace named 'monitoring'.", "Could you create a namespace named 'logging' for our logging infrastructure?", "We need a namespace called 'analytics' for our analytics services. Please set it up.", "For our CI/CD pipelines, let's have a namespace named 'ci-cd'.", "Create a namespace named 'tools' to host our various utility services.", "Please establish a namespace called 'security' for our security-related components.", "Can you create a namespace named 'data' for our data-related services?", "Let's create a namespace for our machine lear

In [None]:
data

#Pre-processing the dataset and Creating tensors

In [None]:
# from transformers import BertTokenizer
# import torch

# # Initialize the BERT tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Define the dataset in the specified format
# dataset = []

# # Add input sentences along with their tokenized representations to the dataset
# user_prompts = list(data['prompt'])

# for text in user_prompts:
#     # Tokenize the input text
#     tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True)
#     # Convert tokens to tensor
#     input_tensor = torch.tensor(tokenized_text)
#     # Append to the list of tokenized inputs along with the original text
#     dataset.append((text, input_tensor))

# # Print the dataset
# for i, (text, input_tensor) in enumerate(dataset):
#     print(f"Example {i + 1}:")
#     print(f"Text: {text}")
#     print(f"Tokenized input: {input_tensor}")

In [None]:


# test_dataset = []
# test_dataset.append(dataset[0][1])
# test_dataset.append(dataset[1][1])

# print('tensor 1 length: ', dataset[0][1].size())
# print('tensor 2 length: ', dataset[1][1].size())
# print(test_dataset)

tensor 1 length:  torch.Size([12])
tensor 2 length:  torch.Size([14])
[tensor([  101,  3531,  3443,  1037,  3415, 15327,  2315,  1005,  2537,  1005,
         1012,   102]), tensor([  101,  2071,  2017,  2275,  2039,  1037,  3415, 15327,  2170,  1005,
         5604,  1005,  1029,   102])]


In [None]:
# import torch
# from torch.nn.utils.rnn import pad_sequence

# # Assuming tokenized_inputs is a list of tokenized input tensors
# # Pad the input sequences to the same length
# padded_inputs = pad_sequence(test_dataset, batch_first=True, padding_value=0)

# print(padded_inputs)
# print('tensor 1 length: ', padded_inputs[0][1].size())
# print('tensor 2 length: ', padded_inputs[1][1].size())
# # # Pass the padded inputs through the embedding layer
# # embedded = self.embedding(padded_inputs)

tensor([[  101,  3531,  3443,  1037,  3415, 15327,  2315,  1005,  2537,  1005,
          1012,   102,     0,     0],
        [  101,  2071,  2017,  2275,  2039,  1037,  3415, 15327,  2170,  1005,
          5604,  1005,  1029,   102]])
tensor 1 length:  torch.Size([])
tensor 2 length:  torch.Size([])


# Classification (Linear SVM)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report


# Extract features and labels
X = list(data['prompt'])
y = list(data['type'])

# Label encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Vectorize text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a Support Vector Machine classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)

# Evaluate the classifier
y_pred = svm_classifier.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Example prediction
new_prompt = "deploy the service in the namespace test"
new_prompt_tfidf = tfidf_vectorizer.transform([new_prompt])
predicted_label = svm_classifier.predict(new_prompt_tfidf)
predicted_type = label_encoder.inverse_transform(predicted_label)
print("Predicted kubectl command type:", predicted_type)

              precision    recall  f1-score   support

       apply       1.00      1.00      1.00        11
      create       0.80      0.80      0.80         5
      delete       0.95      0.95      0.95        21
    describe       1.00      1.00      1.00        10
        edit       1.00      1.00      1.00        11
         get       1.00      1.00      1.00        23

    accuracy                           0.98        81
   macro avg       0.96      0.96      0.96        81
weighted avg       0.98      0.98      0.98        81

Predicted kubectl command type: ['apply']


#Generating embeddings

In [None]:
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel

import torch

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# # Initialize the RoBERTa tokenizer and model
# tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
# model = RobertaModel.from_pretrained('roberta-large').to(device)

def generate_embedding(tokenizer, model, user_input):

  # Tokenize the input text
  tokenized_input = tokenizer.encode(user_input, add_special_tokens=True, max_length=128, truncation=True)

  # Convert tokens to a PyTorch tensor
  input_tensor = torch.tensor(tokenized_input).to(device)

  # Add batch dimension
  input_tensor = input_tensor.unsqueeze(0).to(device)  # Add batch dimension

  # Pass the input tensor through the BERT model
  with torch.no_grad():
    outputs = model(input_tensor)

  # Extract the embeddings from the BERT model outputs
  last_hidden_state = outputs.last_hidden_state

  # Extract embeddings for the first token ([CLS] token)
  embedding = last_hidden_state[:, 0, :]

  return embedding

# List of user inputs filtered based on the classified command type
filtered_user_inputs = [prompt for prompt, prompt_type in zip(data['prompt'], data['type']) if prompt_type == 'delete']

# Initialize a list to store the embeddings
embeddings = []

# Loop through each user input
for user_input in filtered_user_inputs:
    embedding = generate_embedding(tokenizer, model, user_input)

    # Append the embedding to the list of embeddings
    embeddings.append(embedding)

# Concatenate the embeddings along the batch dimension
embeddings_tensor = torch.cat(embeddings, dim=0)

# Print the shape of the embeddings tensor
print("Shape of the embeddings tensor:", embeddings_tensor.shape)

Shape of the embeddings tensor: torch.Size([110, 768])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to calculate cosine similarity between two embeddings
def calculate_similarity(target_embedding, dataset_embeddings):
    # Convert target embedding and dataset embeddings to numpy arrays
    target_embedding_np = target_embedding.cpu().numpy()
    dataset_embeddings_np = dataset_embeddings.cpu().numpy()

    # Calculate cosine similarity between the target embedding and each embedding in the dataset
    similarities = cosine_similarity(target_embedding_np, dataset_embeddings_np)
    return similarities.squeeze()

# Example usage:
# Generate embedding for the target text
# target_text = "i want to delete the service 'service' in the the namespace 'name'"
target_text = "Could you delete the service labeled as 'backend-service' in the 'backend-ns' namespace?"
target_embedding = generate_embedding(tokenizer, model, target_text)

# Calculate similarity with each embedding in the dataset
similarities = calculate_similarity(target_embedding, embeddings_tensor)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances


target_text = "Service: Can you delete the service named 'x'?"
target_text2 = "Pod: can you delete the pod 'x'"

target_embedding1 = generate_embedding(tokenizer, model, target_text)
target_embedding2 = generate_embedding(tokenizer, model, target_text2)

target_embedding1 = target_embedding1.numpy()
target_embedding2 = target_embedding2.numpy()

# Reshape the arrays to match the expected input shape of cosine_similarity
# target_embedding1 = target_embedding1.reshape(1, -1)
# target_embedding2 = target_embedding2.reshape(1, -1)

# Calculate cosine similarity
similarity = cosine_similarity(target_embedding1, target_embedding2)[0][0]
print(similarity)



# Calculate Euclidean distance
euclidean_dist = euclidean_distances(target_embedding1, target_embedding2)[0][0]
print(euclidean_dist)


0.93688947
5.052328


In [None]:
from collections import Counter

def get_closest_matches(similarities, k=10):
    # Rank dataset examples based on similarity
    ranked_indices = np.argsort(similarities)[::-1]
    print('ranked indices: ', ranked_indices)

    # Get the top-k closest matches
    closest_matches_indices = ranked_indices[:k]
    print(closest_matches_indices)

    filtered_user_prompts = [prompt for prompt, prompt_type in zip(data['command'], data['type']) if prompt_type == 'delete']

    # Extract the corresponding kubectl commands
    closest_matches_commands = [filtered_user_prompts[i] for i in closest_matches_indices]

    return closest_matches_commands

# Get the closest matches and return the corresponding kubectl commands
closest_matches_commands = get_closest_matches(similarities)
print(closest_matches_commands)
print(' ')

# Count occurrences of each unique command
command_counts = Counter(closest_matches_commands)

# Print the counts of each unique command
for command, count in command_counts.items():
    print(f"{command}: {count}")

ranked indices:  [ 53  33  48  37  57  43  20  40  58  47  34  54  23  24  27  28  29  13
  80  90  98  38  67  75  86  94  74  66  81   9  16  70  78  61  79  71
   2  95  87  99  91 102   3  63 108  22  60   0  36  83 103   5 107 100
 104 101 109  11  45  46   7  52   4 105  50  26 106  25  32   1  19  42
  15  41  12  30  18  88  96  14  21  59  44   8  39  55   6  64  49  51
  93  85  35  56  84  73  65  62  31  17  82  10  68  76  72  69  77  89
  97  92]
[53 33 48 37 57 43 20 40 58 47]
['kubectl delete service <name> -n <name>', 'kubectl delete pod <name> -n <name>', 'kubectl delete service <name> -n <name>', 'kubectl delete pod <name> -n <name>', 'kubectl delete service <name> -n <name>', 'kubectl delete service <name> -n <name>', 'kubectl delete pod <name> -n <name>', 'kubectl delete service <name> -n <name>', 'kubectl delete service <name> -n <name>', 'kubectl delete service <name> -n <name>']
 
kubectl delete service <name> -n <name>: 7
kubectl delete pod <name> -n <name>: 3


In [None]:
from collections import Counter

# Function to get the most occurring command from a list of commands
def get_most_occurring_command(commands):
    # Count the occurrences of each command
    command_counts = Counter(commands)

    # Get the most common command and its count
    most_common_command, count = command_counts.most_common(1)[0]

    return most_common_command

# Example usage:
# Get the most occurring command from the list of closest matches commands
most_occurring_command = get_most_occurring_command(closest_matches_commands)
most_occurring_command

'kubectl delete service <name> -n <name>'

In [None]:

commands = list(data['command'])
# commands[0]

for i in [ 343, 349, 210, 353, 354, 206, 397, 389, 350, 358]:
  print(user_inputs[i])

can i get the configmaps available in the namespace called namespace1?
Could you display the configmaps in the 'namespace1' namespace?
Could you remove the service labeled as 'log-service' in the 'log-ns' namespace?
Could you list all the configmaps in the 'namespace1' namespace?
Can you display the configmaps in the 'namespace1' namespace?
Could you delete the service labeled as 'backend-service' in the 'backend-ns' namespace?
Could you edit the svc service_name in the 'namespace1' namespace?
Could you modify the svc service_name in the 'namespace1' namespace?
Can you list the configmaps in the 'namespace1' namespace?
Can you list the configmaps in the 'namespace1' namespace?


In [None]:
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
from nltk import word_tokenize, pos_tag, ne_chunk

# Define the text to be analyzed
text = "Create a namespace called 'namespace1'"

# Tokenize the text
tokens = word_tokenize(text)

# Perform part-of-speech tagging
tagged_tokens = pos_tag(tokens)

# Perform named entity recognition
entities = ne_chunk(tagged_tokens)

print(entities)


(S Create/VB a/DT namespace/NN called/VBN 'namespace1/CD '/'')


In [None]:
import re

# Define the user input
# user_input = "I want to create a namespace called 'namespace1'."
user_input = "I want to edit the configmap 'confmap.yaml' in the namespace 'configs'"

# Define a regular expression pattern to extract entities enclosed in single quotes
pattern = r"'(.*?)'"

# Find all matches of the pattern in the user input
entities = re.findall(pattern, user_input)

# Print the extracted entities
for entity in entities:
    print("Entity:", entity)


kubectl_template = 'kubectl edit configmap <name> -n <name>'

# Define the entity values to replace within the <name> tags
entity1 = entities[0]
entity2 = entities[1]

# Replace the entities within the command
# formatted_command = kubectl_template.replace('<name>', entity1).replace('<name>', entity2)

# print("Formatted command:", formatted_command)

formatted_command = kubectl_template.replace('<name>', '{}').format(entity1, entity2)

print("Formatted command:", formatted_command)




Entity: confmap.yaml
Entity: configs
Formatted command: kubectl edit configmap confmap.yaml -n configs


In [None]:
# Using LLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
model = GPT2LMHeadModel.from_pretrained("gpt2-large")



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Example input user prompt
user_prompt = "i want to create a namespace called x. Can you tell me the kubectl command for this?"

# Step 1: Contextual Understanding
# Tokenize the user prompt
input_ids = tokenizer.encode(user_prompt, return_tensors="pt")
# Generate output tokens using the GPT-2 model
output = model.generate(input_ids, max_length=100, num_return_sequences=1)
# Decode the output tokens to get the generated text
context_understanding_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(context_understanding_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


i want to create a namespace called x. Can you tell me the kubectl command for this?

I'm not sure what you mean by "create a namespace". I'm not sure what you mean by "create a namespace".

I'm not sure what you mean by "create a namespace".

I'm not sure what you mean by "create a namespace".

I'm not sure what you mean by "create a namespace".

I'm not


In [None]:
import pandas as pd

# Replace 'your_file.txt' with the path to your file
file_path = 'entity_rec_dataset.txt'

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path, delimiter=',')

# Display the first few rows of the DataFrame to verify the data was loaded correctly
df.head()
# df.columns


Unnamed: 0,user_prompt,namespace,resource
can you describe pod argocd-server in the namespace sample,"""sample""","""argocd-server""",
describe pod web-pod in the namespace dev,"""dev""","""web-pod""",
describe pod cache-server in the namespace testing,"""testing""","""cache-server""",
describe pod nginx-pod in the namespace production,"""production""","""nginx-pod""",
describe pod database-pod in the namespace staging,"""staging""","""database-pod""",


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt")

outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-2b-it.
401 Client Error. (Request ID: Root=1-6608e68e-6183dd0962eeec7f173e2769;577b8f8a-789b-4704-ba28-9061923a733a)

Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Repo model google/gemma-2b-it is gated. You must be authenticated to access it.

In [None]:
# using bertforsquence
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


# Define a custom PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


# Split the dataset into train and test sets
train_prompts, test_prompts, train_commands, test_commands = train_test_split(data['prompt'], data['command'], test_size=0.2, random_state=42)

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_commands)
test_labels_encoded = label_encoder.transform(test_commands)

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=len(label_encoder.classes_))

# Tokenize the input texts
train_encodings = tokenizer(train_prompts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_prompts.tolist(), truncation=True, padding=True)

# Create DataLoader objects for train and test sets
train_dataset = CustomDataset(train_encodings, train_labels_encoded.tolist())
test_dataset = CustomDataset(test_encodings, test_labels_encoded.tolist())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=1e-5)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

for epoch in range(15):  # Number of epochs
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate on the test set
model.eval()
predictions = []
true_labels = []

for batch in tqdm(test_loader):
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, axis=1).cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

# Decode the predicted and true labels
predicted_labels = label_encoder.inverse_transform(predictions)
true_labels = label_encoder.inverse_transform(true_labels)

# Calculate accuracy
accuracy = sum(p == t for p, t in zip(predicted_labels, true_labels)) / len(predicted_labels)
print("Accuracy:", accuracy)
print('----Classification Report----')
print(classification_report(true_labels, predicted_labels))

model.eval()

new_input_text = "can you describe a pod 'pod_name' in namespace 'x'"
inputs = tokenizer(new_input_text, return_tensors="pt", truncation=True, padding=True).to(device)

with torch.no_grad():
    outputs = model(**inputs)

predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
predicted_label = label_encoder.classes_[predicted_label_idx]

print("Predicted Label:", predicted_label)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 41/41 [00:10<00:00,  3.90it/s]
100%|██████████| 41/41 [00:10<00:00,  3.87it/s]
100%|██████████| 41/41 [00:10<00:00,  3.85it/s]
100%|██████████| 41/41 [00:10<00:00,  3.83it/s]
100%|██████████| 41/41 [00:10<00:00,  3.78it/s]
100%|██████████| 41/41 [00:10<00:00,  3.75it/s]
100%|██████████| 41/41 [00:11<00:00,  3.71it/s]
100%|██████████| 41/41 [00:10<00:00,  3.77it/s]
100%|██████████| 41/41 [00:11<00:00,  3.53it/s]
100%|██████████| 41/41 [00:11<00:00,  3.67it/s]
100%|██████████| 41/41 [00:11<00:00,  3.62it/s]
100%|██████████| 41/41 [00:11<00:00,  3.54it/s]
100%|██████████| 41/41 [00:10<00:00,  3.80it/s]
100%|██████████| 41/41 [00:10<00:00,  3.79it/s]
100%|██████████| 41/41 [00:10<00:0

Accuracy: 1.0
----Classification Report----
                                                    precision    recall  f1-score   support

                           kubectl apply -f <name>       1.00      1.00      1.00        11
                   kubectl create namespace <name>       1.00      1.00      1.00         5
        kubectl delete deployment <name> -n <name>       1.00      1.00      1.00         3
                   kubectl delete namespace <name>       1.00      1.00      1.00         6
               kubectl delete pod <name> -n <name>       1.00      1.00      1.00         2
                 kubectl delete secret secret_name       1.00      1.00      1.00         3
           kubectl delete service <name> -n <name>       1.00      1.00      1.00         4
       kubectl delete statefulset <name> -n <name>       1.00      1.00      1.00         3
      kubectl describe deployment <name> -n <name>       1.00      1.00      1.00         3
             kubectl describe pod <




In [3]:
data.head()

Unnamed: 0,prompt,command,type
0,Please create a namespace named 'production'.,kubectl create namespace <name>,create
1,Could you set up a namespace called 'testing'?,kubectl create namespace <name>,create
2,Let's create a namespace for our development e...,kubectl create namespace <name>,create
3,I need a namespace for our frontend services. ...,kubectl create namespace <name>,create
4,We require a namespace named 'backend' for our...,kubectl create namespace <name>,create


In [12]:
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch
from sklearn.model_selection import train_test_split
import numpy as np

# Define Dataset class
class KubectlDataset(Dataset):
    def __init__(self, prompts, commands, tokenizer):
        self.prompts = prompts
        self.commands = commands
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        prompt = self.prompts[idx]
        command = self.commands[idx]
        return prompt, command


prompts = list(data['prompt'])
commands = list(data['command'])

# Split dataset into train and validation sets (80% train, 20% validation)
train_prompts, valid_prompts, train_commands, valid_commands = train_test_split(prompts, commands, test_size=0.2, random_state=42)

# Load pre-trained BART model and tokenizer
model_name = 'facebook/bart-large'
tokenizer = BartTokenizer.from_pretrained(model_name)
config = BartConfig.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name, config=config)

# Prepare dataset
train_dataset = KubectlDataset(train_prompts, train_commands, tokenizer)
valid_dataset = KubectlDataset(valid_prompts, valid_commands, tokenizer)

# Define DataLoader
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# Fine-tuning parameters
lr = 3e-5
num_epochs = 30
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# Early stopping parameters
patience = 3  # Number of epochs to wait before early stopping
best_valid_loss = float('inf')
early_stopping_counter = 0

# Fine-tuning loop
model.to(device)
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        prompts, commands = batch
        input_ids = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
        labels = tokenizer(commands, return_tensors='pt', padding=True, truncation=True).input_ids.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in valid_loader:
            prompts, commands = batch
            input_ids = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
            labels = tokenizer(commands, return_tensors='pt', padding=True, truncation=True).input_ids.to(device)

            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

    # Calculate average validation loss
    avg_valid_loss = total_loss / len(valid_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_valid_loss}')

    # Check for early stopping
    if avg_valid_loss < best_valid_loss:
        best_valid_loss = avg_valid_loss
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= patience:
            print(f'Early stopping at epoch {epoch+1} with best validation loss: {best_valid_loss}')
            break

# Text generation
user_prompt = "Deploy the application"
input_ids = tokenizer.encode(user_prompt, return_tensors="pt").to(device)
generated_ids = model.generate(input_ids=input_ids, max_length=50, num_beams=4, early_stopping=True)
generated_command = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(f'User Prompt: {user_prompt}')
print(f'Generated kubectl Command: {generated_command}')


Epoch 1/30, Validation Loss: 3.46484375
Epoch 2/30, Validation Loss: 2.905877669652303
Epoch 3/30, Validation Loss: 2.4050615628560386
Epoch 4/30, Validation Loss: 1.7275536855061848
Epoch 5/30, Validation Loss: 1.443963646888733
Epoch 6/30, Validation Loss: 1.214551329612732
Epoch 7/30, Validation Loss: 1.0107586185137432
Epoch 8/30, Validation Loss: 0.8825874328613281
Epoch 9/30, Validation Loss: 0.5644655028978983
Epoch 10/30, Validation Loss: 0.3714420100053151
Epoch 11/30, Validation Loss: 0.23893027504285178
Epoch 12/30, Validation Loss: 0.1584938665231069
Epoch 13/30, Validation Loss: 0.14720496535301208
Epoch 14/30, Validation Loss: 0.11519220719734828
Epoch 15/30, Validation Loss: 0.11663808425267537
Epoch 16/30, Validation Loss: 0.09509812543789546
Epoch 17/30, Validation Loss: 0.30788586537043255
Epoch 18/30, Validation Loss: 0.4776074290275574
Epoch 19/30, Validation Loss: 0.3711738884449005
Early stopping at epoch 19 with best validation loss: 0.09509812543789546
User Prom

In [18]:
# Text generation
# user_prompt = "Can you create a namespace called 'x'"
user_prompt = "i want to delete the pod 'pod' in the namespace 'x'"
input_ids = tokenizer.encode(user_prompt, return_tensors="pt").to(device)
generated_ids = model.generate(input_ids=input_ids, max_length=50, num_beams=4, early_stopping=True)
generated_command = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(f'User Prompt: {user_prompt}')
print(f'Generated kubectl Command: {generated_command}')

User Prompt: i want to delete the pod 'pod' in the namespace 'x'
Generated kubectl Command: kubectl delete pod <name> -n <name>)


In [34]:
from nltk.translate.bleu_score import corpus_bleu

# Example reference commands (replace with your actual reference data)
reference_commands = [
    'kubectl apply -f deployment.yaml',
    'kubectl scale deployment my-deployment --replicas=3',
    'kubectl get pods'
]

# Example generated commands
generated_commands = [
    'kubectl apply -f deployment.yaml',
    'kubectl get pods',
    'kubectl scale deployment my-deployment --replicas=5'
]

# Tokenize reference commands
# tokenized_reference_commands = reference_commands

tokenized_reference_commands = [ref_command.split() for ref_command in reference_commands]


# Tokenize generated commands
tokenized_generated_commands = [command.split() for command in generated_commands]

print(tokenized_generated_commands)
print(tokenized_reference_commands)

# Compute BLEU score
bleu_score = corpus_bleu(tokenized_reference_commands, tokenized_generated_commands)
print("BLEU Score:", bleu_score)


[['kubectl', 'apply', '-f', 'deployment.yaml'], ['kubectl', 'get', 'pods'], ['kubectl', 'scale', 'deployment', 'my-deployment', '--replicas=5']]
[['kubectl', 'apply', '-f', 'deployment.yaml'], ['kubectl', 'scale', 'deployment', 'my-deployment', '--replicas=3'], ['kubectl', 'get', 'pods']]
BLEU Score: 0
