In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
import wandb
from kaggle_secrets import UserSecretsClient
# !pip install -U bitsandbytes

In [None]:
MODEL = "deepseek-ai/DeepSeek-Coder-V2-Lite-Base"
DATA_PATH = "/kaggle/input/linux-dataset/LINUX_TERMINAL_COMMANDS.jsonl"

In [None]:
# Load tokenizer
tokenizer=AutoTokenizer.from_pretrained(MODEL,trust_remote_code=True)

In [None]:
# Load model in 4-bit (QLoRA)
# Quantization settings for QLoRA: 4-bit NF4 weights with double quantization,
# using float16 as the compute dtype.
_quant_kwargs = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
bnb_config = BitsAndBytesConfig(**_quant_kwargs)

In [None]:
# Load the base checkpoint with the 4-bit quantization config applied.
# device_map="auto" leaves weight placement across available devices to the loader.
_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
}
base_model = AutoModelForCausalLM.from_pretrained(MODEL, **_load_kwargs)

In [None]:
# !pip show bitsandbytes

In [None]:
# !pip install -U bitsandbytes

In [None]:
base_model = prepare_model_for_kbit_training(base_model)

In [None]:
# Names of the linear projections that receive LoRA adapters.
# NOTE(review): verify that every name exists in this architecture (e.g. by listing
# base_model.named_modules()); `gate_proj` is not targeted — confirm that is intentional.
_lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj"]

# Rank-16 LoRA for causal language modelling; no bias terms are trained.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=_lora_targets,
)

In [None]:
model = get_peft_model(base_model, peft_config)


In [None]:
# Load and process dataset
# Read the JSONL file and keep only the two columns used to build prompts.
# The raw split mapping gets its own name so `dataset` always refers to the
# column-filtered "train" split.
_raw_splits = load_dataset("json", data_files=DATA_PATH)
dataset = _raw_splits["train"].select_columns(["description", "command"])

In [None]:
# Apply chat template
def format_with_chat_template(example):
    """Render one dataset row as a chat-formatted training string.

    The row's "description" becomes the user turn and its "command" becomes the
    assistant turn, both preceded by a fixed system prompt. The conversation is
    rendered with the module-level tokenizer's chat template as plain text
    (not tokenized, no trailing generation prompt).

    Returns:
        dict: ``{"text": <rendered conversation>}``.
    """
    system_prompt = '''You are a CLI command translator. Given a natural language request, output only the exact command(s) needed to accomplish the task. Provide no explanations, descriptions, or additional text - just the raw command(s) that can be directly executed in a terminal.

Examples:
Input: "list all files in current directory"
Output: ls -la

Input: "find all Python files"
Output: find . -name "*.py"

Input: "show running processes"
Output: ps aux'''

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": example["description"]},
        {"role": "assistant", "content": example["command"]},
    ]
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

In [None]:
formatted_dataset = dataset.map(format_with_chat_template)

## Choosing the maximum sequence length

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Token length distribution
def get_token_lengths(example):
    """Return ``{"length": n}``, where ``n`` is the number of input ids the
    module-level tokenizer produces for ``example["text"]``."""
    encoded = tokenizer(example["text"])
    token_ids = encoded["input_ids"]
    return {"length": len(token_ids)}

In [None]:
# Add a "length" column holding each example's token count, then pull that
# column out as a plain Python list for plotting.
lengths = formatted_dataset.map(get_token_lengths)
lengths_list = list(lengths["length"])


In [None]:
# Plot
# Histogram of per-example token counts (50 bins), drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(lengths_list, bins=50)
ax.set_xlabel("Token length")
ax.set_ylabel("Number of samples")
ax.set_title("Input Length Distribution")
plt.show()

In [None]:
def tokenize(example, max_length=192):
    """Tokenize ``example["text"]`` to a fixed length and build loss labels.

    Works both for a single example (``example["text"]`` is a string) and for a
    batch (``example["text"]`` is a list of strings, as passed by
    ``Dataset.map(..., batched=True)``).

    Args:
        example: Mapping with a "text" entry holding the rendered prompt(s).
        max_length: Sequences are truncated/padded to exactly this many tokens.

    Returns:
        The tokenizer output with an added "labels" entry. Labels equal
        ``input_ids`` on real tokens and ``-100`` on padding positions.
    """
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    def _mask_padding(ids, mask):
        # -100 is the index ignored by the cross-entropy loss; without it every
        # pad token in the fixed-length sequence would be trained on.
        # The attention mask is used (not a pad-id comparison) so a real EOS
        # token is kept even when pad and EOS share the same id.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        tokens["labels"] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example (or an empty batch): a flat list of ids.
        tokens["labels"] = _mask_padding(input_ids, attention_mask)
    return tokens


In [None]:
tokenized_dataset = formatted_dataset.map(tokenize, batched=True, remove_columns=["description", "command", "text"])

# **Preparing the dataset**

In [None]:
# `apply_chat_template` is a method of the tokenizer base class, so checking for
# it with hasattr() is always True. What matters is whether this checkpoint
# actually ships a template, i.e. whether `chat_template` is set.
has_chat_template = getattr(tokenizer, "chat_template", None) is not None

In [None]:
print("Chat template available?", has_chat_template)

In [None]:
formatted_dataset

In [None]:
tokenized_dataset

In [None]:
split_data=tokenized_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
split_data

# **Training phase**

In [None]:
# Read the Weights & Biases API key from the Kaggle secret labelled "wandb-key"
# so the key itself never appears in the notebook source.
user_secrets= UserSecretsClient()
my_secret = user_secrets.get_secret("wandb-key") 

# Authenticate this session with W&B using that key.
wandb.login(key=my_secret)

In [None]:
# Training Arguments
training_args = TrainingArguments(
    # Absolute path: "./kaggle/working/..." is resolved relative to the current
    # directory and would create a nested "kaggle/working" tree instead of
    # writing into /kaggle/working, where the adapter is saved after training.
    output_dir="/kaggle/working/deepseek-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,  # 8 * 16 = 128 sequences per device per optimizer step
    learning_rate=2e-4,
    num_train_epochs=5,
    save_strategy="no",  # no intermediate checkpoints; the adapter is saved explicitly after training
    fp16=True,
    logging_dir="./logs",
    eval_strategy="steps",
    eval_steps=5,  # evaluate and log every 5 optimizer steps
    logging_steps=5,
    report_to="wandb",
    run_name="deepseek-cli-run",
    ddp_find_unused_parameters=False,
)


In [None]:
# !pip install --upgrade transformers

In [None]:
# Trainer
trainer = Trainer(
    model=model,  # PEFT-wrapped model returned by get_peft_model
    args=training_args,
    train_dataset=split_data["train"],  # 90% of the tokenized data
    eval_dataset=split_data['test'],  # held-out 10% (test_size=0.1, seed=42)
    tokenizer=tokenizer  # NOTE(review): newer transformers releases prefer `processing_class=` — confirm against the installed version
)

In [None]:
model.config.use_cache = False


In [None]:
import gc

gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [None]:
# Release cached allocator memory on every visible GPU.
# The device context manager restores the previously selected device on exit;
# torch.cuda.set_device(i) would instead leave the last GPU selected as the
# current device for everything that runs afterwards (including training).
for i in range(torch.cuda.device_count()):
    with torch.cuda.device(i):
        torch.cuda.empty_cache()


In [None]:
trainer.train()

In [None]:
model.save_pretrained("/kaggle/working")
tokenizer.save_pretrained("/kaggle/working")

In [None]:
model.save_pretrained("CLI_model")
tokenizer.save_pretrained("CLI_model")

# **Saving the model**

In [None]:
from huggingface_hub import login

# Never embed the Hugging Face token in the notebook source. Read it from the
# Kaggle secret store, the same mechanism used for the W&B key.
# Requires a Kaggle secret labelled "hf-token" holding a token with write access.
login(token=UserSecretsClient().get_secret("hf-token"))


In [None]:
# Push LoRA adapter
model.push_to_hub("Maarij-Aqeel/CLI_model", use_temp_dir=False)

# Push tokenizer (very important!)
tokenizer.push_to_hub("Maarij-Aqeel/CLI_model")


# **Testing the model**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig

In [None]:
# !pip install -U bitsandbytes

In [None]:
# Load model in 4-bit (QLoRA)
# Inference-time quantization settings: 4-bit NF4 weights with double
# quantization and float16 as the compute dtype.
_inference_quant_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**_inference_quant_kwargs)

In [None]:
# Load tokenizer and model
# Reload the tokenizer and the model from the published Hub repository,
# quantized to 4-bit, and switch the model to evaluation mode.
_repo_id = "Maarij-Aqeel/CLI_model"

tokenizer = AutoTokenizer.from_pretrained(_repo_id, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    _repo_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)
model = model.eval()


In [None]:

# Ensure pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Run this ONCE after loading your model
def generate_cli_command(user_input, model, tokenizer):
    """Generate a shell command for a natural-language request.

    Args:
        user_input: The request text; it becomes the user turn of the chat.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``decode`` and
            the pad/eos token ids.

    Returns:
        str: The generated command with surrounding whitespace removed, or
        ``"Error: <message>"`` if generation or decoding raises. Errors are
        reported as text (best effort) so interactive callers keep running.
    """
    # Must match the system prompt used by format_with_chat_template during
    # fine-tuning, blank lines included; any difference is a train/inference mismatch.
    system_prompt = '''You are a CLI command translator. Given a natural language request, output only the exact command(s) needed to accomplish the task. Provide no explanations, descriptions, or additional text - just the raw command(s) that can be directly executed in a terminal.

Examples:
Input: "list all files in current directory"
Output: ls -la

Input: "find all Python files"
Output: find . -name "*.py"

Input: "show running processes"
Output: ps aux'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input},
    ]

    # Render the conversation, ending with the marker that opens the assistant turn.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The prompt is deliberately not truncated: cutting it from the right would
    # drop the end of the request and the trailing generation marker.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    try:
        with torch.no_grad():
            output_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=64,
                do_sample=True,
                temperature=0.3,
                top_p=0.9,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=False,
                output_attentions=False,
                output_hidden_states=False,
                return_dict_in_generate=False,
            )

        # generate() returns prompt + completion, so decode only the new tokens.
        # Decoding the whole sequence and splitting on "Output:" would match the
        # examples inside the system prompt and return prompt text.
        completion_ids = output_ids[0][prompt_len:]
        command = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

        # Drop a leading "Output:" label if the model echoes the example format.
        if command.startswith("Output:"):
            command = command[len("Output:"):].strip()

        return command

    except Exception as e:
        return f"Error: {e}"

In [None]:
def interactive_mode():
    """Run a prompt loop that turns typed requests into CLI commands.

    Reads lines from standard input until the user enters 'quit', 'exit' or 'q'
    (case-insensitive). Blank lines are ignored. Every other line is passed to
    ``generate_cli_command`` with the module-level ``model`` and ``tokenizer``,
    and the result is printed.
    """
    exit_words = ("quit", "exit", "q")

    print("=== Interactive CLI Command Generator ===")
    print("Type your requests (or 'quit' to exit):")

    request = input("\n> ").strip()
    while request.lower() not in exit_words:
        if request:
            print("Generating command...")
            generated = generate_cli_command(request, model, tokenizer)
            print(f"Command: {generated}")
        request = input("\n> ").strip()

    print("Goodbye!")

In [None]:
interactive_mode()