### Imports & Installation

In [None]:
# !pip install langchain_google_genai
# !pip install langchain_community
# !pip install langchain_huggingface
# !pip install gradio
# !pip install rapidfuzz
# !pip install pypdf
# !pip install faiss-cpu
# !pip install spacy
# !pip install fuzzywuzzy
# !python -m spacy download en_core_web_sm
# !pip install python-Levenshtein
# !pip install django
# !pip install django-cors-headers
# !pip install djangorestframework
# !pip install drf_spectacular
# !pip install psycopg2
# !python.exe -m pip install --upgrade pip


In [1]:
import json, os, re, ast
import subprocess
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough
from langchain.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import gradio as gr
from sentence_transformers import SentenceTransformer, util
import spacy
from rapidfuzz import fuzz
pdf_file = "./res/Cli script guidebook.pdf"


  from .autonotebook import tqdm as notebook_tqdm





### Load Model

In [65]:
def get_llm(model_name="gemini-1.5-pro", temperature=0.1, max_output_tokens=500):
    """Build a Gemini chat model client.

    Args:
        model_name: Model identifier forwarded to ``ChatGoogleGenerativeAI``.
        temperature: Sampling temperature forwarded to the client. Defaults to
            0.1, the value that used to be hard-coded here.
        max_output_tokens: Output-token cap forwarded to the client. Defaults
            to 500, the value that used to be hard-coded here.

    Returns:
        The ``ChatGoogleGenerativeAI`` instance built from these settings.

    Note:
        The API key is looked up in the ``GOOGLE_API_KEY`` environment variable
        each time this function is called, so the variable must be set before
        the call (``None`` is passed through when it is not).
    """
    return ChatGoogleGenerativeAI(
        model=model_name,
        temperature=temperature,
        max_output_tokens=max_output_tokens,
        google_api_key=os.environ.get("GOOGLE_API_KEY"),
    )

#### Load API key from the credentials file into the environment

In [66]:
# Read the Google API key from a local JSON credentials file and export it as
# the GOOGLE_API_KEY environment variable, which get_llm() reads.
api_key_path = './credentials/google_credentials.json'
if not os.path.exists(api_key_path):
    raise FileNotFoundError(f"API key file not found at {api_key_path}. Please provide the correct path.")

with open(api_key_path, 'r', encoding='utf-8') as f:
    api_key = json.load(f)

# os.environ only accepts strings: assigning a missing key (None) would fail
# with an opaque "str expected, not NoneType" TypeError, so validate the value
# first and say what is actually wrong with the credentials file.
google_api_key = api_key.get("GOOGLE_API_KEY") if isinstance(api_key, dict) else None
if not isinstance(google_api_key, str) or not google_api_key:
    raise ValueError(
        f'{api_key_path} must be a JSON object with a non-empty string "GOOGLE_API_KEY" entry.'
    )
os.environ['GOOGLE_API_KEY'] = google_api_key

### Load & Prepare Data

In [192]:
# Load the CLI guidebook PDF through LangChain's PyPDFLoader.
loader = PyPDFLoader(pdf_file)
pdf_docs = loader.load_and_split()

# Mapping whose keys are node names; it is embedded below under the heading
# "Default Arguments for each node". JSON is UTF-8 by specification, so decode
# it explicitly instead of relying on the platform's locale encoding.
with open("res/data_mapping.json", "r", encoding="utf-8") as f:
    data_mapping = json.load(f)

# The mapping's keys double as the vocabulary for keyword extraction.
REFERENCE_KEYWORDS = list(data_mapping.keys())

# Wrap the whole mapping in a single Document so it is indexed (and can be
# retrieved) alongside the guidebook pages.
data_mapping_doc = Document(page_content='Default Arguments for each node:\n'+ json.dumps(data_mapping))

all_docs = pdf_docs  + [data_mapping_doc]

### Create Vector Store

In [193]:
# Embed every document, index the vectors with FAISS, and expose the index as
# a retriever for the RAG chain.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vectorstore = FAISS.from_documents(all_docs, embeddings)
retriever = vectorstore.as_retriever()

# Disabled: separate index/retriever over the mapping document only.
# vectorstore = FAISS.from_documents(documents=mapping_doc, embedding=embeddings)
# mapping_retriever = vectorstore.as_retriever()

### RAG Chain

#### Helper Functions

In [194]:
def format_docs(docs):
    """Return the ``page_content`` of every document, joined by blank lines."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

def combine_inputs(input_dict, retriever):
    """Fetch context for a question and build the prompt variables.

    Args:
        input_dict: Mapping that must contain the user's query under
            ``"question"``.
        retriever: Object used to look up documents for the question. Its
            ``invoke`` method is used when present; otherwise the legacy
            ``get_relevant_documents`` method is called.

    Returns:
        A dict with ``"context"`` (the retrieved documents formatted by
        ``format_docs``) and ``"question"`` (the query, unchanged).

    Raises:
        KeyError: If ``input_dict`` has no ``"question"`` entry.
    """
    question = input_dict["question"]

    # LangChain deprecated `get_relevant_documents` in favour of the Runnable
    # `invoke` API; prefer the latter but keep working with retriever-like
    # objects that only offer the legacy method.
    fetch = getattr(retriever, "invoke", None)
    if not callable(fetch):
        fetch = retriever.get_relevant_documents
    retrieved_docs = fetch(question)

    return {
        "context": format_docs(retrieved_docs),
        "question": question
    }

def parse_command_list(output: str):
    """Extract a Python list literal from raw LLM text.

    The text may wrap the list in prose or code fences, so bracketed spans are
    tried in this order:

    1. the outermost span (first ``[`` to last ``]``), which keeps nested
       lists and commands that themselves contain ``]`` intact;
    2. the first ``[`` to the first following ``]``, which picks the first
       list when the text contains several separate ones;
    3. the whole stripped text, when there is no bracketed span at all.

    Args:
        output: Raw text produced by the model.

    Returns:
        The parsed list. A parsed non-list literal is wrapped in a one-element
        list. If nothing can be parsed, a one-element list holding a
        ``"Failed to parse: ..."`` message is returned instead of raising.
    """
    text = output.strip()

    candidates = []
    start, end = text.find("["), text.rfind("]")
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])
    first_pair = re.search(r"\[.*?\]", text, re.DOTALL)
    if first_pair and first_pair.group(0) not in candidates:
        candidates.append(first_pair.group(0))
    if not candidates:
        candidates.append(text)

    failure = None
    for candidate in candidates:
        try:
            parsed = ast.literal_eval(candidate)
        # These are the exception types ast.literal_eval is documented to raise
        # on malformed input.
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            failure = e
            continue
        return parsed if isinstance(parsed, list) else [parsed]

    return [f"Failed to parse: {failure}"]


#### LangChain

In [195]:

# Prompt for the command-extraction step. `{context}` and `{question}` are the
# two template variables; combine_inputs() returns a dict with exactly these
# keys. The prompt asks for a raw Python list of commands and its example uses
# an `<args>` placeholder, which replace_args() later substitutes.
template = """You are an expert assistant trained to extract exact command-line instructions from a user guide.

You are given the folliwing information:
- A user guide that contains command-line instructions for a CLI tool.
- all possible values for node_name
- mapping of node names to their default arguments.

Reference Information:
{context}

User Query:
{question}

Instructions:
- Carefully read the Reference Information, node names, and default arguments for each node.
- Return the CLI command(s) that are relevant to the user's query.
- Format your response as a raw Python list: ['command1', 'command2', ...]
- Do NOT include any explanations, comments, or formatting outside the list.
- If no command matches the query, return an empty list: []
- The output MUST be ordered in the same order as the input.

Example Query:
I want to make a logistic regression model and make a project with id 3.

Expected Response:
[
    'make logistic_regression <args>',
    'create_project 3'
]
"""

# Chat prompt built from the template string above.
prompt = ChatPromptTemplate.from_template(template)

# Pipeline, in order: take the {"question": ...} input, look up context with
# the module-level `retriever`, fill the prompt, call the Gemini model, reduce
# the reply to plain text, then parse that text into a Python list.
# Note: get_llm() runs when this cell executes, so GOOGLE_API_KEY must already
# be set in the environment at that point.
rag_chain = (
    RunnablePassthrough()
    | (lambda x: combine_inputs(x, retriever))
    | prompt
    | get_llm(model_name="gemini-2.0-flash")
    | StrOutputParser()
    | parse_command_list
)

### Keyword Extraction (Hybrid)

In [None]:
# Sentence-embedding model; extract_keywords_hybrid() uses it to encode both
# the reference keywords and the noun phrases of the query.
name_extractor = SentenceTransformer('all-MiniLM-L6-v2')
# spaCy English pipeline; extract_keywords_hybrid() reads `noun_chunks` from
# the document it produces.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): extract_keywords_hybrid() assigns a local variable of the same
# name from its own `reference_keywords` argument, so this module-level tensor
# looks unused in the visible code — confirm before removing.
keyword_embeddings = name_extractor.encode(REFERENCE_KEYWORDS, convert_to_tensor=True)

def extract_keywords_hybrid(user_input: str, reference_keywords, top_n, transformer_thresh=0.7, fuzzy_thresh=80):
    """Find which reference keywords a free-text query mentions.

    The lowercased query is split into noun phrases with the module-level
    ``nlp`` pipeline. Each phrase is compared with every keyword in two ways:
    cosine similarity of ``name_extractor`` embeddings, and ``fuzz.ratio``
    against the keyword with underscores replaced by spaces. A keyword is
    matched when either score reaches its threshold.

    Args:
        user_input: The user's query.
        reference_keywords: Candidate keyword strings.
        top_n: Maximum number of keywords to return. Zero or a negative value
            yields an empty list.
        transformer_thresh: Minimum cosine similarity for an embedding match.
        fuzzy_thresh: Minimum ``fuzz.ratio`` score for a fuzzy match.

    Returns:
        Matched keywords in the order they were first matched. When more than
        ``top_n`` matched, only the ``top_n`` whose space-separated form scores
        highest with ``fuzz.ratio`` against the lowercased query are kept.
    """
    reference_keywords = list(reference_keywords)
    # Nothing can be returned in these cases, so skip the model calls. This
    # also keeps a negative `top_n` from turning into a "drop the last item"
    # slice further down.
    if not reference_keywords or top_n <= 0:
        return []

    doc = nlp(user_input.lower())
    phrases = [chunk.text for chunk in doc.noun_chunks]
    if not phrases:
        return []

    keyword_embeddings = name_extractor.encode(reference_keywords, convert_to_tensor=True)
    # Hoisted out of the phrase loop: the space-separated form of each keyword.
    readable_keywords = [keyword.replace("_", " ") for keyword in reference_keywords]

    matched_keywords = []
    seen = set()

    for phrase in phrases:
        phrase_embedding = name_extractor.encode(phrase, convert_to_tensor=True)
        cosine_scores = util.cos_sim(phrase_embedding, keyword_embeddings)[0]

        semantic_hits = [
            keyword
            for keyword, score in zip(reference_keywords, cosine_scores)
            if score >= transformer_thresh
        ]
        # === Fuzzy matching fallback ===
        fuzzy_hits = [
            keyword
            for keyword, readable in zip(reference_keywords, readable_keywords)
            if fuzz.ratio(phrase, readable) >= fuzzy_thresh
        ]

        # Embedding hits first, then fuzzy hits, each in keyword order.
        for keyword in semantic_hits + fuzzy_hits:
            if keyword not in seen:
                seen.add(keyword)
                matched_keywords.append(keyword)

    if len(matched_keywords) <= top_n:
        return matched_keywords

    # Rank with the same normalisation used for matching (lowercased text,
    # underscores as spaces); comparing the raw query with underscore names
    # penalised multi-word keywords.
    query = user_input.lower()
    ranked = sorted(
        matched_keywords,
        key=lambda keyword: fuzz.ratio(query, keyword.replace("_", " ")),
        reverse=True,
    )
    keep = set(ranked[:top_n])
    # Keep first-match order rather than score order.
    return [keyword for keyword in matched_keywords if keyword in keep]

### Post-Processing

In [188]:
def args_extractor(names, reference_keywords, data_mapping):
    """Serialize the mapped value of each recognised name as compact JSON.

    A name is used only when it appears both in ``reference_keywords`` and in
    ``data_mapping``; other names are skipped, so the result can be shorter
    than ``names``.

    Returns:
        A list of JSON strings (no whitespace after ``,`` or ``:``), in the
        order of the names that were used.
    """
    return [
        json.dumps(data_mapping[node_name], separators=(',', ':'))
        for node_name in names
        if node_name in reference_keywords and node_name in data_mapping
    ]

def replace_args(commands, replacements, names, reference_keywords):
    """Fill ``<args>`` placeholders in commands with argument strings.

    Placeholders are filled positionally: the first command containing
    ``<args>`` gets the first usable replacement, the second gets the next,
    and so on. Names that are not in ``reference_keywords`` are skipped without
    consuming a replacement, so ``replacements`` is expected to hold one entry
    per recognised name, in the same order.

    Args:
        commands: Commands, some of which may contain ``<args>``.
        replacements: Argument strings to substitute.
        names: Names the replacements belong to, in the same order.
        reference_keywords: Collection of recognised names.

    Returns:
        A new list of commands; ``commands`` itself is left unmodified. When
        the replacements run out, remaining ``<args>`` placeholders stay as
        they are. Entries that are not strings are passed through unchanged.
    """
    recognised = (name for name in names if name in reference_keywords)
    # zip() stops at the shorter input, so a placeholder is only ever filled
    # when both a recognised name and its replacement exist.
    pending = zip(recognised, replacements)

    resolved = []
    for command in commands:
        if isinstance(command, str) and "<args>" in command:
            pair = next(pending, None)
            if pair is not None:
                command = command.replace("<args>", pair[1])
        resolved.append(command)

    return resolved

In [None]:
# template = """You are great at extracting suitable commands from user queries.

# You are given:
# - A mapping of node names to their default arguments.
# - A user's natural language query.
# - An empty list of commands to be filled.

# Your task:
# 1. Carefully read the user query.
# 2. Identify any mentioned node names, even if they are misspelled or partially named (e.g., "logistice" → "logistic_regression").
# 3. For each recognized node:
#    - Retrieve its default arguments from the mapping.
#    - If the user provided new values for any arguments, update the default arguments accordingly.
# 4. Format a command for each node in the following format:
#    - `"make <node_name> <arg1>=<value1> <arg2>=<value2> ..."`
#    - Skip arguments if they are empty.
# 5. Return your response as a **raw Python list**, like: `['make node1 arg1=val1', 'make node2']`
# 6. If no node matches the query, return an empty list: `[]`
# 7. Do NOT explain or add any extra text. Only output the list.

# Reference Mapping:
# {context}

# User Query:
# {question}
# """
# prompt = ChatPromptTemplate.from_template(template)

In [None]:
# args_chain = (
#     RunnablePassthrough()
#     | (lambda x: combine_inputs(x, mapping_retriever))
#     | prompt
#     | get_llm(model_name="gemini-2.0-flash")
#     | StrOutputParser()
#     | parse_command_list
# )

### Run Everything

In [196]:
question = "I want to create a project that has id of 3, use logistice regression model and use standard scaler"

# RAG output
rag_output = rag_chain.invoke({"question": question})
print("RAG: ",rag_output)

# Extract node names. Only commands that contain an "<args>" placeholder need
# a name, so cap the extraction at that count instead of the total number of
# commands (e.g. 'create_project 3' takes no node arguments); surplus names
# would shift the positional substitution below.
placeholder_count = sum(isinstance(cmd, str) and "<args>" in cmd for cmd in rag_output)
names = extract_keywords_hybrid(question, REFERENCE_KEYWORDS, top_n=placeholder_count)
print("Extracted nodes' names: ",names)

# Default arguments for each extracted node, as compact JSON strings
args_list = args_extractor(names, REFERENCE_KEYWORDS, data_mapping)

# Replace the placeholders; pass a copy so `rag_output` keeps the raw model
# answer and the cell can be re-inspected after running.
final_output = replace_args(list(rag_output), args_list, names, REFERENCE_KEYWORDS)
final_output

RAG:  ['create_project 3', 'make logistic_regression <args>', 'make standard_scaler <args>']
Extracted nodes' names:  ['logistic_regression', 'standard_scaler']


['create_project 3',
 'make logistic_regression {"model_name":"logistic_regression","task":"classification","model_type":"linear_models","params":{"penalty":"l2","C":1.0}}',
 'make standard_scaler {"preprocessor_name":"standard_scaler","preprocessor_type":"scaler","params":{"with_mean":true,"with_std":true}}']