### Imports & Installation

In [None]:
# !pip install langchain_google_genai
# !pip install langchain_community
# !pip install langchain_huggingface
# !pip install gradio
# !pip install rapidfuzz
# !pip install pypdf
# !pip install faiss-cpu
# !pip install spacy
# !pip install fuzzywuzzy
# !python -m spacy download en_core_web_sm
# !pip install python-Levenshtein
# !pip install django
# !pip install django-cors-headers
# !pip install djangorestframework
# !pip install drf_spectacular
# !pip install psycopg2
# !python.exe -m pip install --upgrade pip


In [1]:
import json, os, re, ast
import subprocess
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough
from langchain.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import gradio as gr
from sentence_transformers import SentenceTransformer, util
import spacy
from rapidfuzz import fuzz
pdf_file = "./res/Cli script guidebook.pdf"


  from .autonotebook import tqdm as notebook_tqdm





### Load Model

In [2]:
def get_llm(model_name="gemini-1.5-pro", temperature=0.1, max_output_tokens=500):
    """Build a Gemini chat model client.

    Args:
        model_name: Gemini model identifier forwarded to ChatGoogleGenerativeAI.
        temperature: Sampling temperature (defaults to the previous fixed 0.1).
        max_output_tokens: Cap on generated tokens (defaults to the previous
            fixed 500).

    Returns:
        A ChatGoogleGenerativeAI instance configured with the values above and
        the API key read from the GOOGLE_API_KEY environment variable.
    """
    return ChatGoogleGenerativeAI(
        model=model_name,
        temperature=temperature,
        max_output_tokens=max_output_tokens,
        # .get() yields None when the variable is unset; the key is expected to
        # have been exported before this function is called.
        google_api_key=os.environ.get("GOOGLE_API_KEY"),
    )

#### Load API key from the credentials file into the environment

In [3]:
# Read the Google API key from a local JSON credentials file and export it as
# an environment variable so get_llm() can pick it up.
api_key_path = './credentials/google_credentials.json'
if not os.path.exists(api_key_path):
    raise FileNotFoundError(f"API key file not found at {api_key_path}. Please provide the correct path.")

# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open(api_key_path, 'r', encoding='utf-8') as f:
    api_key = json.load(f)

# Fail with a clear message instead of the opaque TypeError that os.environ
# raises when it is assigned None.
google_api_key = api_key.get("GOOGLE_API_KEY")
if not google_api_key:
    raise KeyError(f"'GOOGLE_API_KEY' is missing or empty in {api_key_path}.")
os.environ['GOOGLE_API_KEY'] = google_api_key

### Load & Prepare Data

In [4]:
# Split the CLI guidebook PDF into retrievable chunks.
loader = PyPDFLoader(pdf_file)
pdf_docs = loader.load_and_split()

# Node name -> default arguments mapping. Read explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
with open("res/data_mapping.json", "r", encoding="utf-8") as f:
    data_mapping = json.load(f)

# Top-level keys of the mapping are the known node names.
REFERENCE_KEYWORDS = list(data_mapping.keys())
# Expose the whole mapping to the retriever as one extra document.
data_mapping_doc = Document(page_content='Default Arguments for each node:\n'+ json.dumps(data_mapping))

all_docs = pdf_docs  + [data_mapping_doc]


### Create Vector Space

In [5]:
# Embed every document with the MiniLM sentence-transformer, index the vectors
# in FAISS, and expose the index through the retriever interface.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(all_docs, embeddings)
retriever = vectorstore.as_retriever()

### RAG Chain

#### Helper Functions

In [5]:
def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line."""
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

def combine_inputs(input_dict, retriever, cur_iter=0):
    """Build the prompt variables for one RAG call.

    Args:
        input_dict: Mapping that must contain a "question" entry.
        retriever: Object used to fetch documents relevant to the question.
        cur_iter: Iteration counter forwarded unchanged to the prompt.

    Returns:
        A dict with "context" (the retrieved documents joined by
        format_docs), "question" and "cur_iter".

    Raises:
        KeyError: if input_dict has no "question" key.
    """
    question = input_dict["question"]
    # `get_relevant_documents` is deprecated in LangChain in favour of the
    # Runnable `invoke` API; prefer `invoke` and only fall back to the old
    # method for retrievers that do not provide it.
    retrieve = getattr(retriever, "invoke", None) or retriever.get_relevant_documents
    retrieved_docs = retrieve(question)
    return {
        "context": format_docs(retrieved_docs),
        "question": question,
        "cur_iter": cur_iter
    }

def parse_command_list(output: str):
    """Extract a Python list literal of commands from raw model output.

    The model may wrap the list in code fences or surrounding text, and a
    command's JSON arguments may themselves contain brackets. Candidates are
    therefore tried from widest to narrowest:

    1. the span from the first "[" to the last "]" (keeps nested brackets),
    2. the shortest "[...]" match (ignores trailing text that contains "]"),
    3. the stripped output itself.

    Args:
        output: Raw text produced by the language model.

    Returns:
        The parsed list. A parsed value that is not a list is wrapped in a
        one-element list. If nothing parses, a one-element list containing
        "Failed to parse: <error>" is returned.
    """
    text = output.strip()
    candidates = []

    start, end = text.find("["), text.rfind("]")
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])

    shortest = re.search(r"\[(.*?)\]", text, re.DOTALL)
    if shortest and shortest.group(0) not in candidates:
        candidates.append(shortest.group(0))

    if text not in candidates:
        candidates.append(text)

    first_error = None
    for candidate in candidates:
        try:
            command_list = ast.literal_eval(candidate)
        except Exception as e:
            # Remember the error from the widest candidate; it is the most
            # informative one to report.
            if first_error is None:
                first_error = e
            continue
        return command_list if isinstance(command_list, list) else [command_list]

    return [f"Failed to parse: {first_error}"]


#### LangChain

In [None]:

# Prompt for single-shot command extraction. The model receives the retrieved
# reference text ({context}) and the user's request ({question}) and is asked
# to answer with a raw Python list of CLI command strings.
# Doubled braces ("{{" / "}}") are escapes for literal braces in the rendered
# prompt; {context} and {question} are the only template variables.
# NOTE(review): the prompt text contains a typo ("folliwing") and its "json
# format" example uses single quotes. Left untouched here because editing the
# prompt can change model output — confirm before fixing.
template1 = """You are an expert assistant trained to extract exact command-line instructions from a user guide.

You are given the folliwing information:
- A user guide that contains command-line instructions for a CLI tool.
- all possible values for node_name
- mapping of node names to their default <args>.

Reference Information:
{context}

User Query:
{question}

Instructions:
- Carefully read the Reference Information, node names, and default arguments for each node.
- If user didn't provide any value for the arguments, use the default values from the mapping using this.
- Return the CLI command(s) that are relevant to the user's query.
- Format your response as a raw Python list: ['command1', 'command2', ...]
- Do NOT include any explanations, comments, or formatting outside the list.
- If no command matches the query, return an empty list: []
- If user provided arguments make it in json format. e.g. {{'arg1': 'value1', 'arg2': 'value2'}}
- The output MUST be ordered in the same order as the input.

Example Query:
I want to make a logistic regression model and make a project with id 3.

Expected Response:
[
    'make logistic_regression {{}}',
    'create_project 3'
]
"""

# Compile the text into a chat prompt usable as a chain stage.
prompt1 = ChatPromptTemplate.from_template(template1)

# Default end-to-end chain: take a {"question": ...} dict, attach retrieved
# context, render prompt1, query Gemini, and parse the reply into a list of
# command strings.
rag_chain = (
    RunnablePassthrough()
    | (lambda x: combine_inputs(x, retriever, cur_iter=0))  # uses the module-level retriever
    | prompt1
    | get_llm(model_name="gemini-2.0-flash")
    | StrOutputParser()
    | parse_command_list
)

### Post-Processing

In [54]:
# Embedding models are expensive to load; keep one instance per model name so
# repeated RAG_pipeline calls do not reload the model each time.
_EMBEDDINGS_CACHE = {}


def _get_embeddings(model_name="all-MiniLM-L6-v2"):
    """Return a cached HuggingFaceEmbeddings instance for `model_name`."""
    if model_name not in _EMBEDDINGS_CACHE:
        _EMBEDDINGS_CACHE[model_name] = HuggingFaceEmbeddings(model_name=model_name)
    return _EMBEDDINGS_CACHE[model_name]


def RAG_pipeline(documents, prompt, question, cur_iter, model_name="gemini-2.0-flash"):
    """Run one retrieval-augmented generation call over `documents`.

    Args:
        documents: Documents to index and retrieve from.
        prompt: Chat prompt template; it is fed the "context", "question"
            and "cur_iter" values produced by combine_inputs.
        question: The user's request.
        cur_iter: Iteration counter made available to the prompt.
        model_name: Gemini model identifier passed to get_llm.

    Returns:
        The model's reply as a raw string. It is NOT parsed; callers that
        need a list pass it through parse_command_list themselves.
    """
    model = get_llm(model_name=model_name)
    # The FAISS index is rebuilt on every call because callers may pass a
    # different (or mutated) document list each time; only the embedding model
    # is reused.
    retriever = FAISS.from_documents(documents=documents, embedding=_get_embeddings()).as_retriever()
    chain = (
        RunnablePassthrough()
        | (lambda x: combine_inputs(x, retriever, cur_iter))
        | prompt
        | model
        | StrOutputParser()
    )
    return chain.invoke({"question": question, "cur_iter": cur_iter})

In [55]:
# Smoke test: run the single-shot prompt over the full corpus and show the raw reply.
question = "I want to create a project that has id of 3, use logistice regression model and use standard scaler"
raw_reply = RAG_pipeline(all_docs, prompt1, question, cur_iter=0, model_name="gemini-2.0-flash")
print(raw_reply)

```python
[
    'create_project 3',
    'make logistic_regression {"model_name":"logistic_regression","model_type":"linear_models","task":"classification"}',
    'make standard_scaler {}'
]
```


In [14]:
# Extend the corpus with the per-pipeline step lists, placed ahead of the
# guidebook chunks and the default-argument document.
file1 = './res/steps.pdf'
file2 = pdf_file
loader = PyPDFLoader(file1)
steps_docs = loader.load_and_split()
new = [*steps_docs, *pdf_docs, data_mapping_doc]
# Show the first chunk to confirm the steps file was parsed.
print(new[0].page_content)

Steps for each pipeline 
 
Model fitting with a dataset 
- make <model_name> <args>  
- make data_loader <args>  
- show data_loader <data_loader_id> 1  
- show data_loader <data_loader_id> 2  
- make model_fitter {“model”:<model_id>,”X”:data_loader_X_id,”y”:data_loader_y_id}


In [15]:
def extract_id_message(json_str):
    """Reduce an API response to its "message" and "node_id" fields.

    The response text is parsed as JSON first and, failing that, as a Python
    literal (the form produced by `str(dict)`). It is never executed: the
    text may come from an external process or from user input, so `eval`
    must not be used on it.

    Args:
        json_str: Textual API response.

    Returns:
        The string form of {"message": ..., "node_id": ...} when the input
        parses to a dict (absent keys become None); otherwise the input is
        returned unchanged.
    """
    parsed = None
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(json_str)
            break
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not valid for this parser; try the next one.
            continue

    if not isinstance(parsed, dict):
        return json_str

    summary = {"message": parsed.get("message"), "node_id": parsed.get("node_id")}
    return str(summary)
# Sanity check on a sample API response: only `message` and `node_id` should
# remain in the result.
extract_id_message(str({"node_id": 2889729083600,"node_name": "data_loader_y","message": "data loaded: iris: y","params": {},"task": "load_data","node_type": "loader","children": [],"location_x": 0.0,"location_y": 0.0,"input_ports": [],"output_ports": [],"project": 1,"node_data": "C:\\Users\\a1mme\\OneDrive\\Desktop\\MO\\test_grad\\project\\core\\saved\\other\\data_loader_2889729083600.pkl"}))

"{'message': 'data loaded: iris: y', 'node_id': 2889729083600}"

In [16]:
# Put the parent of the current working directory on sys.path before importing
# from `cli`. os.path.dirname('app.py') is '', so the joined path is just '..'.
# NOTE(review): presumably the `cli` package lives in that parent directory —
# confirm; this depends on where the notebook kernel is started.
import os, sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('app.py'), '..')))
from cli.call_cli import call_script

In [None]:

# Prompt for the step-by-step mode. Besides the retrieved reference text
# ({context}) and the user's request ({question}) it receives the current
# iteration number ({cur_iter}) and asks for only the command of that step.
# Doubled braces ("{{" / "}}") are escapes for literal braces in the rendered
# prompt.
# NOTE(review): the instructions ask for a single command formatted as a
# string, while the example response is a full list; the example query names
# the diabetes dataset but the example command loads "iris"; the last example
# line nests single quotes. Left untouched because editing the prompt can
# change model output — confirm intent before fixing.
template2 = """You are an expert assistant who takes input from user and has a guide book which helps you, also you have
a response from the api that is stored into your corpus. 

You are given the following information in your corpus:
- steps to follow (follow them one by one).
- A user guide that contains command-line instructions for a CLI tool.
- A response from api, this response is given in your reference information.
- A mapping of node names to their default arguments.
- and the current iteration number.

Reference Information:
{context}

User Query:
{question}

Current Iteration:
{cur_iter}

Instructions:
- Carefully read the Reference Information, user query.
- Follow the steps that is given to you in the reference information one by one.
- take the response from API (which is also stored in your corpus) to fill in the values that needs to be filled e.g. <model_id>.
- If user didn't provide any value for the arguments, use the default values from the mapping using this.
- Return the CLI command(s) that are relevant to the user's query.
- Show only the current command. You can know that by the current iteration number.
- Format the output as string.
- Do NOT include any explanations, comments, or formatting outside the string.
- Monitor the message of the api response and determine whether there is an error or not.

Note The Following: 
- in example '$' means that it is an extracted value from your corpus.
- There are some nodes that has multiple output, so to specify the output channel, use the number of the channel. e.g. 'show data_loader $data_loader_id 1' means that you are showing the first channel of data loader node which is X,

example query:
I want to fit knn on diabetes dataset

Expected Response:
[
    'make knn_classifier {{"model_name":"knn_classifier","task":"classification","model_type":"knn","params":{{"n_neighbors": 5}}}}',
    'make data_loader {{"params":{{"dataset_name": "iris"}}}}',
    'show data_loader data_loader_id 1',
    'show data_loader data_loader_id 2',
    'make model_fitter {{'model': model_id, 'X': X_id, 'y': y_id}}'
]
"""

# Compile the text into a chat prompt usable as a chain stage.
prompt2 = ChatPromptTemplate.from_template(template2)

In [None]:

# Interactive driver with two modes:
#   manual (1): every user prompt is turned into CLI commands that are executed
#               immediately through call_script.
#   auto   (0): one user request is asked up front; each iteration asks the
#               model for the next command, feeding back the previous API
#               response through the scratch document at the end of `docs`.
# question = "I want to fit logistic regression model on iris dataset"
api_response = ' '      # non-empty sentinel so the first automatic iteration is not treated as "no response"
docs = new.copy()
docs += [Document(page_content='')]     # scratch document that accumulates API responses (automatic mode)
i = 1
manual_mode = 1 if input("Select Mode 1 for manual, 0 for auto: ") == '1' else 0
question = input("User Query: ") if not manual_mode else ''      # One question
while True:
    try:
        if manual_mode:
            # Manual mode: the chatbot receives prompts from the user and
            # interacts with the CLI. Type "exit" or "quit" to stop.
            question = input("User Query: ")
            if question.strip().lower() in ("exit", "quit"):
                break

            rag_out = RAG_pipeline(all_docs, prompt1, question, cur_iter=0, model_name="gemini-2.0-flash")
            rag_out = parse_command_list(rag_out)
            print("RAG Output : \n"+str(rag_out))

            for a, out in enumerate(rag_out):
                # parse_command_list reports a parsing failure as a
                # "Failed to parse: ..." entry; that text is a diagnostic and
                # must never be executed as a CLI command.
                if isinstance(out, str) and out.startswith("Failed to parse:"):
                    print(f"Skipping command {a}: model output could not be parsed.")
                    continue
                api_response = call_script(out)
                print(f"API Output {a}: \n"+str(api_response))
            print('-'*50)

        else:
            # Automatic mode: the user only states the goal and the chatbot
            # works through the steps. An empty API response ends the loop.
            if not api_response:
                break
            docs[-1].page_content += '\n' + str(api_response)
            rag_out = RAG_pipeline(docs, prompt2, question, cur_iter=i, model_name="gemini-2.0-flash")
            # The model is asked for a bare string; drop any quotes it adds.
            rag_out = rag_out.strip().strip("'").strip('"')

            print(f"RAG Output {i}: ",rag_out)
            # api_response = extract_id_message(call_script(rag_out))       # UNCOMMENT to make it interact with cli
            api_response = extract_id_message(input("API Response: "))      # COMMENT this part, it is made for debugging.
            print(f"API Output {i}: ",api_response)
            print('-'*50)
            i +=1
    except (KeyboardInterrupt, EOFError):
        # Interrupting the kernel or closing stdin ends the session cleanly.
        break

RAG Output : 
['make logistic_regression {"model_name": "logistic_regression", "task": "classification", "model_type": "linear_models", "params": {"penalty": "l2", "C": 1.0}}']
Subprocess completed successfully.
API Output 0: 
{'node_id': 1809860798528, 'node_name': 'logistic_regression', 'message': 'Model logistic_regression created', 'node_data': 'C:\\Users\\a1mme\\OneDrive\\Desktop\\MO\\test_grad\\project\\core\\saved\\model\\logistic_regression_1809860798528.pkl', 'params': {'C': 1.0, 'penalty': 'l2'}, 'task': 'classification', 'node_type': 'linear_models', 'project_id': 1, 'children': [], 'location_x': 0.0, 'location_y': 0.0, 'input_ports': [], 'output_ports': []}
--------------------------------------------------


### Run Everything

In [None]:
# End-to-end run of the default chain; the result is already a parsed list.
question = "I want to create a project that has id of 3, use logistice regression model and use standard scaler"
# RAG output
rag_input = {"question": question}
rag_output = rag_chain.invoke(rag_input)
print("RAG: ", rag_output)

RAG:  ['create_project 3', 'make logistic_regression <args>', 'make standard_scaler <args>']
Extracted nodes' names:  ['logistic_regression', 'standard_scaler']


['create_project 3',
 'make logistic_regression {"model_name":"logistic_regression","task":"classification","model_type":"linear_models","params":{"penalty":"l2","C":1.0}}',
 'make standard_scaler {"preprocessor_name":"standard_scaler","preprocessor_type":"scaler","params":{"with_mean":true,"with_std":true}}']