## Package Installs

In [1]:
%%capture
%pip install sentence-transformers
%pip install langchain_community
%pip install langchain_community
%pip install openai
%pip install faiss-cpu
%pip install langchain_openai

from langchain.text_splitter import RecursiveCharacterTextSplitter

### Import Modules

In [13]:
import os
from openai import AzureOpenAI
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA

## Connect to LLM

In [3]:
client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
  api_version="2024-02-01"
)
assert client is not None, "Failed to create AzureOpenAI client"


## File and Database Paths

In [4]:
log_file_path = "C:/Users/kr4193/Desktop/Log_error_reporter/Prep_work/clean_Geiger_for_LLMs.log"
dirty_log_file = "C:/Users/kr4193/Desktop/Log_error_reporter/Prep_work/SMOKE-ZSB-DP12-002.log"
embedding_model = "all-MiniLM-L6-v2"
database_name = "sample_db"

## Data Cleaning

### Generate Chunks and splitting the documents

In [5]:
def generate_chunks(filename):
    data = []
    final_chunk = []

    loader = TextLoader(filename, encoding="utf-8")
    rawdata = loader.load()
    if rawdata:
        text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=50)
        data = text_splitter.split_documents(rawdata)
        if data:
            print(f"generate_chunks - {filename}")
            final_chunk += data
        else:
            print(f"generate_chunks - data is None")
    else:
        print(f"generate_chunks - rawdata is None")

    print(f"{filename} data chunks ready for embedding")

        # Add more conditions for other file types if needed
    print("prepare_data_chunks: finished")
    return final_chunk

### Create and Load the Vector DB

In [6]:
def create_vectordb(filepath, databasename):
    data = generate_chunks(filepath)
    if data:
        print(f"Starting to create {filepath} ...")
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        print(f"embedding : {embeddings}")
        if embeddings:
            vdatabase = FAISS.from_documents(data, embeddings)
            vdatabase.save_local(databasename)
            print(f" vectordatabase {databasename} ready...")
        else:
            print(f"Empty Embeddings")
        return vdatabase
    else:
        print("chunk data received is null, exiting database creation")
        return None

In [7]:
def load_vector_db(embedding_model_name, vector_db_name):
    print(f" Loading {vector_db_name} ...")
    if os.path.exists(vector_db_name):
        print(f" database {vector_db_name} present!!")
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
        if embeddings:
            vector_db = FAISS.load_local(vector_db_name, embeddings, allow_dangerous_deserialization=True)
            print(f" Loading {vector_db_name} Done!!")
        else:
            print(f"Empty Embeddings")
    else:
        print(f" No file path found for {vector_db_name}..")
    return vector_db

### Retriever

## Prompt template

In [8]:
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    You are a helpful assistant. Use the following context to answer the question.
    Context: {context}
    Question: {question}
    """
)

### Chain

In [9]:
def qa_bot():
    vectordb = get_retriever()
    retriever = vectordb.as_retriver(search_kwargs = {"k":10})
    query = "What is the error in the log file?"
    while query != "quit":
        query = input("Enter your query: ")
        output = chain.invoke(query)
        print(output)
    qa_bot()
    


## Main function

In [10]:
# Press the green button in the gutter to run the script.
# if __name__ == '__main__':

# Create vector data base
local_vectordb = create_vectordb(log_file_path, database_name)

# Load existing database using db name
vector_db = load_vector_db(embedding_model, database_name)

# Initialze the retriver with retrieval method
#retriever = jira_vector_db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.1})
retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={"k":2})

# Query the database to get symantical search output
retrieved_output = retriever.invoke("What is this document about?")
print(f"retrieved_output: {retrieved_output}")
print(f"vectordb ready")

Created a chunk of size 6698, which is longer than the specified 1500
Created a chunk of size 4368, which is longer than the specified 1500


generate_chunks - C:/Users/kr4193/Desktop/Log_error_reporter/Prep_work/clean_Geiger_for_LLMs.log
C:/Users/kr4193/Desktop/Log_error_reporter/Prep_work/clean_Geiger_for_LLMs.log data chunks ready for embedding
prepare_data_chunks: finished
Starting to create C:/Users/kr4193/Desktop/Log_error_reporter/Prep_work/clean_Geiger_for_LLMs.log ...
embedding : model_name='all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False
 vectordatabase sample_db ready...
 Loading sample_db ...
 database sample_db present!!
 Loading sample_db Done!!
retrieved_output: [Document(id='f7bcc214-e9b2-46a2-ace5-de49d381b181', metadata={'source': 'C:/Users/kr4193/Desktop/Log_error_reporter/Prep_work/clean_Geiger_for_LLMs.log'}, page_content='Suite SnmpCleanCutterTests: passed\nPassed         : 2\nFailed         : 0\nSkipped        : 0\nError          : 0\nCanceled       : 0\nExpected Fail  : 0'), Document(id='110eb360-3911-4fab-86bc-3528c6c26459', metadata={'sour

client.invoke('tell me a joke')

prompt = ChatPromptTemplate.from_template("tell me a short joke")
chain = prompt  | client | StrOutputParser()

In [11]:
question = "Tell a joke"
response = client.chat.completions.create(
    model="gpt-4o", # model = "deployment_name".
    messages=[
        {"role": "system", "content": "You are the joker the DC villian from batman. You make only evil jokes"},
        {"role": "user", "content": "tell me a joke"},
    ]
)


print(response.choices[0].message.content)

Why so serious? All right, here’s a little joke for you:

Why don’t criminals gamble in Gotham City?

Because the Joker always has the last laugh!


In [None]:
response = client.chat.completions.create(
    model="gpt-4o", # model = "deployment_name".
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "system", "content": "The fromat of tests is as follows Test suites -> Test modules -> Test cases , Test cases are the lowest level of granularity "},
        {"role": "system", "content": "Test Suites are started at the line Entering suites: <suitename> and ended at Suite <suitename> <result>"},
        {"role": "system", "content": "Test models are started at Execute module: <module_name>"},
        {"role": "system", "content": "Test cases are started at the line Running test: <test_name> and ended at line <test_name>: <result>"}, 
        {"role": "system", "content": "Always Ignore all passed test cases and Suites"},
        {"role": "system", "content": "Always Identify and Classify the Failed tests into one of the 3 categories - Product issues, ATF Script Issues & Setup issues"},
        {"role": "system", "content": "Always display for failed test suites only in the follwing format {Suite, Module,\
          Synopsis of the test Failed, test name, Failure category, Reason for categorisation } "},
        {"role": "user", "content": f"Here is the script that was used to run the test {content}"},
        {"role": "user", "content": f"Here is a log entry for analysis:\n{log_content}\n\nQuestion: {question}\n"},
    ]
)


with open("output_context.txt", "w") as output_file:
    output_file.write(response.choices[0].message.content)