<a href="https://colab.research.google.com/github/Jathin4/AI-Enhanced-IT-Helpdesk-with-Semantic-Retrieval-and-Query-Refinement/blob/main/AI_Enhanced_IT_Helpdesk_with_Semantic_Retrieval_and_Query_Refinement_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas numpy sentence-transformers langchain faiss-cpu groq openai
!pip install langchain-community
!pip install docx2txt python-docx
!pip install --upgrade langchain

from langchain.schema import Document

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting groq
  Downloading groq-0.24.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transfor

In [2]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from groq import Groq
import docx2txt
from docx import Document
import io

print("Current working directory:", os.getcwd())

# Function to extract text and Excel data from a single Word document
def extract_data_from_doc(file_path):
    print(f"Extracting data from: {file_path}")
    # Extract text content
    text_content = docx2txt.process(file_path)

    # Parse text content to extract topics, descriptions, and original index
    text_data = []
    lines = text_content.split('\n')
    current_topic = ""
    current_description = ""
    current_index = None
    for line in lines:
        if line.strip().isdigit():  # Assuming the original index is a number (S.No)
            current_index = line.strip()
        elif line.strip().isupper():  # Assuming topics are in uppercase
            if current_topic and current_description:
                text_data.append({"S.No": current_index, "Topic": current_topic, "Description": current_description.strip()})
            current_topic = line.strip()
            current_description = ""
        else:
            current_description += line + " "
    if current_topic and current_description:
        text_data.append({"S.No": current_index, "Topic": current_topic, "Description": current_description.strip()})

    # Extract Excel data (if any)
    doc = Document(file_path)
    excel_data = []
    for table in doc.tables:
        headers = [cell.text for cell in table.rows[0].cells]
        for row in table.rows[1:]:
            row_data = {headers[i]: cell.text for i, cell in enumerate(row.cells)}
            excel_data.append(row_data)

    # Combine text and Excel data
    combined_data = text_data + excel_data
    print(f"Extracted {len(combined_data)} entries from the document")
    return pd.DataFrame(combined_data)

# Load data from the document
file_path = '/content/Dataset of all topic.docx'  # Correct file path to the uploaded file
try:
    combined_df = extract_data_from_doc(file_path)
    print("Data loaded successfully.")
    print(f"Number of rows: {len(combined_df)}")
    print(f"Number of unique topics: {combined_df['Topic'].nunique()}")
    print("First few rows:")
    print(combined_df.head())
except FileNotFoundError:
    print(f"File not found: {file_path}")
    print("Please make sure the file is in the correct location and you have the necessary permissions.")
    exit(1)
except Exception as e:
    print(f"An error occurred while loading the file: {e}")
    exit(1)

# Load a pre-trained sentence transformer model for embedding
print("Loading embedding model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings_model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

# Create a DataFrameLoader
print("Creating DataFrameLoader...")
loader = DataFrameLoader(combined_df, page_content_column='Description')

# Semantic Chunking - Split documents by paragraphs or larger semantic units
print("Splitting documents semantically...")

# Using RecursiveCharacterTextSplitter to create semantic chunks (with overlap)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Larger chunk size
    chunk_overlap=100,  # Overlap between chunks
    separators=["\n\n", "\n", " "],  # Splitting first by paragraphs, then by lines, then by spaces
)

# Load documents and split them using the semantic chunking approach
print("Loading and splitting documents...")
documents = loader.load()
documents = splitter.split_documents(documents)

# Add the metadata, ensuring the original index (S.No) is included
j = 0  # Initialize a separate counter for dataframe
for i, doc in enumerate(documents):
    if 'S.No' in combined_df.columns and j < len(combined_df):
        doc.metadata['S.No'] = combined_df.iloc[j]['S.No']
        j += 1  # Increment the counter for the dataframe

print(f"Total documents after splitting: {len(documents)}")

# Create FAISS vector store
print("Creating FAISS vector store...")
vector_store = FAISS.from_documents(documents, embeddings_model)
print("Vector store created successfully")

# Initialize Groq client
client = Groq(api_key="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")  # Replace with your actual API key

# Function to use Groq API to refine the query
def refine_query_with_groq(query_text):
    print("Refining query...")
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an AI assistant that refines user queries about computer networking topics and IT problems to improve search accuracy. Provide a concise, refined version of the user's query.",
                },
                {
                    "role": "user",
                    "content": f"Refine the following query about computer networking or IT problems for better search accuracy: {query_text}",
                }
            ],
            model="llama3-8b-8192",
            max_tokens=100,
        )
        refined_query = chat_completion.choices[0].message.content.strip()
        print(f"Query refined: {refined_query}")
        return refined_query
    except Exception as e:
        print(f"Error during query refinement: {e}")
        return query_text  # Return the original query in case of error

# Function to use Groq API and combine retrieval and generation (RAG)
def generate_answer_with_rag(refined_query_text, retrieval_context, sources):
    print("Generating answer...")
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an AI assistant specializing in computer networking and IT problem-solving. Generate detailed answers to user queries based on the retrieved context. Include information about the OSI layer, algorithms involved, and a detailed solution if applicable.",
                },
                {
                    "role": "user",
                    "content": f"Using the following retrieved context about computer networking topics and IT problems, provide a detailed answer to the query. Include the description, original index, problem, cause, solution, type, OSI layer, and algorithm if available. Then generate a detailed solution:\n\nContext: {retrieval_context}\n\nQuery: {refined_query_text}",
                }
            ],
            model="llama3-8b-8192",
            max_tokens=1000,
        )
        answer = chat_completion.choices[0].message.content.strip()
        print("Answer generated successfully")

        # Append sources to the answer
        sourced_answer = f"{answer}\n\nSources:\n" + "\n".join(sources)
        return sourced_answer
    except Exception as e:
        print(f"Error during answer generation: {e}")
        return "Sorry, I couldn't generate an answer."  # Return a default message in case of error

# Main execution
if __name__ == "__main__":
    query_text = input("Enter your query about computer networking or IT problems: ")

    print("Processing query...")
    refined_query_text = refine_query_with_groq(query_text)
    print(f"Refined query: {refined_query_text}")

    print("Searching for relevant information...")
    retrieved_documents = vector_store.similarity_search(refined_query_text, k=3)

    print("Preparing context for answer generation...")
    retrieval_context = ""
    sources = []  # List to hold sources for output
    for i, doc in enumerate(retrieved_documents):
        print(f"Document {i+1}:")
        print(f"  Topic: {doc.metadata.get('Topic', 'Unknown')}")
        print(f"  Description: {doc.page_content[:100]}...")  # Print first 100 chars
        print(f"  Original Index: {doc.metadata.get('S.No', 'Unknown')}")  # Ensure original index is printed
        retrieval_context += f"Topic: {doc.metadata.get('Topic', 'Unknown')}\n"
        retrieval_context += f"Description: {doc.page_content}\n"
        retrieval_context += f"Original Index: {doc.metadata.get('S.No', 'Unknown')}\n"
        for key, value in doc.metadata.items():
            if key != 'Topic':
                retrieval_context += f"{key}: {value}\n"
        retrieval_context += "\n"
        # Add document source information to sources list
        sources.append(f"Document {i+1} - Topic: {doc.metadata.get('Topic', 'Unknown')}, Original Index: {doc.metadata.get('S.No', 'Unknown')}")

    print("Generating final answer...")
    final_answer = generate_answer_with_rag(refined_query_text, retrieval_context, sources)
    print(f"Final Answer:\n{final_answer}")


Current working directory: /content
Extracting data from: /content/Dataset of all topic.docx
Extracted 3 entries from the document
Data loaded successfully.
Number of rows: 3
Number of unique topics: 3
First few rows:
   S.No            Topic                                        Description
0  None  1.LOAD BALANCER  A load balancer is a device or software that d...
1  None        2.ROUTERS  A router is a networking device that forwards ...
2  None       3.FIREWALL  A firewall is a network security device or sof...
Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  embeddings_model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')


Creating DataFrameLoader...
Splitting documents semantically...
Loading and splitting documents...
Total documents after splitting: 7
Creating FAISS vector store...
Vector store created successfully
Enter your query about computer networking or IT problems: how to hack my router 
Processing query...
Refining query...
Query refined: I cannot provide information or guidance on illegal or harmful activities. Instead, I suggest refining your query to focus on legal and ethical ways to manage and secure your router. Here's a refined version of your query:

"How to secure my router and improve its settings for optimal performance and security."
or
"How to configure my router's settings for better Wi-Fi coverage and password protection."
or
"How to troubleshoot common router issues and optimize its performance."
Refined query: I cannot provide information or guidance on illegal or harmful activities. Instead, I suggest refining your query to focus on legal and ethical ways to manage and secur

In [3]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from groq import Groq
import docx2txt
from docx import Document
import io
import re
import urllib.parse

print("Current working directory:", os.getcwd())

# Function to extract text and Excel data from a single Word document
def extract_data_from_doc(file_path):
    print(f"Extracting data from: {file_path}")
    # Extract text content
    text_content = docx2txt.process(file_path)

    # Parse text content to extract topics, descriptions, and original index
    text_data = []
    lines = text_content.split('\n')
    current_topic = ""
    current_description = ""
    current_index = None
    for line in lines:
        if line.strip().isdigit():  # Assuming the original index is a number (S.No)
            current_index = line.strip()
        elif line.strip().isupper():  # Assuming topics are in uppercase
            if current_topic and current_description:
                text_data.append({"S.No": current_index, "Topic": current_topic, "Description": current_description.strip()})
            current_topic = line.strip()
            current_description = ""
        else:
            current_description += line + " "
    if current_topic and current_description:
        text_data.append({"S.No": current_index, "Topic": current_topic, "Description": current_description.strip()})

    # Extract Excel data (if any)
    doc = Document(file_path)
    excel_data = []
    for table in doc.tables:
        headers = [cell.text for cell in table.rows[0].cells]
        for row in table.rows[1:]:
            row_data = {headers[i]: cell.text for i, cell in enumerate(row.cells)}
            excel_data.append(row_data)

    # Combine text and Excel data
    combined_data = text_data + excel_data
    print(f"Extracted {len(combined_data)} entries from the document")
    return pd.DataFrame(combined_data)

# Load data from the document
file_path = '/content/Dataset of all topic.docx'  # Correct file path to the uploaded file
try:
    combined_df = extract_data_from_doc(file_path)
    print("Data loaded successfully.")
    print(f"Number of rows: {len(combined_df)}")
    print(f"Number of unique topics: {combined_df['Topic'].nunique()}")
    print("First few rows:")
    print(combined_df.head())
except FileNotFoundError:
    print(f"File not found: {file_path}")
    print("Please make sure the file is in the correct location and you have the necessary permissions.")
    exit(1)
except Exception as e:
    print(f"An error occurred while loading the file: {e}")
    exit(1)

# Load a pre-trained sentence transformer model for embedding
print("Loading embedding model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings_model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

# Create a DataFrameLoader
print("Creating DataFrameLoader...")
loader = DataFrameLoader(combined_df, page_content_column='Description')

# Semantic Chunking - Split documents by paragraphs or larger semantic units
print("Splitting documents semantically...")

# Using RecursiveCharacterTextSplitter to create semantic chunks (with overlap)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Larger chunk size
    chunk_overlap=100,  # Overlap between chunks
    separators=["\n\n", "\n", " "],  # Splitting first by paragraphs, then by lines, then by spaces
)

# Load documents and split them using the semantic chunking approach
print("Loading and splitting documents...")
documents = loader.load()
documents = splitter.split_documents(documents)

# Add the metadata, ensuring the original index (S.No) is included
j = 0  # Initialize a separate counter for dataframe
for i, doc in enumerate(documents):
    if 'S.No' in combined_df.columns and j < len(combined_df):
        doc.metadata['S.No'] = combined_df.iloc[j]['S.No']
        j += 1  # Increment the counter for the dataframe

print(f"Total documents after splitting: {len(documents)}")

# Create FAISS vector store
print("Creating FAISS vector store...")
vector_store = FAISS.from_documents(documents, embeddings_model)
print("Vector store created successfully")

# Initialize Groq client
client = Groq(api_key="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")  # Replace with your actual API key

# Function to use Groq API to refine the query
def refine_query_with_groq(query_text):
    print("Refining query...")
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an AI assistant that refines user queries about computer networking topics and IT problems to improve search accuracy. Provide a concise, refined version of the user's query.",
                },
                {
                    "role": "user",
                    "content": f"Refine the following query about computer networking or IT problems for better search accuracy: {query_text}",
                }
            ],
            model="llama3-8b-8192",
            max_tokens=100,
        )
        refined_query = chat_completion.choices[0].message.content.strip()
        print(f"Query refined: {refined_query}")
        return refined_query
    except Exception as e:
        print(f"Error during query refinement: {e}")
        return query_text  # Return the original query in case of error

# Function to use Groq API and combine retrieval and generation (RAG)
def generate_answer_with_rag(refined_query_text, retrieval_context):
    print("Generating answer...")
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an AI assistant specializing in computer networking and IT problem-solving. Generate detailed answers to user queries based on the retrieved context. Include information about the OSI layer, algorithms involved, and a detailed solution if applicable.",
                },
                {
                    "role": "user",
                    "content": f"Using the following retrieved context about computer networking topics and IT problems, provide a detailed answer to the query. Include the description, original index, problem, cause, solution, type, OSI layer, and algorithm if available. Then generate a detailed solution:\n\nContext: {retrieval_context}\n\nQuery: {refined_query_text}",
                }
            ],
            model="llama3-8b-8192",
            max_tokens=1000,
        )
        answer = chat_completion.choices[0].message.content.strip()
        print("Answer generated successfully")

        # Generate query-specific links
        networking_expertise_link = generate_query_link("https://www.networkingexpertise.com/search?q=", refined_query_text)
        pearson_link = generate_query_link("https://ptgmedia.pearsoncmg.com/images/9780789759818/samplepages/9780789759818_Sample.pdf#search=", refined_query_text)

        # Add modified source attribution with query-specific links
        sourced_answer = f"{answer}\n\nSources:\n- LLM Model: Llama 3 (8B parameters)\n- Websites:\n  1. [Networking Expertise]({networking_expertise_link})\n  2. [Pearson Sample]({pearson_link})"
        return sourced_answer
    except Exception as e:
        print(f"Error during answer generation: {e}")
        return "Sorry, I couldn't generate an answer."  # Return a default message in case of error

def generate_query_link(base_url, query):
    # Remove special characters and spaces from the query
    cleaned_query = re.sub(r'[^\w\s]', '', query).replace(' ', '+')
    return f"{base_url}{urllib.parse.quote(cleaned_query)}"

# Main execution
if __name__ == "__main__":
    query_text = input("Enter your query about computer networking or IT problems: ")

    print("Processing query...")
    refined_query_text = refine_query_with_groq(query_text)
    print(f"Refined query: {refined_query_text}")

    print("Searching for relevant information...")
    retrieved_documents = vector_store.similarity_search(refined_query_text, k=3)

    print("Preparing context for answer generation...")
    retrieval_context = ""
    for i, doc in enumerate(retrieved_documents):
        print(f"Document {i+1}:")
        print(f"  Topic: {doc.metadata.get('Topic', 'Unknown')}")
        print(f"  Description: {doc.page_content[:100]}...")  # Print first 100 chars
        print(f"  Original Index: {doc.metadata.get('S.No', 'Unknown')}")  # Ensure original index is printed
        retrieval_context += f"Topic: {doc.metadata.get('Topic', 'Unknown')}\n"
        retrieval_context += f"Description: {doc.page_content}\n"
        retrieval_context += f"Original Index: {doc.metadata.get('S.No', 'Unknown')}\n"
        for key, value in doc.metadata.items():
            if key != 'Topic':
                retrieval_context += f"{key}: {value}\n"
        retrieval_context += "\n"

    print("Generating final answer...")
    final_answer = generate_answer_with_rag(refined_query_text, retrieval_context)
    print(f"Final Answer:\n{final_answer}")

Current working directory: /content
Extracting data from: /content/Dataset of all topic.docx
Extracted 3 entries from the document
Data loaded successfully.
Number of rows: 3
Number of unique topics: 3
First few rows:
   S.No            Topic                                        Description
0  None  1.LOAD BALANCER  A load balancer is a device or software that d...
1  None        2.ROUTERS  A router is a networking device that forwards ...
2  None       3.FIREWALL  A firewall is a network security device or sof...
Loading embedding model...
Creating DataFrameLoader...
Splitting documents semantically...
Loading and splitting documents...
Total documents after splitting: 7
Creating FAISS vector store...
Vector store created successfully
Enter your query about computer networking or IT problems: how to hack my router 
Processing query...
Refining query...
Query refined: I cannot provide information or guidance on illegal or harmful activities, including hacking. 

Is there anything els

In [4]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from groq import Groq
import docx2txt
from docx import Document
import io
import re
import urllib.parse

print("Current working directory:", os.getcwd())

# Custom guardrail for validating file extension
def validate_file_extension(file_path, valid_extensions=['.docx']):
    if not any(file_path.endswith(ext) for ext in valid_extensions):
        raise ValueError(f"Invalid file format: {file_path}. Expected formats: {', '.join(valid_extensions)}.")

# Function to extract text and Excel data from a single Word document
def extract_data_from_doc(file_path):
    validate_file_extension(file_path)
    print(f"Extracting data from: {file_path}")

    # Extract text content
    try:
        text_content = docx2txt.process(file_path)
    except Exception as e:
        print(f"Error extracting text from {file_path}: {e}")
        return pd.DataFrame()  # Return empty DataFrame on failure

    # Parse text content to extract topics, descriptions, and original index
    text_data = []
    lines = text_content.split('\n')
    current_topic = ""
    current_description = ""
    current_index = None
    for line in lines:
        if line.strip().isdigit():  # Assuming the original index is a number (S.No)
            current_index = line.strip()
        elif line.strip().isupper():  # Assuming topics are in uppercase
            if current_topic and current_description:
                text_data.append({"S.No": current_index, "Topic": current_topic, "Description": current_description.strip()})
            current_topic = line.strip()
            current_description = ""
        else:
            current_description += line + " "
    if current_topic and current_description:
        text_data.append({"S.No": current_index, "Topic": current_topic, "Description": current_description.strip()})

    # Extract Excel data (if any)
    doc = Document(file_path)
    excel_data = []
    for table in doc.tables:
        headers = [cell.text for cell in table.rows[0].cells]
        for row in table.rows[1:]:
            row_data = {headers[i]: cell.text for i, cell in enumerate(row.cells)}
            excel_data.append(row_data)

    # Combine text and Excel data
    combined_data = text_data + excel_data
    print(f"Extracted {len(combined_data)} entries from the document")
    return pd.DataFrame(combined_data)

# Load data from the document
file_path = '/content/Dataset of all topic.docx'  # Correct file path to the uploaded file
try:
    combined_df = extract_data_from_doc(file_path)
    if combined_df.empty:
        raise ValueError("No data extracted from document.")
    print("Data loaded successfully.")
    print(f"Number of rows: {len(combined_df)}")
    print(f"Number of unique topics: {combined_df['Topic'].nunique()}")
    print("First few rows:")
    print(combined_df.head())
except FileNotFoundError:
    print(f"File not found: {file_path}")
    print("Please make sure the file is in the correct location and you have the necessary permissions.")
    exit(1)
except Exception as e:
    print(f"An error occurred while loading the file: {e}")
    exit(1)

# Load a pre-trained sentence transformer model for embedding
print("Loading embedding model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings_model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

# Create a DataFrameLoader
print("Creating DataFrameLoader...")
loader = DataFrameLoader(combined_df, page_content_column='Description')

# Semantic Chunking - Split documents by paragraphs or larger semantic units
print("Splitting documents semantically...")

# Using RecursiveCharacterTextSplitter to create semantic chunks (with overlap)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Larger chunk size
    chunk_overlap=100,  # Overlap between chunks
    separators=["\n\n", "\n", " "],  # Splitting first by paragraphs, then by lines, then by spaces
)

# Load documents and split them using the semantic chunking approach
print("Loading and splitting documents...")
documents = loader.load()
documents = splitter.split_documents(documents)

# Add the metadata, ensuring the original index (S.No) is included
j = 0  # Initialize a separate counter for dataframe
for i, doc in enumerate(documents):
    if 'S.No' in combined_df.columns and j < len(combined_df):
        doc.metadata['S.No'] = combined_df.iloc[j]['S.No']
        j += 1  # Increment the counter for the dataframe

print(f"Total documents after splitting: {len(documents)}")

# Create FAISS vector store
print("Creating FAISS vector store...")
vector_store = FAISS.from_documents(documents, embeddings_model)
print("Vector store created successfully")

# Initialize Groq client with key validation
api_key = "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"
if not api_key or len(api_key) < 20:  # Basic validation for API key format
    raise ValueError("Invalid or missing API key. Please check your Groq API key.")

client = Groq(api_key=api_key)

# Function to use Groq API to refine the query
def refine_query_with_groq(query_text):
    print("Refining query...")
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an AI assistant that refines user queries about computer networking topics and IT problems to improve search accuracy. Provide a concise, refined version of the user's query.",
                },
                {
                    "role": "user",
                    "content": f"Refine the following query about computer networking or IT problems for better search accuracy: {query_text}",
                }
            ],
            model="llama3-8b-8192",
            max_tokens=100,
        )
        refined_query = chat_completion.choices[0].message.content.strip()
        print(f"Query refined: {refined_query}")
        return refined_query
    except Exception as e:
        print(f"Error during query refinement: {e}")
        return query_text  # Return the original query in case of error

# Function to use Groq API and combine retrieval and generation (RAG)
def generate_answer_with_rag(refined_query_text, retrieval_context):
    print("Generating answer...")
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an AI assistant specializing in computer networking and IT problem-solving. Generate detailed answers to user queries based on the retrieved context. Include information about the OSI layer, algorithms involved, and a detailed solution if applicable.",
                },
                {
                    "role": "user",
                    "content": f"Using the following retrieved context about computer networking topics and IT problems, provide a detailed answer to the query. Include the description, original index, problem, cause, solution, type, OSI layer, and algorithm if available. Then generate a detailed solution:\n\nContext: {retrieval_context}\n\nQuery: {refined_query_text}",
                }
            ],
            model="llama3-8b-8192",
            max_tokens=1000,
        )
        answer = chat_completion.choices[0].message.content.strip()
        print("Answer generated successfully")

        # Generate query-specific links
        networking_expertise_link = generate_query_link("https://www.networkingexpertise.com/search?q=", refined_query_text)
        pearson_link = generate_query_link("https://ptgmedia.pearsoncmg.com/images/9780789759818/samplepages/9780789759818_Sample.pdf#search=", refined_query_text)

        # Add modified source attribution with query-specific links
        sourced_answer = f"{answer}\n\nSources:\n- LLM Model: Llama 3 (8B parameters)\n- Websites:\n  1. [Networking Expertise]({networking_expertise_link})\n  2. [Pearson Sample]({pearson_link})"
        return sourced_answer
    except Exception as e:
        print(f"Error during answer generation: {e}")
        return "Sorry, I couldn't generate an answer."  # Return a default message in case of error

def generate_query_link(base_url, query):
    # Remove special characters and spaces from the query
    cleaned_query = re.sub(r'[^\w\s]', '', query).replace(' ', '+')
    return f"{base_url}{urllib.parse.quote(cleaned_query)}"

# Main execution
if __name__ == "__main__":
    query_text = input("Enter your query about computer networking or IT problems: ")

    print("Processing query...")
    refined_query_text = refine_query_with_groq(query_text)
    print(f"Refined query: {refined_query_text}")

    print("Searching for relevant information...")
    retrieved_documents = vector_store.similarity_search(refined_query_text, k=3)

    print("Preparing context for answer generation...")
    retrieval_context = ""
    for i, doc in enumerate(retrieved_documents):
        print(f"Document {i+1}:")
        print(f"  Topic: {doc.metadata.get('Topic', 'Unknown')}")
        print(f"  Description: {doc.page_content[:100]}...")  # Print first 100 chars
        print(f"  Original Index: {doc.metadata.get('S.No', 'Unknown')}")  # Ensure original index is printed
        retrieval_context += f"Topic: {doc.metadata.get('Topic', 'Unknown')}\n"
        retrieval_context += f"Description: {doc.page_content}\n"
        retrieval_context += f"Original Index: {doc.metadata.get('S.No', 'Unknown')}\n"
        for key, value in doc.metadata.items():
            if key != 'Topic':
                retrieval_context += f"{key}: {value}\n"
        retrieval_context += "\n"

    print("Generating final answer...")
    final_answer = generate_answer_with_rag(refined_query_text, retrieval_context)
    print(f"Final Answer:\n{final_answer}")


Current working directory: /content
Extracting data from: /content/Dataset of all topic.docx
Extracted 3 entries from the document
Data loaded successfully.
Number of rows: 3
Number of unique topics: 3
First few rows:
   S.No            Topic                                        Description
0  None  1.LOAD BALANCER  A load balancer is a device or software that d...
1  None        2.ROUTERS  A router is a networking device that forwards ...
2  None       3.FIREWALL  A firewall is a network security device or sof...
Loading embedding model...
Creating DataFrameLoader...
Splitting documents semantically...
Loading and splitting documents...
Total documents after splitting: 7
Creating FAISS vector store...
Vector store created successfully
Enter your query about computer networking or IT problems: how to hack my router 
Processing query...
Refining query...
Query refined: I cannot assist with illegal activities such as hacking. Instead, I suggest you consider the following refined query: