In [2]:
# Import required libraries
import os
from dotenv import load_dotenv
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.messages import HumanMessage
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.tools import Tool
from langgraph.prebuilt import create_react_agent
from langchain.prompts import PromptTemplate
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import speech_recognition as sr
from elevenlabs.client import ElevenLabs
from elevenlabs import play

  warn(


In [3]:
# from zipfile import ZipFile
# file = ZipFile('transcripts.zip', 'r')
# file.extractall()
# file.close()

In [4]:
# Single in-memory chat history object for this notebook session.
# create_agent_with_memory returns it from the history factory it gives to
# RunnableWithMessageHistory, and run_cli reads its last few messages before
# each agent call. Nothing here persists it to disk.
message_history = InMemoryChatMessageHistory()

In [5]:
# Load API keys from .env and initialize OpenAI and ElevenLabs clients
# load_dotenv() must run first so the os.getenv lookups below can see the keys.
load_dotenv()
# os.getenv returns None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Chat model used by the QA tool, the quiz tool and the ReAct agent.
# No key is passed explicitly here — presumably ChatOpenAI picks it up from the
# environment; confirm against the installed langchain_openai version.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.5)
# Raw OpenAI client, passed to chat_complete by summarize_response.
client = OpenAI(api_key=OPENAI_API_KEY)
# ElevenLabs client; not referenced by any other cell visible in this notebook.
client_voice = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))

In [6]:
# Helper function to get chat completion from OpenAI API
def chat_complete(client, system_message: str, user_message: str, model: str = "gpt-3.5-turbo"):
    """Send one system + user message pair to the chat completions API.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``;
            this notebook passes an ``openai.OpenAI`` instance.
        system_message: Content of the ``system`` role message.
        user_message: Content of the ``user`` role message.
        model: Model name forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed,
        or an empty string when that content is missing (``None``) or empty.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ]
    )
    content = response.choices[0].message.content
    # The API may return a message without text content; calling .strip() on
    # None would raise AttributeError, so normalise that case to "".
    if not content:
        return ""
    return content.strip()

In [None]:
# Build a FAISS vector store from transcript text files
def build_vectorstore_from_transcripts(folder_path, persist_path, openai_api_key):
    """Embed every ``.txt`` transcript in a folder into a FAISS index and save it.

    Args:
        folder_path: Directory scanned (non-recursively) for files ending in ``.txt``.
        persist_path: Location passed to ``vectorstore.save_local``.
        openai_api_key: Key handed to ``OpenAIEmbeddings``.

    Returns:
        The FAISS vector store built from the split transcript chunks.

    Raises:
        ValueError: If the folder contains no ``.txt`` files, or the files
            produce no chunks to embed.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the chunk
    # order (and therefore the saved index) reproducible between runs.
    transcript_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".txt")
    )
    if not transcript_names:
        # Fail here with a clear message rather than deeper in the
        # embedding / index-building code with zero documents.
        raise ValueError(f"No .txt transcripts found in {folder_path!r}")

    docs = []
    for name in transcript_names:
        loader = TextLoader(os.path.join(folder_path, name), encoding="utf-8")
        docs.extend(loader.load())

    # Split documents into smaller chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    split_docs = splitter.split_documents(docs)
    if not split_docs:
        raise ValueError(f"Transcripts in {folder_path!r} produced no text chunks to embed")

    # Generate vector embeddings and store them
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    vectorstore = FAISS.from_documents(split_docs, embeddings)
    vectorstore.save_local(persist_path)
    return vectorstore

In [8]:
# Create a QA tool that answers based on vector similarity
def create_qa_tool(vectorstore):
    """Build the ``CyberSecurityQA`` tool, which answers from retrieved transcript context.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)`` that
            returns documents with a ``page_content`` attribute.

    Returns:
        A ``Tool`` named ``CyberSecurityQA`` wrapping the inner ``qa_func``.
    """
    out_of_scope = "❗ This question is outside the scope of the cybersecurity playlist."

    def qa_func(inputs):
        """Answer one question, or return the out-of-scope message.

        ``inputs`` may be the bare query string or a dict carrying the query
        under ``"__arg1"`` and, optionally, prior messages under ``"messages"``.
        """
        # NOTE(review): a single-input Tool appears to unwrap {"__arg1": ...}
        # and call func with the plain string, in which case indexing it like a
        # dict raises TypeError. Accept both shapes; confirm against the
        # installed langchain version.
        if isinstance(inputs, dict):
            query = inputs.get("__arg1", "")
            messages = inputs.get("messages", [])
        else:
            query = inputs
            messages = []

        query = str(query or "").strip()
        if not query:
            return out_of_scope

        chat_history_text = "\n".join(
            f"{msg.type.upper()}: {msg.content}" for msg in messages
        )

        # Find relevant documents
        docs = vectorstore.similarity_search(query, k=5)
        docs = [doc for doc in docs if doc.page_content.strip()]

        # Similarity check between query and context
        context_text = "\n".join(doc.page_content for doc in docs)
        if not docs or not context_text.strip():
            return out_of_scope

        try:
            vectorizer = TfidfVectorizer().fit([query, context_text])
        except ValueError:
            # Raised when neither text yields a usable token (empty vocabulary).
            return out_of_scope
        query_vec = vectorizer.transform([query])
        context_vec = vectorizer.transform([context_text])
        similarity_score = cosine_similarity(query_vec, context_vec)[0][0]

        if similarity_score < 0.5 or len(context_text.split()) < 20:
            return out_of_scope

        # Generate a prompt for the model
        prompt = PromptTemplate.from_template("""
        You are a cybersecurity tutor. ONLY answer based on the following:

        Context:
        {context}

        Chat history:
        {history}

        Question:
        {question}

        If not related to cybersecurity, say: "This question is outside the scope of the cybersecurity playlist."
        """)

        # The lambdas ignore the chain input and inject the values captured above.
        chain = (
            {"context": lambda _: context_text, "history": lambda _: chat_history_text, "question": lambda _: query}
            | prompt
            | llm
        )

        return chain.invoke({}).content

    return Tool(
        name="CyberSecurityQA",
        func=qa_func,
        description="Answer cybersecurity questions using vector context and chat history only."
    )

# Create a tool that generates multiple-choice cybersecurity quizzes
def create_quiz_tool(vectorstore):
    """Build the ``GenerateCyberQuiz`` tool, which writes multiple-choice questions.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)`` that
            returns documents with a ``page_content`` attribute.

    Returns:
        A ``Tool`` named ``GenerateCyberQuiz`` wrapping the inner ``quiz_func``.
    """
    import re  # local import: only needed to pull a number out of free text

    default_count = 3

    def quiz_func(query):
        """Generate a quiz; ``query`` is expected to carry the number of questions."""
        text = "" if query is None else str(query).strip()
        try:
            num = int(text)
        except ValueError:
            # Not a bare integer (e.g. "5 questions"): use the first number in
            # the text, otherwise fall back to the default. Catching only
            # ValueError keeps KeyboardInterrupt and real bugs visible.
            found = re.search(r"\d+", text)
            num = int(found.group()) if found else default_count
        if num < 1:
            # A zero or negative count makes no sense in the prompt.
            num = default_count

        # Get context for quiz generation
        docs = vectorstore.similarity_search("cybersecurity", k=5)
        context = "\n".join([doc.page_content for doc in docs])
        prompt = PromptTemplate.from_template("""
        Generate {num_questions} cybersecurity multiple-choice questions based on the context below.
        Each question should have 4 options (A-D), and do NOT include answers.

        Context:
        {context}
        """)
        # The lambdas ignore the chain input and inject the values captured above.
        chain = (
            {"context": lambda x: context, "num_questions": lambda x: num}
            | prompt
            | llm
        )
        return chain.invoke({}).content

    return Tool(
        name="GenerateCyberQuiz",
        func=quiz_func,
        description="Generate cybersecurity quizzes based on playlist content."
    )

# Combine tools into an agent with chat memory
def create_agent_with_memory(vectorstore):
    """Wrap a ReAct agent holding the QA and quiz tools in a message-history runnable.

    Args:
        vectorstore: Vector store handed to both tool factories.

    Returns:
        A ``RunnableWithMessageHistory`` around the agent whose history factory
        always returns the notebook-level ``message_history`` object.
    """
    qa_tool = create_qa_tool(vectorstore)
    quiz_tool = create_quiz_tool(vectorstore)
    react_agent = create_react_agent(model=llm, tools=[qa_tool, quiz_tool])

    def _shared_history():
        # Zero-argument factory: every call shares the one global history.
        return message_history

    return RunnableWithMessageHistory(
        react_agent,
        _shared_history,
        input_messages_key="messages",
        history_messages_key="history",
    )


In [9]:
# Print summary of the agent response
def summarize_response(response):
    """Print a console summary of one agent turn, unless the reply is judged off-topic.

    The last message in ``response['messages']`` is sent to ``chat_complete``
    with a yes/no filter prompt. On a "no" verdict only the out-of-scope notice
    is printed. Otherwise the latest user message is printed, followed by the
    tool called and the AI answer from the messages after that user message.

    Args:
        response: Mapping with a ``'messages'`` list whose items expose
            ``type`` and ``content`` (and optionally ``tool_calls``).

    Returns:
        None. All output goes to stdout.
    """
    print("\n🗞 Summary:")

    system_message = """
    You are a strict filter that checks if a given paragraph is about cybersecurity.
    Only respond with 'yes' if the paragraph clearly discusses cybersecurity topics such as network security, malware, encryption, firewalls, threats, or vulnerabilities.
    If it does not relate to cybersecurity, respond with 'no'.
    Respond with only 'yes' or 'no'. Do not explain.
    """
    messages = response['messages']
    check_context = chat_complete(client, system_message=system_message, user_message=messages[-1].content)

    # The model may answer "No." or "NO" despite the instruction; compare a
    # normalised verdict so such replies do not slip past the filter.
    verdict = check_context.strip().strip(".!'\"").strip().lower()
    if verdict == 'no':
        print("❌ This question is outside the scope of the cybersecurity playlist.")
        return

    # run_cli prepends earlier turns to the input, so only the messages after
    # the latest human message belong to the current turn.
    human_positions = [i for i, m in enumerate(messages) if m.type == "human"]
    if human_positions:
        last_human = human_positions[-1]
        print(f"🧠 User: {messages[last_human].content}")
        current_turn = messages[last_human + 1:]
    else:
        current_turn = messages

    # Print tool used (restricted to this turn so an older call is not reported)
    tool_calls = [
        m for m in current_turn
        if hasattr(m, "tool_calls") and m.tool_calls
    ]
    if tool_calls:
        print("🛠️ Tool Used:", tool_calls[-1].tool_calls[0]['name'])

    # Content is not guaranteed to be a plain string; skip anything that cannot
    # be stripped and printed as text.
    final_answer = [
        m for m in current_turn
        if m.type == "ai" and isinstance(m.content, str) and m.content.strip() != ""
    ]
    if final_answer:
        print("🤖 AI Response:", final_answer[-1].content.strip())

    print("\n" + "="*50 + "\n")


In [10]:
# Main function for CLI interaction
def run_cli():
    """Run the interactive question loop against the cybersecurity agent.

    Loads the FAISS index from ``"vector_db"``, builds the agent, then
    repeatedly asks for an input mode (``1`` text, ``2`` voice, ``exit`` /
    ``quit`` to stop), obtains a question, sends it to the agent together with
    the last five stored history messages, and prints a summary.

    Returns:
        None. Interaction happens through stdin/stdout and the microphone.
    """
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized data from disk (presumably pickle-based — confirm in the
    # LangChain docs). Only load a "vector_db" directory this notebook created.
    vectorstore = FAISS.load_local(
        "vector_db",
        OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
        allow_dangerous_deserialization=True
    )

    agent_with_history = create_agent_with_memory(vectorstore)

    print("\n🔐 Cybersecurity Agent Ready! Ask anything or request a quiz. Type 'exit' to quit.\n")

    while True:
        print("\n⁉️ Choose input type: [1] Text   [2] Voice   [exit] to quit")
        mode = input("Your choice: ").strip().lower()

        if mode in ["exit", "quit"]:
            break

        if mode == "1":
            user_input = input("🧠 Your question: ")
        elif mode == "2":
            recognizer = sr.Recognizer()
            with sr.Microphone() as source:
                print("🎙️ Speak your question...")
                recognizer.adjust_for_ambient_noise(source)
                audio = recognizer.listen(source)

            # Recognition is a network call on the captured audio, so it runs
            # after the microphone context has been closed.
            try:
                user_input = recognizer.recognize_google(audio, language='en-US')
                print("📝 You said:", user_input)
            except sr.UnknownValueError:
                print("😕 Could not understand the audio.")
                continue
            except sr.RequestError:
                print("❌ Could not connect to speech service.")
                continue
        else:
            print("⚠️ Invalid option. Please enter 1 or 2.")
            continue

        # Do not spend an agent call on a blank question.
        if not user_input or not user_input.strip():
            print("⚠️ Empty question. Please try again.")
            continue

        # Prepare messages and get response
        previous_messages = message_history.messages[-5:]
        new_message = HumanMessage(content=user_input)
        all_messages = previous_messages + [new_message]

        response = agent_with_history.invoke({"messages": all_messages})
        summarize_response(response)


In [11]:
# Build the vector DB if it doesn't exist, then run CLI
# The index is only built when no "vector_db" path exists; an existing one is
# reused as-is, so delete that path to force a rebuild from "transcripts".
if not os.path.exists("vector_db"):
    build_vectorstore_from_transcripts("transcripts", "vector_db", OPENAI_API_KEY)
# Starts the interactive loop; this cell blocks until the user types exit/quit.
run_cli()


🔐 Cybersecurity Agent Ready! Ask anything or request a quiz. Type 'exit' to quit.


⁉️ Choose input type: [1] Text   [2] Voice   [exit] to quit

🗞 Summary:
🧠 User: What is hashing?
🛠️ Tool Used: CyberSecurityQA
🤖 AI Response: Hashing is a process used in cybersecurity to convert data into a fixed-size string of characters, which is typically a sequence of numbers and letters. This transformation is done using a mathematical algorithm known as a hash function. The key characteristics of hashing include:

1. **Deterministic**: The same input will always produce the same hash output.
2. **Fixed Size**: Regardless of the size of the input data, the output hash will always be of a fixed size.
3. **Fast Computation**: Hash functions can quickly compute the hash value for any given input.
4. **Pre-image Resistance**: It should be computationally infeasible to reverse the process and retrieve the original input from its hash value.
5. **Collision Resistance**: It should be unlikely that two d

# 🙏 Thanks 💙