# Pip Installs

In [17]:
%%capture 
%pip install datasets -q
%pip install openai -q
%pip install tiktoken -q
%pip install langchain -q
%pip install scipy -q
%pip install ragatouille -q
%pip install --upgrade jupyter ipywidgets -q
%pip install aiohttp nest_asyncio -q
%pip install asyncio -q
%pip install -U sentence-transformers -q
%pip install dotenv -q
%pip install ast -q
%pip install matplotlib -q
%pip install plotly -q

# Import Packages

In [18]:
import ast  # for converting embeddings saved as strings back to arrays
import openai
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
import os
from scipy import spatial
import ipywidgets
from datasets import load_dataset
import re  # for cutting <ref> links out of Wikipedia articles
from tqdm.notebook import tqdm
import platform

# Chunking Text
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument

from typing import Optional

# Asynchronous requests
import aiohttp
import asyncio
from tqdm.asyncio import tqdm as atqdm

# Embedded chunks
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import ast

# **Experiments**

In [19]:
# Experiment configuration: every value a reader is expected to tune lives here.

# Chat model that answers the questions; also the default model for token counting.
GPT_MODEL = "gpt-4-turbo" # Reader model
# Model used to embed each incoming question before ranking the stored chunks.
EMBEDDING_MODEL = "text-embedding-ada-002" # Embedding model

# Files
# Input CSV of pre-computed chunk embeddings; read with `section` and `embedding` columns.
EMBEDDING_VECTOR_FILE = "./../csv_files/embeddings/part15_embeddingvector_ada.csv"
# Destination CSV for the experiment results table.
OUTPUT_FILE_NAME = "./../csv_files/expOutputs/experiment_kValue_4_ada_intelligent_9_cosine.csv"

# K value
# Swept with range(k_Value_min, k_Value_max), so the upper bound is exclusive:
# with 9 and 10 only top_n = 9 is evaluated.
k_Value_min = 9
k_Value_max = 10

# Import Dataset

# Chunking Formatter

# Chunking Text

In [20]:
# Pull variables from a local .env file into the process environment, then
# build the OpenAI client that the embedding and chat calls below share.
load_dotenv()
client = openai.OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

# Prepare the text for embedding

In [21]:
# Load the stored chunk table, then rebuild `df` with just the two columns the
# retrieval code reads: the chunk text and its embedding.
df = pd.read_csv(EMBEDDING_VECTOR_FILE, sep=',')

# Chunk texts, in file order.
sections = df['section'].tolist()
# Embeddings are stored as strings in the CSV; parse each one back into a Python literal.
embeddings = [ast.literal_eval(serialized) for serialized in df['embedding'].tolist()]

df = pd.DataFrame({"text": sections, "embedding": embeddings})

# **Experiment: Embedding Batch Size/ Embedding Model**

In [22]:
from collections import defaultdict
from scipy import spatial


def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 10,
    max_sections: int = 15,
) -> tuple[list[str], list[float]]:
    """Return chunk texts and relatedness scores, grouped by header and ranked best-first.

    The query is embedded with ``EMBEDDING_MODEL`` through the module-level
    ``client``. Chunks are grouped by their header (the first line of their
    text), each header is scored by its best-matching chunk, and all chunks of
    the highest-scoring headers are returned in their original order.

    Args:
        query: Question to embed and compare against the stored chunks.
        df: Frame with a ``text`` column and an ``embedding`` column.
        relatedness_fn: Callable ``(query_embedding, chunk_embedding) -> score``
            where a larger score means more related.
        top_n: Maximum number of headers to draw chunks from. Values below
            zero are treated as zero.
        max_sections: Maximum number of chunks returned in total.

    Returns:
        ``(strings, relatednesses)`` of equal length. Each chunk carries the
        score of its header (the header's best chunk), not its own score.
    """
    # Nothing to rank: skip the embedding request entirely.
    if df.empty:
        return [], []

    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    # Group chunks by header; the header is taken to be the first line of the text.
    header_to_chunks = defaultdict(list)
    for text, embedding in zip(df["text"], df["embedding"]):
        header = text.split("\n")[0]
        header_to_chunks[header].append((text, embedding))

    # Score each header by the best relatedness among its chunks.
    header_relatednesses = [
        (header, max(relatedness_fn(query_embedding, embedding) for _, embedding in chunks))
        for header, chunks in header_to_chunks.items()
    ]
    header_relatednesses.sort(key=lambda pair: pair[1], reverse=True)

    # Clamp at zero so a negative top_n cannot turn into a slice from the end.
    header_limit = max(0, min(top_n, max_sections))

    strings = []
    relatednesses = []
    for header, relatedness in header_relatednesses[:header_limit]:
        for chunk, _ in header_to_chunks[header]:
            strings.append(chunk)
            relatednesses.append(relatedness)
        # A header is always added whole; stop once the cap is reached and
        # let the final slice trim any overshoot.
        if len(strings) >= max_sections:
            break

    return strings[:max_sections], relatednesses[:max_sections]

In [23]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding chosen for `model`."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int,
    top_n: int = 10,
) -> tuple[str, list[str], list[str]]:
    """Build a budget-limited prompt for `query` from the most related chunks.

    Args:
        query: The user's question.
        df: Chunk table forwarded to ``strings_ranked_by_relatedness``.
        model: Model name used when counting tokens.
        token_budget: Maximum token count allowed for the assembled message.
        top_n: Forwarded to ``strings_ranked_by_relatedness``.

    Returns:
        A 3-tuple ``(message, sources, embedding_strings)``:
        ``message`` is the introduction, the excerpts that fit within
        ``token_budget``, and the question; ``sources`` lists the formatted
        excerpts that were included; ``embedding_strings`` is every retrieved
        chunk, including those that did not fit the budget.
    """
    strings, _ = strings_ranked_by_relatedness(query=query, df=df, top_n=top_n)
    introduction = 'Use the below passages on the FCC regulations to answer the subsequent question. Ensure your answer includes a yes/no/"I could not find an answer." response, the relevant section number(s), and a detailed explanation.'
    question = f"\n\nQuestion: {query}"
    message = introduction
    sources = []

    for string in strings:
        next_article = f'\n\nFCC excerpt:\n"""\n{string}\n"""'
        # Stop at the first excerpt that would push the full message over budget.
        if num_tokens(message + next_article + question, model=model) > token_budget:
            break
        message += next_article
        sources.append(next_article)

    # NOTE(review): the third value is deliberately the full retrieved list,
    # not only the excerpts that fit the budget; this matches the existing
    # behavior that callers rely on. Confirm whether budget-trimming was intended.
    return message + question, sources, strings

def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
    top_n: int = 10,
) -> tuple[str, list[str]]:
    """Answer `query` with the chat model, grounded in retrieved FCC excerpts.

    Args:
        query: The question to answer.
        df: Chunk table with texts and embeddings. The default is bound when
            this function is defined, so later rebinding of the global `df`
            does not change it.
        model: Chat model name.
        token_budget: Forwarded to ``query_message``.
        print_message: When True, print the prompt sent to the model.
        top_n: Forwarded to retrieval and also used as the maximum number of
            retrieved chunks placed in the prompt.

    Returns:
        ``(response_message, embedding_strings)``: the model's reply text and
        the list of retrieved chunks returned by ``query_message``.
    """
    # Only the retrieved chunks are used here; the budget-limited message and
    # sources that query_message also builds are not part of the prompt below.
    _, _, embedding_strings = query_message(query, df,
                                            model=model, token_budget=token_budget, top_n=top_n)

    # Keep at most top_n of the retrieved chunks for the prompt.
    top_sections = embedding_strings[:top_n]
    # NOTE(review): chunks are concatenated with no separator between them.
    regulations_text = "".join(top_sections)

    focused_prompt = f"""Based on the following FCC regulation excerpts, please answer the question. 
    Be sure to cite the specific section number(s) in your answer.

    Relevant FCC Regulations:
    {regulations_text}

    Question: {query}

    Your answer must strictly follow this format:
    Answer: [Yes/No/I could not find an answer]
    Section: [Relevant FCC regulation section number(s) if the answer is Yes or No. "N/A" if the answer is I could not find an answer]
    Calculation: [Detailed calculation based on the provided regulations]

    If the provided regulations do not contain enough information to answer the question, state "I could not find an answer".
    """

    if print_message:
        print(focused_prompt)

    messages = [
        {"role": "system", "content": "You are an FCC regulations expert. Provide concise and accurate responses based solely on the given information."},
        {"role": "user", "content": focused_prompt},
    ]

    # temperature=0 is used for every request in this experiment.
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )

    response_message = response.choices[0].message.content
    return response_message, embedding_strings

# Inferencing

## Experiment: K and Prompt Number

In [24]:
def ask_with_top_n(row, top_n, client):
    """Answer the question stored in ``row['Question']`` using `top_n` for retrieval.

    Args:
        row: Mapping or Series with a ``'Question'`` entry.
        top_n: Retrieval depth forwarded to ``ask``.
        client: Accepted for backward compatibility only; ``ask`` talks to the
            module-level client and takes no client argument.

    Returns:
        Whatever ``ask`` returns for that question.
    """
    # Use keyword arguments: the third positional slot of `ask` is `model`, so
    # passing the client there would send the client object as the model name.
    # `df` is left to ask's own default because the global `df` is rebound to
    # the questions table later in the notebook.
    return ask(query=row['Question'], top_n=top_n)

def process_prompts(df):
    """Run every question in `df` through `ask` for each k in the configured range.

    Args:
        df: Frame with a ``'Question'`` column. Rows are processed in order,
            whatever the frame's index is.

    Returns:
        A DataFrame with one row per question. Columns are ``Question``, one
        ``top_{k}`` column per k holding that k's answer, ``answer`` (the
        answer for the last k evaluated), and ``embedding_strings_{i}`` for the
        chunks returned by the most recent `ask` call that produced an i-th chunk.
    """
    k_values = range(k_Value_min, k_Value_max)  # upper bound is exclusive
    base_columns = ['Question'] + [f'top_{k}' for k in k_values]
    questions = df['Question'].tolist()

    # Collect plain dicts and build the frame once at the end instead of
    # appending to a DataFrame row by row.
    records = []

    for prompt_number, current_question in enumerate(tqdm(questions, desc="Processing prompts")):
        print(f"\nProcessing prompt number: {prompt_number + 1}")
        print(f"  Current prompt: {current_question}")

        # Pre-seed the base columns so they keep a stable position in the output.
        row_results = dict.fromkeys(base_columns)
        row_results['Question'] = current_question

        for top_n in k_values:
            print(f"\n{'='*50}")
            print(f"Current top_n value: {top_n}")
            print(f"{'='*50}")

            answer, embedding_strings = ask(query=current_question, top_n=top_n)
            # Store the answer per k so a multi-k sweep does not overwrite
            # earlier results; 'answer' keeps its last-k meaning.
            row_results[f'top_{top_n}'] = answer
            row_results['answer'] = answer
            for i, embedded in enumerate(embedding_strings):
                row_results[f'embedding_strings_{i}'] = embedded

            print("\nAnswer:")
            print(f"{'-'*30}")
            print(f"{answer}")
            print(f"{'-'*30}")

        records.append(row_results)

    if not records:
        return pd.DataFrame(columns=base_columns)
    return pd.DataFrame(records)

In [25]:
 # How many prompts to test
# Load the evaluation questions under their own name so the embeddings table
# in `df` is not overwritten and this cell gives the same result on every run.
questions_df = pd.read_csv('./../csv_files/rag_questions.csv', low_memory=False)

# Default to every question in the file; lower this to test a subset.
prompt_num = len(questions_df)
questions_df = questions_df.head(prompt_num)

results_df = process_prompts(questions_df)

results_df.to_csv(OUTPUT_FILE_NAME, encoding='utf-8', index=False)

Processing prompts:   0%|          | 0/56 [00:00<?, ?it/s]


Processing prompt number: 1
  Current prompt: I am using a transmitter to be used for WiFi operation. This transmitter is going to transmit a peak power of 20 dBm on both 20 Mhz and 40 MHz bandwidth. This transmitter can hop on any frequency from 2412 MHz to 2484 MHz. I am going to use an antenna which has a peak gain of 0.5 dBi. The antenna operates on the 2450 MHz band with a 100 MHz bandwidth. Does this transmitter follow FCC regulations? 

Current top_n value: 9

Answer:
------------------------------
Answer: Yes
Section: § 15.247
Calculation: According to § 15.247, the maximum peak conducted output power for frequency hopping systems operating in the 2400–2483.5 MHz band is 1 watt (30 dBm) if employing at least 75 non-overlapping hopping channels. The transmitter in question operates with a peak power of 20 dBm, which is well below the 30 dBm limit. Additionally, the antenna gain of 0.5 dBi does not exceed the 6 dBi limit, and thus does not require any reduction in conducted outp

In [26]:
def send_notification(title, message):
    """Show a desktop notification and play a sound, falling back to print.

    Dispatches on ``platform.system()``: ``osascript``/``afplay`` on macOS,
    ``notify-send``/``paplay`` on Linux, ``plyer``/``winsound`` on Windows.
    On any other platform, or when the notifier cannot be launched, the
    notification is printed instead.

    Args:
        title: Notification title.
        message: Notification body.
    """
    import subprocess  # local import, like the Windows-only imports below

    def _run(command):
        """Launch `command` (an argument list, no shell); return False if it cannot start."""
        try:
            subprocess.run(command, check=False)
        except OSError:
            return False
        return True

    def _applescript_quote(text):
        """Escape backslashes and double quotes for an AppleScript string literal."""
        return str(text).replace("\\", "\\\\").replace('"', '\\"')

    system = platform.system()
    shown = False

    if system == "Darwin":  # macOS
        script = (
            f'display notification "{_applescript_quote(message)}" '
            f'with title "{_applescript_quote(title)}"'
        )
        shown = _run(["osascript", "-e", script])
        _run(["afplay", "/System/Library/Sounds/Glass.aiff"])  # Play notification sound
    elif system == "Linux":
        # Arguments are passed as a list, so quotes or shell metacharacters in
        # the title or message are never interpreted by a shell.
        shown = _run(["notify-send", str(title), str(message)])
        _run(["paplay", "/usr/share/sounds/freedesktop/stereo/complete.oga"])  # Play notification sound
    elif system == "Windows":
        try:
            from plyer import notification
        except ImportError:
            # plyer is an optional third-party package; fall through to print.
            pass
        else:
            notification.notify(
                title=title,
                message=message,
                timeout=10  # Notification duration in seconds
            )
            shown = True
        import winsound
        winsound.MessageBeep(winsound.MB_ICONASTERISK)  # Play notification sound

    if not shown:
        print(f"Notification: {title} - {message}")

# Announce that the run is over, naming the reader and embedding models used.
send_notification(
    "Experiment Finished",
    f"Your {GPT_MODEL} {EMBEDDING_MODEL} experiment has finished running.",
)
