# Pip Installs

In [1]:
%%capture 
%pip install datasets -q
%pip install openai -q
%pip install tiktoken -q
%pip install langchain -q
%pip install scipy -q
%pip install ragatouille -q
%pip install --upgrade jupyter ipywidgets -q
%pip install aiohttp nest_asyncio -q
%pip install asyncio -q
%pip install -U sentence-transformers -q
%pip install dotenv -q
%pip install ast -q
%pip install matplotlib -q
%pip install plotly -q

# Import Packages

In [2]:
import ast  # for converting embeddings saved as strings back to arrays
import openai
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
import os
from scipy import spatial
import ipywidgets
from datasets import load_dataset
import re  # for cutting <ref> links out of Wikipedia articles
from tqdm.notebook import tqdm
import platform

# Chunking Text
import datasets
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument


from typing import Optional

# Asynchronous requests
import aiohttp
import asyncio
from tqdm.asyncio import tqdm as atqdm

# Embedded chunks
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import ast

from collections import defaultdict
from scipy import spatial

# **Experiments**

In [3]:
# Model names passed to the OpenAI client by the cells below.
GPT_MODEL = "gpt-4-turbo" # Reader model: default chat model that writes the final answer
EMBEDDING_MODEL = "text-embedding-ada-002" # Embedding model: used for the corpus chunks and for each query

# Files
# CSV file that receives the experiment results.
OUTPUT_FILE_NAME = "./../csv_files/expOutputs/experiment_chunking_4_ada_fixedrecursive_8_cosine.csv"

# K value
# Retrieval depths (top_n) to evaluate. The experiment iterates over
# range(k_Value_min, k_Value_max), so k_Value_max is exclusive: 8 and 9 run k=8 only.
k_Value_min = 8
k_Value_max = 9

# Chunking Text

In [4]:
# Load variables from a local .env file (if any) into the process environment,
# then build the OpenAI client from OPENAI_API_KEY.
load_dotenv()
client = openai.OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [5]:
# Fetch the dataset and stack all of its splits into one frame.
dataset = load_dataset('lucyd/fcc_html')
combined_df = pd.concat(
    [pd.DataFrame(dataset[split]) for split in dataset],
    ignore_index=True,
)

# 'document_title' is only a 1-based running number; 'content' is the raw text.
df = pd.DataFrame({
    'document_title': range(1, len(combined_df) + 1),
    'content': combined_df['content'],
})

# Keep only the rows whose content contains 'PART 15—RADIO'
# (case-insensitive; rows with missing content never match).
part_15_df = df.loc[df['content'].str.contains('PART 15—RADIO', case=False, na=False)]
df = part_15_df

In [6]:


# Separator list handed to the recursive splitter below.
# NOTE(review): several entries look like regular expressions, but the splitter is
# not constructed with is_separator_regex=True — presumably they are matched as
# literal text; confirm against the langchain documentation.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

# Chunking parameters - adjust chunk_size / chunk_overlap to run other experiments.
text_splitter = RecursiveCharacterTextSplitter(
    separators=MARKDOWN_SEPARATORS,
    chunk_size=1024,
    chunk_overlap=10,
    add_start_index=True,
    strip_whitespace=True,
)

# Wrap the content of each row in a langchain Document.
corpus = [
    LangchainDocument(page_content=content)
    for content in tqdm(df["content"], total=len(df))
]

# Split the documents one by one and collect every resulting chunk in order.
docs_processed = []
for doc in corpus:
    docs_processed.extend(text_splitter.split_documents([doc]))

  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
docs_processed[0]

Document(metadata={'start_index': 0}, page_content='PART 15—RADIO FREQUENCY DEVICES Authority: 47 U.S.C. 154 , 302a , 303 , 304 , 307 , 336 , 544a , and 549 . Source: 54 FR 17714 , Apr. 25, 1989, unless otherwise noted. Subpart A—General § 15.1 Scope of this part. ( a ) This part sets out the regulations under which an intentional, unintentional, or incidental radiator may be operated without an individual license. It also contains the technical specifications, administrative requirements and other conditions relating to the marketing of part 15 devices. ( b ) The operation of an intentional or unintentional radiator that is not in accordance with the regulations in this part must be licensed pursuant to the provisions of section 301 of the Communications Act of 1934, as amended, unless otherwise exempted from the licensing requirements elsewhere in this chapter. ( c ) Unless specifically exempted, the operation or marketing of an intentional or unintentional radiator that is not in co

In [8]:
# Plain text of every chunk; anything that is not a langchain Document is skipped.
texts = [
    doc.page_content
    for doc in docs_processed
    if isinstance(doc, LangchainDocument)
]


In [9]:
BATCH_SIZE = 1000  # you can submit up to 2048 embedding inputs per request

# Embed the chunk texts one batch at a time, keeping the input order.
embeddings = []
for batch_start in range(0, len(texts), BATCH_SIZE):
    batch = texts[batch_start:batch_start + BATCH_SIZE]
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
    # Double check that the embeddings come back in the same order as the input.
    assert all(position == item.index for position, item in enumerate(response.data))
    embeddings.extend(item.embedding for item in response.data)

# One row per chunk: its text and its embedding vector.
df = pd.DataFrame({"text": texts, "embedding": embeddings})

# **Experiment: Embedding Batch Size/ Embedding Model**

In [10]:

def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 10
) -> tuple[tuple[str, ...], tuple[float, ...]]:
    """Rank the rows of `df` by how related their embedding is to `query`.

    Args:
        query: Text to embed with EMBEDDING_MODEL and compare against the rows.
        df: Frame with a "text" column and an "embedding" column.
        relatedness_fn: Callable taking (query_embedding, row_embedding) and
            returning a score where larger means more related. Defaults to
            cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        Two parallel tuples (texts, scores), sorted from most to least related
        and truncated to `top_n`. Both are empty when `df` has no rows.
    """
    # Nothing to rank: return early instead of paying for an embedding request
    # and then failing to unpack an empty zip().
    if df.empty:
        return (), ()

    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    # sorted() is stable, so rows with equal scores keep their original order.
    strings_and_relatednesses = sorted(
        (
            (text, relatedness_fn(query_embedding, embedding))
            for text, embedding in zip(df["text"], df["embedding"])
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

In [11]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int,
    top_n: int = 10,
) -> tuple[str, list[str], list[str]]:
    """Build a prompt for `query` from the most related excerpts that fit the budget.

    Args:
        query: The user's question.
        df: Frame of chunk texts and embeddings, forwarded to
            strings_ranked_by_relatedness.
        model: Model name used to count tokens.
        token_budget: Maximum token count of the assembled message (introduction,
            excerpts and question together).
        top_n: Maximum number of excerpts to consider.

    Returns:
        A tuple (message, sources, embedding_strings):
        message is the introduction, the accepted excerpts and the question;
        sources holds the formatted excerpt blocks that were added to the message;
        embedding_strings holds the raw text of those same excerpts, in rank order.
    """
    strings, _relatednesses = strings_ranked_by_relatedness(query=query, df=df, top_n=top_n)
    introduction = 'Use the below passages on the FCC regulations to answer the subsequent question. Ensure your answer includes a yes/no/"I could not find an answer." response, the relevant section number(s), and a detailed explanation.'
    question = f"\n\nQuestion: {query}"
    message = introduction
    sources = []
    embedding_strings = []

    for string in strings:
        next_article = f'\n\nFCC excerpt:\n"""\n{string}\n"""'
        # Stop at the first excerpt that would push the prompt over the budget.
        if num_tokens(message + next_article + question, model=model) > token_budget:
            break
        message += next_article
        sources.append(next_article)
        embedding_strings.append(string)

    # Return only the excerpts that passed the budget check. Previously the list
    # was overwritten with every ranked string, which made token_budget a no-op
    # for callers that build their prompt from embedding_strings.
    return message + question, sources, embedding_strings

def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
    top_n: int = 10,
) -> tuple[str, list[str]]:
    """Answer `query` with the chat model, grounded in retrieved FCC excerpts.

    Args:
        query: The question to answer.
        df: Frame of chunk texts and embeddings. The default is captured when
            this definition runs, so rebinding the global `df` afterwards does
            not change it.
        model: Chat model name; also used for token counting.
        token_budget: Forwarded to query_message.
        print_message: When True, print the prompt sent to the model.
        top_n: Maximum number of excerpts to retrieve and include.

    Returns:
        A tuple (answer_text, embedding_strings), where embedding_strings is the
        excerpt collection returned by query_message.
    """
    # Only the excerpts are needed here; the message and sources that
    # query_message also assembles are not sent to the model.
    _message, _sources, embedding_strings = query_message(query, df,
                                                          model=model, token_budget=token_budget, top_n=top_n)

    # Use at most the top_n most relevant excerpts.
    top_sections = embedding_strings[:top_n]
    # NOTE(review): excerpts are concatenated with no separator between them;
    # kept as-is so the prompt text stays identical to earlier experiment runs.
    excerpts = "".join(top_sections)

    focused_prompt = f"""Based on the following FCC regulation excerpts, please answer the question. 
    Be sure to cite the specific section number(s) in your answer.

    Relevant FCC Regulations:
    {excerpts}

    Question: {query}

    Your answer must strictly follow this format:
    Answer: [Yes/No/I could not find an answer]
    Section: [Relevant FCC regulation section number(s) if the answer is Yes or No. "N/A" if the answer is I could not find an answer]
    Calculation: [Detailed calculation based on the provided regulations]

    If the provided regulations do not contain enough information to answer the question, state "I could not find an answer".
    """

    if print_message:
        print(focused_prompt)

    messages = [
        {"role": "system", "content": "You are an FCC regulations expert. Provide concise and accurate responses based solely on the given information."},
        {"role": "user", "content": focused_prompt},
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )

    response_message = response.choices[0].message.content
    return response_message, embedding_strings

# Inferencing

## Experiment: K and Prompt Number

In [12]:
def ask_with_top_n(row, top_n, client):
    """Ask the question stored in row['Question'], retrieving `top_n` excerpts.

    `client` is accepted for backward compatibility but not forwarded: `ask`
    has no client parameter, and passing it positionally landed in `model`.
    The embeddings frame and model come from `ask`'s own defaults.

    Returns whatever `ask` returns.
    """
    return ask(query=row['Question'], top_n=top_n)

def process_prompts(df):
    """Run every question in `df` through `ask` for each configured top_n value.

    Args:
        df: Frame with a 'Question' column.

    Returns:
        A DataFrame with one row per question. Columns are 'Question', one
        'top_{k}' column per k in range(k_Value_min, k_Value_max) holding the
        answer for that k, 'answer' (the answer for the last k, kept for
        existing consumers of the CSV), and 'embedding_strings_{i}' with the
        retrieved excerpts.
    """
    top_n_values = range(k_Value_min, k_Value_max)
    declared_columns = ['Question'] + [f'top_{i}' for i in top_n_values]

    # Collect plain dicts and build the frame once at the end, instead of
    # appending row by row through the private DataFrame._append.
    records = []

    questions = df['Question']
    # Iterate by position so a non-default index on `df` cannot break lookups.
    for prompt_number, current_question in enumerate(
        tqdm(questions, total=len(questions), desc="Processing prompts")
    ):
        print(f"\nProcessing prompt number: {prompt_number + 1}")
        print(f"  Current prompt: {current_question}")

        row_results = {'Question': current_question}

        for top_n in top_n_values:
            print(f"\n{'='*50}")
            print(f"Current top_n value: {top_n}")
            print(f"{'='*50}")

            answer, embedding_strings = ask(query=current_question, top_n=top_n)
            # Fill the per-k column so answers for different k no longer
            # overwrite each other; 'answer' still mirrors the latest one.
            row_results[f'top_{top_n}'] = answer
            row_results['answer'] = answer
            for i, embedded in enumerate(embedding_strings):
                row_results[f'embedding_strings_{i}'] = embedded

            print("\nAnswer:")
            print(f"{'-'*30}")
            print(f"{answer}")
            print(f"{'-'*30}")

        records.append(row_results)

    results_df = pd.DataFrame(records)
    # Declared columns first (present even when there are no rows), then the rest.
    extra_columns = [column for column in results_df.columns if column not in declared_columns]
    return results_df.reindex(columns=declared_columns + extra_columns)

In [13]:
 # How many prompts to test
prompt_num = None  # None = run every question in the CSV; set an int to run only the first N

# The questions get their own name so the chunk/embedding frame `df` is not
# overwritten; re-running earlier cells then still sees the embeddings.
questions_df = pd.read_csv('./../csv_files/rag_questions.csv', low_memory=False)
if prompt_num is not None:
    questions_df = questions_df.head(prompt_num)

# Create the output folder before the long run, so the results are not lost
# to a missing directory at the very end.
output_dir = os.path.dirname(OUTPUT_FILE_NAME)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

results_df = process_prompts(questions_df)

results_df.to_csv(OUTPUT_FILE_NAME, encoding='utf-8', index=False)

Processing prompts:   0%|          | 0/56 [00:00<?, ?it/s]


Processing prompt number: 1
  Current prompt: I am using a transmitter to be used for WiFi operation. This transmitter is going to transmit a peak power of 20 dBm on both 20 Mhz and 40 MHz bandwidth. This transmitter can hop on any frequency from 2412 MHz to 2484 MHz. I am going to use an antenna which has a peak gain of 0.5 dBi. The antenna operates on the 2450 MHz band with a 100 MHz bandwidth. Does this transmitter follow FCC regulations? 

Current top_n value: 8

Answer:
------------------------------
Answer: Yes
Section: (b)(1)
Calculation: The transmitter operates in the 2400-2483.5 MHz band, which is covered under the FCC regulation section (b)(1). The regulation specifies that for frequency hopping systems in this band, the maximum peak conducted output power shall not exceed 0.125 watts (or 21 dBm) if employing less than 75 hopping channels. Since the transmitter's peak power is 20 dBm, it is within the allowed limit of 21 dBm. Additionally, the antenna gain is 0.5 dBi, which

In [14]:
def _run_best_effort(command):
    """Run `command` (an argument list, no shell); return False if it could not be started."""
    import subprocess

    try:
        subprocess.run(command, check=False)
    except OSError:
        # Executable missing or not runnable on this machine.
        return False
    return True


def _applescript_quote(text):
    """Escape backslashes and double quotes so `text` can sit inside an AppleScript string."""
    return str(text).replace("\\", "\\\\").replace('"', '\\"')


def send_notification(title, message):
    """Show a desktop notification and play a sound, falling back to print().

    Args:
        title: Notification title.
        message: Notification body.

    The notifier is chosen from platform.system(). External tools are invoked
    with argument lists rather than shell strings, so quotes or shell
    metacharacters in `title` / `message` are passed through literally. If the
    notifier is unavailable, or the platform is not recognised, the
    notification is printed instead.
    """
    fallback_text = f"Notification: {title} - {message}"
    system = platform.system()
    if system == "Darwin":  # macOS
        script = 'display notification "{}" with title "{}"'.format(
            _applescript_quote(message), _applescript_quote(title)
        )
        if not _run_best_effort(["osascript", "-e", script]):
            print(fallback_text)
        _run_best_effort(["afplay", "/System/Library/Sounds/Glass.aiff"])  # Play notification sound
    elif system == "Linux":
        if not _run_best_effort(["notify-send", title, message]):
            print(fallback_text)
        _run_best_effort(["paplay", "/usr/share/sounds/freedesktop/stereo/complete.oga"])  # Play notification sound
    elif system == "Windows":
        try:
            from plyer import notification
        except ImportError:
            # plyer is optional; do not fail the notebook at the very end of a run.
            print(fallback_text)
        else:
            notification.notify(
                title=title,
                message=message,
                timeout=10  # Notification duration in seconds
            )
        import winsound
        winsound.MessageBeep(winsound.MB_ICONASTERISK)  # Play notification sound
    else:
        print(fallback_text)

# Let the user know the run above has completed.
finished_text = f"Your {GPT_MODEL} experiment has finished running."
send_notification("Experiment Finished", finished_text)


In [15]:
# # 1 (yes)  section 15.247
# ask('I am using a transmitter to be used for WiFi operation. This transmitter is going to transmit a peak power of 20 dBm on both 20 Mhz and 40 MHz bandwidth. This transmitter can hop on any frequency from 2412 MHz to 2484 MHz. I am going to use an antenna which has a peak gain of 0.5 dBi. The antenna operates on the 2450 MHz band with a 100 MHz bandwidth. Does this transmitter follow FCC regulations? ')

In [16]:
# 2 (yes)  section 15.247
# ask('I am using a transmitter to be used for WiFi operation. This transmitter is going to transmit a peak power of 30 dBm on both 20 Mhz and 40 MHz bandwidth. This transmitter can hop on any frequency from 2412 MHz to 2484 MHz. I am going to use an antenna which has a peak gain of 0.5 dBi. The antenna operates on the 2450 MHz band with a 100 MHz bandwidth. Does this transmitter follow FCC regulations? ')

In [17]:
# 3 (no)  section 15.247
# ask('I am using a transmitter to be used for WiFi operation. This transmitter is going to transmit a peak power of 40 dBm on both 20 Mhz and 40 MHz bandwidth. This transmitter can hop on any frequency from 2412 MHz to 2484 MHz. I am going to use an antenna which has a peak gain of 0.5 dBi. The antenna operates on the 2450 MHz band with a 100 MHz bandwidth. Does this transmitter follow FCC regulations? ')

In [18]:
# 4 (no)  section 15.247
# ask('I am using a transmitter to be used for WiFi operation. This transmitter is going to transmit a peak power of 50 dBm on both 20 Mhz and 40 MHz bandwidth. This transmitter can hop on any frequency from 2412 MHz to 2484 MHz. I am going to use an antenna which has a peak gain of 0.5 dBi. The antenna operates on the 2450 MHz band with a 100 MHz bandwidth. Does this transmitter follow FCC regulations? ')

In [19]:
# 5 (no) section 15.247
# ask('I am using a transmitter to be used for WiFi operation. This transmitter is going to transmit a peak power of 60 dBm on both 20 Mhz and 40 MHz bandwidth. This transmitter can hop on any frequency from 2412 MHz to 2484 MHz. I am going to use an antenna which has a peak gain of 0.5 dBi. The antenna operates on the 2450 MHz band with a 100 MHz bandwidth. Does this transmitter follow FCC regulations? ')

In [20]:
# UWB question (should contain section F 15.503)
# ask('I have a UWB transmitter that can operate at frequencies from 6 GHz to 9.3 GHz. At each frequency, it can operate with a bandwidth of 500 MHz to 3 GHz of configurable BW with -50 dBm/MHz. I am planning to use this transmitter for my UWB localization application and I want to operate on 7 GHz band with 2 GHz of bandwidth. I am planning to use an antenna with operations in 3100-5000 MHz band with corresponding gain 1.7 dBi and VSWR is < 1.8:1. Also, this antenna can operate on 5900-8500 MHz band with Gain given by 4.3 dBi and VSWR < 1.9:1. Does this transmission conform to FCC regulations?')