# Analyze Draft Application to Identify Support for Features in IDF
This Jupyter notebook takes in an IDF document and a Draft Application, creates chunks of text from the IDF that include invention details and examples, and outputs a report indicating how well supported the text chunks are by the Draft Application.

## Instructions:
0. Set your variables and environment

1. Upload a plain-text IDF file and a plain-text detailed description file to the Jupyter notebook workspace.

2. Update the `IDF_filename` and `description_filename` variables to point to the IDF and detailed description files you uploaded.

## 0. Set up your variables and environment

In [26]:
import os
from getpass import getpass

# Plain-text input documents uploaded to the notebook workspace.
IDF_filename = 'IDF Text (0111-0266PRO).txt'
description_filename = 'Detailed Description Text (0111-0266PRO).txt'

# Bearer token sent in the Authorization header of every backoffice request.
# It is read from the BACKOFFICE_AUTH_TOKEN environment variable, falling back
# to an interactive prompt, so the credential is never saved in the notebook.
# NOTE(review): a token was previously committed in this cell; treat it as
# compromised and revoke/rotate it.
auth_token = os.environ.get('BACKOFFICE_AUTH_TOKEN') or getpass('Backoffice auth token: ')

In [2]:
# staging Auth token
# Token removed: never paste credentials into the notebook. To target staging,
# load the staging token from the environment instead, e.g.:
# import os; auth_token = os.environ['BACKOFFICE_STAGING_AUTH_TOKEN']

In [None]:
# Uncomment to install any missing libraries
!pip install --upgrade pip
!pip install requests numpy pandas matplotlib tqdm ipywidgets widgetsnbextension pandas-profiling tabulate

In [8]:
# Import necessary libraries
import pandas as pd
import re, requests, json, urllib3
#from tqdm import tqdm          # progress bar library
from IPython.display import display, Markdown, JSON
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [23]:
# Define helper functions
import requests
import re

def get_prompt_chain(prompt_id, base_url="https://backoffice.h2tools.hhllp.local", timeout=60):
    """Fetch a prompt-chain definition from the backoffice API.

    Args:
        prompt_id: Identifier of the prompt chain; appended to ``/api/prompts/``.
        base_url: Scheme and host of the backoffice instance. Pass
            ``"https://backoffice.staging.harritydev.com"`` to use staging.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (for example, an expired or missing auth token).
    """
    prompt_chain_url = f"{base_url}/api/prompts/{prompt_id}"
    headers = {
        'Authorization': f"Bearer {auth_token}",
        'Content-Type': 'application/json'
    }
    # NOTE(review): verify=False disables TLS certificate verification; confirm
    # this is still required for the target host.
    response = requests.get(prompt_chain_url, headers=headers, verify=False, timeout=timeout)
    # Fail here with the HTTP status instead of handing an error payload to callers.
    response.raise_for_status()
    return response.json()

def get_variables(chain):
    """Return the names of all variables declared by the links of a prompt chain.

    Links are read from ``chain["template"]["chain"]``; links with no (or an
    empty) ``"variables"`` entry contribute nothing. Names are returned in
    link order and are not de-duplicated.
    """
    links = chain.get("template").get("chain")
    return [
        declared["name"]
        for link in links
        if link.get("variables")
        for declared in link["variables"]
    ]

def set_variables(chain, variable_name, value):
    """Assign ``value`` to every variable called ``variable_name`` in the chain.

    The chain is modified in place: each matching entry under
    ``chain["template"]["chain"][i]["variables"]`` gets its ``"value"`` key set.
    Nothing happens (and nothing is reported) when no variable matches.
    """
    for link in chain.get("template").get("chain"):
        slots = link.get("variables")
        if not slots:
            # Link declares no variables; nothing to populate.
            continue
        for slot in slots:
            if slot["name"] == variable_name:
                slot["value"] = value

def run_prompt_chain(chain_object):
    """POST the chain's ``"template"`` to the NLP prompts endpoint.

    Returns the ``requests`` response object as-is; the status code is not
    checked and the body is not decoded here.
    """
    #endpoint = 'https://backoffice.staging.harritydev.com/api/v2/nlp/prompts'
    endpoint = 'https://backoffice.h2tools.hhllp.local/api/v2/nlp/prompts'
    request_headers = {}
    request_headers["authorization"] = f"Bearer {auth_token}"
    request_headers["content-type"] = "application/json"
    # Certificate verification is off and the call may take up to 600 seconds.
    response = requests.post(
        endpoint,
        headers=request_headers,
        json=chain_object["template"],
        verify=False,
        timeout=600,
    )
    return response

# Function to execute the prompt chain and parse output
def process_chunk(chain, df_chunk, description_text):
    """Run ``chain`` on one DataFrame chunk and return its disclosure details.

    The chunk is serialized with ``to_json(orient='records')`` and bound to the
    chain's "IDFchunks" variable; ``description_text`` is bound to "DraftApp".
    The response body is parsed as JSON and the value stored under
    "Disclosure Details" is returned.
    """
    chain_inputs = (
        ("IDFchunks", df_chunk.to_json(orient='records')),
        ("DraftApp", description_text),
    )
    for variable_name, variable_value in chain_inputs:
        set_variables(chain, variable_name, variable_value)

    response = run_prompt_chain(chain)

    # For debugging
    # print(response.text)

    return json.loads(response.text)["Disclosure Details"]

# Function to execute the prompt chain and parse output - text only output
def process_chunk_to(chain, df_chunk, description_text):
    """Run ``chain`` on one DataFrame chunk and return the raw response.

    Same variable binding as ``process_chunk`` ("IDFchunks" gets the chunk as
    JSON records, "DraftApp" gets ``description_text``), but the response is
    returned unparsed so the caller can read ``.text`` itself.
    """
    chain_inputs = (
        ("IDFchunks", df_chunk.to_json(orient='records')),
        ("DraftApp", description_text),
    )
    for variable_name, variable_value in chain_inputs:
        set_variables(chain, variable_name, variable_value)

    return run_prompt_chain(chain)

In [24]:
# IDs of the prompt chains this notebook fetches from the backoffice API.
PROMPT_CHAIN_IDS = {
    "IDF_chunks": '672d12c04669613f04831eb7',            # splits the IDF into chunks
    "support_identifying": '672d26984669611b2e831eb8',   # only used by test_prompt()
    "support_identifying_a": '672d80eb4669618119831eba', # run twice per chunk
    "support_identifying_b": '672d811c4669616bb5831ebb', # receives both "a" runs
    "JSON": '67322512466961bc79831ec5',                  # output is parsed for "Disclosure Details"
}

# Fetch each chain definition once, in the same order as listed above.
IDF_chunks_chain = get_prompt_chain(PROMPT_CHAIN_IDS["IDF_chunks"])
support_identifying_chain = get_prompt_chain(PROMPT_CHAIN_IDS["support_identifying"])
support_identifying_chain_a = get_prompt_chain(PROMPT_CHAIN_IDS["support_identifying_a"])
support_identifying_chain_b = get_prompt_chain(PROMPT_CHAIN_IDS["support_identifying_b"])
JSON_chain = get_prompt_chain(PROMPT_CHAIN_IDS["JSON"])

#Uncomment if you want to inspect the prompt chains
#display(JSON(IDF_chunks_chain)) 
#display(JSON(support_identifying_chain)) 

# Uncomment to show the variable names needed by the prompt chain
#display(Markdown(f"### Provide the \"{IDF_chunks_chain['name']}\" prompt with these variables: {get_variables(IDF_chunks_chain)}."))
#display(Markdown(f"### Provide the \"{support_identifying_chain['name']}\" prompt with these variables: {get_variables(support_identifying_chain)}."))

In [6]:
# Staging prompt chains
#IDF_chunks_chain = get_prompt_chain('672e6f66786ea24fdb9f68df')
#support_identifying_chain_a = get_prompt_chain('672e70d6786ea227109f68e1')
#support_identifying_chain_b = get_prompt_chain('672e7113786ea27bc49f68e2')

#Uncomment if you want to inspect the prompt chains
#display(JSON(IDF_chunks_chain)) 
#display(JSON(support_identifying_chain_a)) 

In [27]:
# Read the IDF text. The `with` block closes the file on exit, so no explicit
# close() is needed. The encoding is stated explicitly so the result does not
# depend on the platform default.
# NOTE(review): assumes the uploaded file is UTF-8 -- adjust if it was exported
# with a different encoding.
with open(IDF_filename, 'r', encoding='utf-8') as file:
    IDF_text = file.read()
# print(IDF_text)

# Run the first prompt chain to get the IDF chunks
set_variables(IDF_chunks_chain, "IDF", IDF_text) #Populate the chain's variable name[s] shown by the get_variables() function in the cell above
prompt1_output = run_prompt_chain(IDF_chunks_chain)
# Surface HTTP failures (e.g. an expired token) here rather than as a JSON
# parsing error in a later cell.
prompt1_output.raise_for_status()

In [28]:
# Test that we got output from the prompt chain: parse the response body as
# JSON and render it with IPython's JSON display object.
# print(prompt1_output.text)
display(JSON(json.loads(prompt1_output.text)))

<IPython.core.display.JSON object>

In [29]:
# Decode the chunking chain's response body.
data = json.loads(prompt1_output.text)

# Pull out the two sections of interest; an absent section becomes an empty list.
invention_details, examples = (
    data.get(section, []) for section in ("Invention Details", "Examples")
)

# One single-column DataFrame per section, with the section name as the column header.
df_invention_details, df_examples = (
    pd.DataFrame(rows, columns=[header])
    for header, rows in (("Invention Details", invention_details), ("Examples", examples))
)

In [10]:
# Display invention details data frame 
#df_invention_details

In [11]:
# Display examples data frame 
#df_examples

In [12]:
# This Begins an Example on 5 rows only

In [13]:
# This was originally a test cell; it is now wrapped in a function that is never called. Call test_prompt() (or move the body back out of the function) to run it.
def test_prompt():
    """Smoke-test the support-identifying chain on the first 5 invention-detail rows.

    Reads the detailed description file, sends the first five rows of
    ``df_invention_details`` through ``support_identifying_chain`` and displays
    the raw JSON result, the resulting DataFrame, and the same DataFrame sorted
    by ascending ``disclosureScore``. Returns nothing.
    """
    # The `with` block closes the file; encoding is explicit so the read does
    # not depend on the platform default.
    # NOTE(review): assumes the description file is UTF-8 -- adjust if needed.
    with open(description_filename, 'r', encoding='utf-8') as file:
        description_text = file.read()
    # print(description_text)

    # get first 5 rows of IDF chunks to test
    first_5_invention_details = df_invention_details.head(5)

    # Convert the first 5 rows to JSON
    json_invention_details = first_5_invention_details.to_json(orient='records')

    # Print the JSON strings
    print("JSON for the first 5 rows of Invention Details DataFrame:")
    print(json_invention_details)

    # Execute the prompt chain on the 5 rows
    set_variables(support_identifying_chain, "IDFchunks", json_invention_details)
    set_variables(support_identifying_chain, "DraftApp", description_text)
    prompt2_output = run_prompt_chain(support_identifying_chain)
    # print(prompt2_output.text)

    # Parse the response once and reuse it for both display and extraction
    data = json.loads(prompt2_output.text)
    display(JSON(data))

    # Extract the list of disclosure details
    disclosure_details = data["Disclosure Details"]

    # Create a DataFrame with the extracted data
    df = pd.DataFrame(disclosure_details, columns=["idfChunk", "disclosureText", "disclosureScore"])

    # Ensure disclosureScore is treated as integers
    df['disclosureScore'] = df['disclosureScore'].astype(int)

    # A bare expression inside a function is not rendered by the notebook, so
    # the frames are shown with display() explicitly.
    display(df)

    # Sort the DataFrame by the disclosureScore column in ascending order
    df_sorted = df.sort_values(by="disclosureScore", ascending=True)
    display(df_sorted)

def generate_html_df(df, html_file_path='dataframe_output.html'):
    """Export a DataFrame as a standalone, styled HTML page.

    Args:
        df: DataFrame to render; written with ``df.to_html(index=False)``.
        html_file_path: Destination file. Defaults to ``'dataframe_output.html'``
            in the current working directory.

    The page is written as UTF-8 and declares that charset, so non-ASCII
    characters in the table render correctly regardless of the platform's
    default encoding. Prints the output path when done; returns nothing.
    """
    # Literal CSS braces are doubled because the template goes through str.format().
    html_string = '''
    <!DOCTYPE html>
    <html>
    <head>
    <meta charset="utf-8">
    <title>DataFrame Output</title>
    <style>
    body {{
    font-family: Arial, sans-serif;
    margin: 20px;
    }}
    table {{
    width: 100%;
    border-collapse: collapse;
    margin-bottom: 20px;
    }}
    th, td {{
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
    }}
    th {{
    background-color: #f2f2f2;
    }}
    tr:nth-child(even) {{
    background-color: #f9f9f9;
    }}
    tr:hover {{
    background-color: #f1f1f1;
    }}
    </style>
    </head>
    <body>
    <h2>DataFrame Output</h2>
    {table}
    </body>
    </html>
    '''.format(table=df.to_html(index=False))

    # Save the HTML string to a file, matching the charset declared in <head>
    with open(html_file_path, 'w', encoding='utf-8') as f:
        f.write(html_string)

    print(f"DataFrame has been exported as HTML to {html_file_path}")

## 3. Iterate over the full data set

In [30]:
def collect_disclosure_details(df_source, description_text, chunk_size=5):
    """Score every row of ``df_source`` against the draft application, chunk by chunk.

    For each slice of ``chunk_size`` rows:
      1. ``support_identifying_chain_a`` is run twice on the same slice.
      2. ``support_identifying_chain_b`` receives both runs ("FirstDisclosureRun",
         "SecondDisclosureRun") together with the slice and the draft text.
      3. ``JSON_chain`` receives that output as "DisclosureCheck"; its response is
         parsed and the list under "Disclosure Details" is collected.

    Args:
        df_source: DataFrame of IDF chunks (one chunk of text per row).
        description_text: Full text of the draft application.
        chunk_size: Number of rows sent to the prompt chains per request.

    Returns:
        A flat list with the "Disclosure Details" entries of every slice, in order.

    Raises:
        requests.HTTPError: If any prompt-chain call returns a 4xx/5xx status.

    NOTE(review): the Examples cell further down repeats this same loop and
    could call this helper with ``df_examples`` instead.
    """
    collected = []
    for start in range(0, len(df_source), chunk_size):
        # Get the chunk of the DataFrame
        df_chunk = df_source.iloc[start:start + chunk_size]
        print(df_chunk)
        json_chunk = df_chunk.to_json(orient='records')

        # Process the chunk and get disclosure details twice
        disclosure_details_a = process_chunk_to(support_identifying_chain_a, df_chunk, description_text)
        disclosure_details_a.raise_for_status()
        display(JSON(json.loads(disclosure_details_a.text)))
        disclosure_details_b = process_chunk_to(support_identifying_chain_a, df_chunk, description_text)
        disclosure_details_b.raise_for_status()
        display(JSON(json.loads(disclosure_details_b.text)))

        # Combine results a and b with the other prompt chain
        set_variables(support_identifying_chain_b, "DraftApp", description_text)
        set_variables(support_identifying_chain_b, "FirstDisclosureRun", disclosure_details_a.text)
        set_variables(support_identifying_chain_b, "SecondDisclosureRun", disclosure_details_b.text)
        set_variables(support_identifying_chain_b, "IDFchunks", json_chunk)
        combined_output = run_prompt_chain(support_identifying_chain_b)
        combined_output.raise_for_status()

        # Run JSON chain on the combined result
        set_variables(JSON_chain, "IDFchunks", json_chunk)
        set_variables(JSON_chain, "DisclosureCheck", combined_output.text)
        json_output = run_prompt_chain(JSON_chain)
        json_output.raise_for_status()

        # Parse the JSON string into a Python dictionary
        data = json.loads(json_output.text)
        display(JSON(data))

        # Collect this slice's disclosure details
        collected.extend(data["Disclosure Details"])
    return collected


# Read Detailed Description text. Encoding is explicit so the read does not
# depend on the platform default.
# NOTE(review): assumes the description file is UTF-8 -- adjust if needed.
with open(description_filename, 'r', encoding='utf-8') as file:
    description_text = file.read()

# Define the size of each chunk
chunk_size = 5

# Score all invention details against the draft application
all_disclosure_details = collect_disclosure_details(df_invention_details, description_text, chunk_size)

# Create a DataFrame with the collected disclosure details
df_all_disclosures = pd.DataFrame(all_disclosure_details, columns=["idfChunk", "disclosureText", "disclosureScore"])

df_all_disclosures

                                   Invention Details
0  This invention describes a way that allows to ...
1  The idea of this invention is to use the slots...
2  This way, bits L0 … L15 are available to imple...
3  In the state-of-the-art approach, inversion co...
4  Write: encoding in host controller, transmit, ...


<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

                                   Invention Details
5  Read: encoding in DRAM, transmit, decoding in ...
6  In case system meta data bits (M0 … M15) are u...
7  Encoding as well as decoding is done on host c...
8  Write: encoding in host controller, transmit, ...
9  Read: read DBI bits from carved-out memory spa...


<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

                                    Invention Details
10  The trade-off of the approach proposed is that...
11  This can still be feasible because, for exampl...
12  In case a 2nd meta data function (e.g. system ...
13  In the JEDEC implementation, 16 data bits are ...
14  In case a 2nd meta data function is required t...


<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

                                    Invention Details
15  This can be used in a flexible way, using n bi...


<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

Unnamed: 0,idfChunk,disclosureText,disclosureScore
0,This invention describes a way that allows to ...,Some implementations described herein enable s...,5
1,The idea of this invention is to use the slots...,By using portions of the data packet provision...,4
2,"This way, bits L0 … L15 are available to imple...","Additionally, the host system may use portions...",4
3,"In the state-of-the-art approach, inversion co...",To communicate a data packet from the host sys...,3
4,"Write: encoding in host controller, transmit, ...",To communicate a data packet from the host sys...,4
5,"Read: encoding in DRAM, transmit, decoding in ...",The memory system 210 may provide the encoded ...,3
6,In case system meta data bits (M0 … M15) are u...,The memory system may store system metadata as...,4
7,Encoding as well as decoding is done on host c...,The host system 205 may encode a payload of a ...,4
8,"Write: encoding in host controller, transmit, ...",The host system 205 may encode a payload of a ...,4
9,Read: read DBI bits from carved-out memory spa...,The memory system 210 may store the inversion ...,4


In [31]:
# Score the Examples frame (reuses description_text and chunk_size from the cell above).
# Accumulates the "Disclosure Details" entries returned for every slice.
all_example_details = []

for offset in range(0, len(df_examples), chunk_size):
    # Current slice of examples, echoed so progress is visible
    df_chunk = df_examples.iloc[offset:offset + chunk_size]
    print(df_chunk)
    json_chunk = df_chunk.to_json(orient='records')

    # Two runs of chain "a" over the same slice, each shown as parsed JSON
    disclosure_details_a = process_chunk_to(support_identifying_chain_a, df_chunk, description_text)
    display(JSON(json.loads(disclosure_details_a.text)))
    disclosure_details_b = process_chunk_to(support_identifying_chain_a, df_chunk, description_text)
    display(JSON(json.loads(disclosure_details_b.text)))

    # Chain "b" receives the draft text, both runs, and the slice itself
    combine_inputs = {
        "DraftApp": description_text,
        "FirstDisclosureRun": disclosure_details_a.text,
        "SecondDisclosureRun": disclosure_details_b.text,
        "IDFchunks": json_chunk,
    }
    for variable_name, variable_value in combine_inputs.items():
        set_variables(support_identifying_chain_b, variable_name, variable_value)
    combined_output = run_prompt_chain(support_identifying_chain_b)

    # The JSON chain receives the slice and chain "b"'s text output
    for variable_name, variable_value in (("IDFchunks", json_chunk), ("DisclosureCheck", combined_output.text)):
        set_variables(JSON_chain, variable_name, variable_value)
    prompt_output = run_prompt_chain(JSON_chain)

    # Decode the final response and show it
    data = json.loads(prompt_output.text)
    display(JSON(data))

    # Keep this slice's disclosure details
    disclosure_details_c = data["Disclosure Details"]
    all_example_details.extend(disclosure_details_c)

# Tabulate everything that was collected
df_all_examples = pd.DataFrame(all_example_details, columns=["idfChunk", "disclosureText", "disclosureScore"])

df_all_examples

                                            Examples
0  Data Burst: 288-bits over 12 pulses (24 Beats ...
1          256-bits mission data, 32-bits meta data.
2  Sub-channel 0/1, Meta Function 1: M0 through M...
3  System meta data is 1st buffered in registers ...
4         This approach was chosen to save die area.


<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

                                            Examples
5  16-bits (2B) transfer between internal registe...
6  256-bits (32B) transfer between internal regis...
7  LPDDR6 DBI Data Packet Format: JEDEC DBI data ...


<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

Unnamed: 0,idfChunk,disclosureText,disclosureScore
0,Data Burst: 288-bits over 12 pulses (24 Beats ...,"""The data packet 300 may include one or more e...",3
1,"256-bits mission data, 32-bits meta data.","""The memory system may store a payload (e.g., ...",2
2,"Sub-channel 0/1, Meta Function 1: M0 through M...","""For example, the communication control inform...",4
3,System meta data is 1st buffered in registers ...,"""After receiving a data packet, the memory sys...",5
4,This approach was chosen to save die area.,NO DISCLOSURE,0
5,16-bits (2B) transfer between internal registe...,NO DISCLOSURE,0
6,256-bits (32B) transfer between internal regis...,Such memory arrays 235 may also be referred to...,2
7,LPDDR6 DBI Data Packet Format: JEDEC DBI data ...,An inversion configuration may include one or ...,4


In [None]:
#print(prompt_output.text)

# Prepare Output

In [32]:
# Make 'disclosureScore' numeric in both result frames; values that cannot be
# parsed become NaN (errors='coerce').
for scored_frame in (df_all_disclosures, df_all_examples):
    scored_frame['disclosureScore'] = pd.to_numeric(scored_frame['disclosureScore'], errors='coerce')

# Rows whose score is 3 or less, taken from each frame separately...
df_details_weak = df_all_disclosures.loc[df_all_disclosures['disclosureScore'].le(3)]
df_examples_weak = df_all_examples.loc[df_all_examples['disclosureScore'].le(3)]
# ...then stacked (invention details first, examples second) with a fresh index.
df_weak_disclosure = pd.concat([df_details_weak, df_examples_weak], ignore_index=True)

print(df_weak_disclosure)

                                            idfChunk  \
0  In the state-of-the-art approach, inversion co...   
1  Read: encoding in DRAM, transmit, decoding in ...   
2  The trade-off of the approach proposed is that...   
3  In the JEDEC implementation, 16 data bits are ...   
4  In case a 2nd meta data function is required t...   
5  Data Burst: 288-bits over 12 pulses (24 Beats ...   
6          256-bits mission data, 32-bits meta data.   
7         This approach was chosen to save die area.   
8  16-bits (2B) transfer between internal registe...   
9  256-bits (32B) transfer between internal regis...   

                                      disclosureText  disclosureScore  
0  To communicate a data packet from the host sys...                3  
1  The memory system 210 may provide the encoded ...                3  
2  "However, some communication protocols may not...                3  
3  "An inversion configuration may include one or...                2  
4  "In some cases, the 

In [33]:
# Convert the three DataFrames to HTML tables
table1_html = df_all_disclosures.to_html(index=False)
table2_html = df_all_examples.to_html(index=False)
table3_html = df_weak_disclosure.to_html(index=False)

# Define the HTML template with placeholders for the tables.
# Literal CSS braces are doubled because the template goes through str.format().
# The page declares UTF-8 so non-ASCII characters in the tables render
# correctly, and the "<" in the first heading is escaped as &lt;.
html_string = '''
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>DataFrame Output</title>
<style>
body {{
font-family: Arial, sans-serif;
margin: 20px;
}}
table {{
width: 100%;
border-collapse: collapse;
margin-bottom: 20px;
}}
th, td {{
border: 1px solid #ddd;
padding: 8px;
text-align: left;
}}
th {{
background-color: #f2f2f2;
}}
tr:nth-child(even) {{
background-color: #f9f9f9;
}}
tr:hover {{
background-color: #f1f1f1;
}}
</style>
</head>
<body>
<h2>Weak Support (score &lt;= 3)</h2>
{table3}
<h2>Invention Details</h2>
{table1}
<h2>Examples</h2>
{table2}
</body>
</html>
'''.format(table1=table1_html, table2=table2_html, table3=table3_html)

# Save the HTML string to a file, matching the charset declared in <head>
html_file_path = 'combined_output.html'
with open(html_file_path, 'w', encoding='utf-8') as f:
    f.write(html_string)

print(f"DataFrame has been exported as HTML to {html_file_path}")

DataFrame has been exported as HTML to combined_output.html


In [34]:
# debugging
#disclosure_details_a = process_chunk_to(support_identifying_chain_a, df_chunk, description_text)
#display(JSON(json.loads(disclosure_details_a.text)))
#print(disclosure_details_b.text)

In [None]:
# include background stuff in the report, categorize it somewhere else