## Chat with mDGF document 

In [1]:
%%capture
# update or install the necessary libraries
!pip install --upgrade langchain-openai
!pip install --upgrade langchain
!pip install --upgrade python-dotenv

In [2]:
import openai
import os
import IPython
from langchain.llms import OpenAI
from dotenv import load_dotenv
load_dotenv()

True

Load environment variables. You can use anything you like but I used `python-dotenv`. Just create a `.env` file with your `OPENAI_API_KEY` then load it as follows:

In [3]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

We are adapting code from [here](https://langchain.readthedocs.io/en/latest/modules/chat/getting_started.html).

In [4]:
import os
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferMemory, ConversationSummaryMemory
from langchain.callbacks import get_openai_callback
from langchain import PromptTemplate, LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage,
)

In [5]:
# read the document
with open("../data/mdgf_document.txt", "r") as file:
    mdgf_document = file.read()

In [29]:
from langchain.prompts.prompt import PromptTemplate
import pprint
import json

MDGF_PROMPT = f"""
You are an expert in scientific data governance and management and you will assist the users by answering questions and creating documents. Use only the content in the Modern Data Governance Framework (MDGF) reference text after the delimiter for your answers. If a questions falls outside the reference text, then respond, “This is out of scope for me to answer”

Your responsibilities are two::

First - Answering Questions:
You will be asked questions. Answer the question only using the reference text provided.
Cite the passage from the document used to answer the question, prefixing it with citation.
If you cannot find an answer in the reference text, then respond, “I could not find the answer”

Second - Creating Documents:

When asked by a user to create either a requirements document or a procedure plan based on the reference text. Assist the user by asking a series of questions to capture their project needs.

Step 1: Identify the entity in the user’s project. Respond with: “Sure, I will be happy to help. First tell me the core entity or asset in that you will be managing

Data 
Metadata
Digital content 
Code
Software”

Step 2: Identify governance activity in the user’s project. Respond with: “Tell me about the governance activity need in your project

Planning and Design
Monitoring
Generation/Curation
Sharing
Use/Reuse
Preservation”

Step 3: Identify the user's need for the Type of document. Respond with: “Are you seeking Requirements or Procedures for your project?

Requirements
Procedures”

Finally, Respond with:
"Here are the headings for the Requirements document:
A.1.1.1, A.1.2.1, ..." 
You should provide only the headings (A.1.1.1, A.1.2.1, ...) provided in the DGF documents. You should never provide any additional information. Do NOT use placeholder text or ... or anything similar in the response.


Here is the reference DGF document:
{mdgf_document} 
"""

llm = ChatOpenAI(
    temperature=0,
	openai_api_key=os.environ["OPENAI_API_KEY"],
	model_name="gpt-4-turbo-preview"
)

prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=MDGF_PROMPT,
        ),  # The persistent system prompt
        MessagesPlaceholder(
            variable_name="history"
        ),  # Where the memory will be stored.
        HumanMessagePromptTemplate.from_template(
            "{input}"
        ),  # Where the human input will injected
    ]
)

def ask(chain, query, track_token=True):
    with get_openai_callback() as cb:
        result = chain.invoke(input=query)
        if track_token:
            print(f'Total tokens: {cb.total_tokens}')
            print(f'Requests: {cb.successful_requests}')
    print(result['response'])
    return result['response']

conversation = ConversationChain(
    prompt=prompt,
    llm=llm,
    verbose=False,
    memory=ConversationBufferMemory(ai_prefix="AI Assistant", memory_key="history", return_messages=True),
)

In [30]:
_ = ask(conversation, "help me create a MDGF format document")

Total tokens: 13719
Requests: 1
Sure, I will be happy to help. First tell me the core entity or asset in that you will be managing

- Data 
- Metadata
- Digital content 
- Code
- Software


In [31]:
_ = ask(conversation, "Digital content")


Total tokens: 13763
Requests: 1
Tell me about the governance activity need in your project

- Planning and Design
- Monitoring
- Generation/Curation
- Sharing
- Use/Reuse
- Preservation


In [32]:
_ = ask(conversation, "Planning and Design")

Total tokens: 13789
Requests: 1
Are you seeking Requirements or Procedures for your project?

- Requirements
- Procedures


In [33]:
model_response = ask(conversation, "both")

Total tokens: 13866
Requests: 1
Here are the headings for the Requirements document:
A3.1.1, A3.1.2, A3.1.3

Here are the headings for the Procedures document:
B3.1.1, B3.1.2a, B3.1.2b, B3.1.3


In [18]:
import re

# Example text to search
text = _

# Regex pattern to match patterns like "B 1.2.2b"
pattern = r'[A-Z]\d+\.\d+\.\d+[a-z]?'

# Using finditer() to find matches and print them
for match in re.finditer(pattern, text):
    print('Match found:', match.group())

# Alternatively, using findall() to get a list of all matches
headers = re.findall(pattern, text)
print('All matches:', headers)

Match found: A1.1.1
Match found: A1.1.2
Match found: A1.1.3
Match found: A1.1.4
Match found: A1.1.5
Match found: A1.1.6
Match found: A1.1.7
Match found: A1.1.8
Match found: A1.1.9
Match found: A1.1.10
Match found: A1.1.11
Match found: A1.1.12
Match found: A1.1.13
Match found: A1.1.14
Match found: A1.1.15
Match found: A1.1.16
Match found: B1.1.1
Match found: B1.1.2
Match found: B1.1.3
Match found: B1.1.4
Match found: B1.1.5
Match found: B1.1.6
Match found: B1.1.7
Match found: B1.1.8
Match found: B1.1.9
Match found: B1.1.10
Match found: B1.1.11
Match found: B1.1.12
Match found: B1.1.13
Match found: B1.1.14
Match found: B1.1.15
Match found: B1.1.16
All matches: ['A1.1.1', 'A1.1.2', 'A1.1.3', 'A1.1.4', 'A1.1.5', 'A1.1.6', 'A1.1.7', 'A1.1.8', 'A1.1.9', 'A1.1.10', 'A1.1.11', 'A1.1.12', 'A1.1.13', 'A1.1.14', 'A1.1.15', 'A1.1.16', 'B1.1.1', 'B1.1.2', 'B1.1.3', 'B1.1.4', 'B1.1.5', 'B1.1.6', 'B1.1.7', 'B1.1.8', 'B1.1.9', 'B1.1.10', 'B1.1.11', 'B1.1.12', 'B1.1.13', 'B1.1.14', 'B1.1.15', 'B1.1.16'

In [23]:
import json

# Your list of strings (headers you're interested in)
# headers = ['A4.3.1', 'B4.3.2a', 'A1.1.3']

# Your JSON data
data = {
    "code": {
        "sharing": [
            ["A4.3.1 Ensure that the code is openly accessible", "B4.3.1 Ensure the code repository is set to ‘public’ in GitHub. [DE]"],
            ["A4.3.2 Ensure that the code has a persistent identifier and is discoverable with the data", "B4.3.2a The code repository should be assigned a registered persistent identifier. Use Zenodo for assigning a new DOI. In rare cases that a DOI has been assigned via other mechanisms, ensure the DOI is uploaded to the repo. [DS] B4.3.2b Ensure the code identifier is added to the data product metadata. [DS] B4.3.2c Ensure the DOI is added to the Github citation file [DS + DE]"],
            ["A4.3.3 Ensure the code is documented", "B4.3.3a Include a read me document that describes the purpose of the code and any system requirements [DE] B4.3.3b Include a brief ‘About’ description of the code that will be displayed towards the top of the repo page. The description should be no more than 325 characters or 50 words. [DS] Example ‘About’ text."]
        ],
        "generation_curation": [
            ["A4.2.1 Develop code in accordance with current best practices", "B4.2.1 Use the IMPACT coding best practices [DE]"],
            ["A4.2.2 Ensure code is citable", "B4.2.2a Create a clear, sufficiently descriptive name for your code repo [DE] B4.2.2b Create a citation file for all code with information identified in B4.1.6. [DS]"]
        ]
        # ... other sections
    }
    # ... other top-level keys
}

def subset_data(headers, data):
    # Initialize a dictionary to hold the subsetted data
    subsetted_data = {}
    for top_key, top_value in data.items():
        if isinstance(top_value, dict):
            subsetted_section = {}
            
            for second_key, entries in top_value.items():
                subsetted_entries = []
                
                for entry_list in entries:
                    entry_item = []
                    for entry in entry_list:
                        if any(header in entry[:10] for header in headers):
                            entry_item.append(entry)
                    if entry_item:
                        subsetted_entries.append(entry_item)
                
                if subsetted_entries:
                    subsetted_section[second_key] = subsetted_entries
            if subsetted_section:
                subsetted_data[top_key] = subsetted_section
    
    return subsetted_data

import json

data = json.load(open("../data/dgf.json"))

subset = subset_data(headers, data)

# Printing the subset to verify
print(json.dumps(subset, indent=4))

{
    "data": {
        "plan_design": [
            [
                "A1.1.1 Define a data flow diagram with the purpose of identifying data sources and touchpoints for the project and for communicating to data users how data was handled.",
                "B1.1.1 Create a data flow diagram extending from acquisition/creation to user delivery and add diagram to DMP. [DE] Example diagram:"
            ],
            [
                "A1.1.2 Develop touchpoint agreements identified in the data flow diagram",
                "B1.1.2 Create needed touchpoint agreements such as Interface Control Documents, (ICDs) / Submission Agreement (SA), Memorandum of Understanding (MOU),or Service Level Agreement (SLA). [DS + DE]"
            ],
            [
                "A1.1.3 Adhere to community accepted standard machine readable data file formats",
                "B1.1.3 Select standard machine-readable data file format(s) from NASA Approved Data Formats [DS] The EOSDIS Data Product Develop

In [None]:
MDGF_QA_PROMPT = f"""
You are an expert in scientific data governance and management and you will assist the users by answering questions. Use only the content in the Modern Data Governance Framework (MDGF) reference text provided for your answers. If a questions falls outside the reference text, then respond, “This is out of scope for me to answer”

Here is the reference DGF document:
{mdgf_document} 
"""

llm = ChatOpenAI(
    temperature=0,
	openai_api_key=os.environ["OPENAI_API_KEY"],
	model_name="gpt-4-turbo-preview"
)

prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=MDGF_QA_PROMPT,
        ),  # The persistent system prompt
        MessagesPlaceholder(
            variable_name="history"
        ),  # Where the memory will be stored.
        HumanMessagePromptTemplate.from_template(
            "{input}"
        ),  # Where the human input will injected
    ]
)

conversation = ConversationChain(
    prompt=prompt,
    llm=llm,
    verbose=False,
    memory=ConversationBufferMemory(ai_prefix="AI Assistant", memory_key="history", return_messages=True),
)

In [None]:
MDGF_DOC_PROMPT = f"""
You are responsible for providing a JSON with the following schema:
,
{
    "Entities/Assets": List[{
        
    }],
    
    "Type": List[str]
}

When asked by a user to create either a requirements document or a procedure plan based on the reference text. Assist the user by asking a series of questions to capture their project needs.

Step 1: Identify the entities in the user’s project. Respond with: “Sure, I will be happy to help. First tell me the core entities or assets in that you will be managing

Data 
Metadata
Digital content 
Code
Software”

Step 2: Identify governance activities in the user’s project. Respond with: “Tell me about the governance activities need in your project

Planning and Design
Monitoring
Generation/Curation
Sharing
Use/Reuse
Preservation”

Step 3: Identify the user's need for the Type of document. Respond with: “Are you seeking Requirements or Procedures for your project?

Requirements
Procedures”

Finally. Respond with: “I will provide only the headings provided in the DGF documents to respond to your request. I will not provide the content under each heading.
 I will provide A JSON formatted document with the keys: Entities/Assets, Governance Activities, Type. and values as the user's response to the questions asked in the previous steps.
here is the JSON formatted document:
{
  "Entities/Assets": "",
  "Governance Activities": " ",
  "Type": " "
}
...
" Correctly Fill in all the needed headings here. Do not use filler text or ... in the response.



Here is the reference DGF document:
{mdgf_document} 
"""

llm = ChatOpenAI(
    temperature=0,
	openai_api_key=os.environ["OPENAI_API_KEY"],
	model_name="gpt-4-turbo-preview"
)

prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=MDGF_QA_PROMPT,
        ),  # The persistent system prompt
        MessagesPlaceholder(
            variable_name="history"
        ),  # Where the memory will be stored.
        HumanMessagePromptTemplate.from_template(
            "{input}"
        ),  # Where the human input will injected
    ]
)

conversation = ConversationChain(
    prompt=prompt,
    llm=llm,
    verbose=False,
    memory=ConversationBufferMemory(ai_prefix="AI Assistant", memory_key="history", return_messages=True),
)

In [40]:
from pydantic import BaseModel
from typing import List, Dict, Optional

class DocType(BaseModel):
    requirements: str
    procedures: str

class GovernanceActivities(BaseModel):
    planning_and_design: Optional[List[DocType]] = None
    monitoring: Optional[List[DocType]] = None
    generation_curation: Optional[List[DocType]] = None
    sharing: Optional[List[DocType]] = None
    use_reuse: Optional[List[DocType]] = None
    preservation: Optional[List[DocType]] = None

class CoreEntity(BaseModel):
    data: Optional[List[GovernanceActivities]] = None
    metadata: Optional[List[GovernanceActivities]] = None
    digital_content: Optional[List[GovernanceActivities]] = None
    code: Optional[List[GovernanceActivities]] = None
    software: Optional[List[GovernanceActivities]] = None

from typing import Dict, Any

def json_to_pydantic(json_dict: Dict[str, Any]) -> CoreEntity:
    def create_doc_type_list(details: List[List[str]]) -> List[DocType]:
        # This function takes a list of lists where each inner list contains requirements and procedures.
        return [DocType(requirements=detail[0], procedures=detail[1]) for detail in details]

    def create_governance_activities(activities: Dict[str, List[List[str]]]) -> GovernanceActivities:
        # This function creates a GovernanceActivities instance from a dictionary mapping activity names to lists of [requirements, procedures].
        activities_dict = {activity: create_doc_type_list(details) for activity, details in activities.items()}
        return GovernanceActivities(**activities_dict)

    # The corrected part: Ensure we correctly parse the structure of each core entity.
    core_entities = {}
    for entity, activities_list in json_dict.items():
        if isinstance(activities_list, list):  # Ensure it's the expected list structure
            core_entities[entity] = [create_governance_activities(activities) for activities in activities_list]

    return CoreEntity(**core_entities)

json_data = {
    "code": [{
        "sharing": [
            [
                "A4.3.1 Ensure that the code is openly accessible",
                "B4.3.1 Ensure the code repository is set to 'public' in GitHub. [DE]"
            ],
            [
                "A4.3.2 Ensure that the code has a persistent identifier and is discoverable with the data",
                "B4.3.2a The code repository should be assigned a registered persistent identifier. Use Zenodo for assigning a new DOI. In rare cases that a DOI has been assigned via other mechanisms, ensure the DOI is uploaded to the repo. [DS] B4.3.2b Ensure the code identifier is added to the data product metadata. [DS] B4.3.2c Ensure the DOI is added to the Github citation file [DS + DE]"
            ],
            [
                "A4.3.3 Ensure the code is documented",
                "B4.3.3a Include a read me document that describes the purpose of the code and any system requirements [DE] B4.3.3b Include a brief ‘About’ description of the code that will be displayed towards the top of the repo page. The description should be no more than 325 characters or 50 words. [DS]"
            ]
        ]
    }]
}

pydantic_model_instance = json_to_pydantic(json_data)

In [41]:
import json

dgf_json = json.load(open("../data/dgf.json"))

pydantic_model_instance = json_to_pydantic(dgf_json)

In [42]:
pydantic_model_instance

CoreEntity(data=None, metadata=None, digital_content=None, code=None, software=None)

Let's try an example that involves a system instruction and a task provided by user.