# GraphReasoning: Scientific Discovery through Knowledge Extraction and Multimodal Graph-based Representation and Reasoning

Markus J. Buehler, MIT, 2024 mbuehler@MIT.EDU

### Example: GraphReasoning – loading a graph and running graph analysis

In [None]:
import os

# Optional: restrict the run to one GPU (uncomment before CUDA is initialised).
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Device name for this notebook; the cells visible here do not read it again.
device = "cuda"

In [None]:
from tqdm.notebook import tqdm
from IPython.display import display, Markdown
from huggingface_hub import hf_hub_download
from GraphReasoning import *

### Load graph and embeddings 

In [None]:
# Hugging Face repository holding the graph (and, further down, its embeddings).
repository_id = "lamm-mit/GraphReasoning"

# Input folder for downloaded artefacts and output folder for analysis results.
data_dir = './GRAPHDATA'
data_dir_output = './GRAPHDATA_OUTPUT/'

graph_name = 'BioGraph.graphml'

make_dir_if_needed(data_dir)
make_dir_if_needed(data_dir_output)

# The same checkpoint name is used for both the tokenizer and the embedding model.
tokenizer_model = "BAAI/bge-large-en-v1.5"
embedding_tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
embedding_model = AutoModel.from_pretrained(tokenizer_model)

# Fetch the GraphML file; `filename` is the path requested from the repository.
filename = f"{data_dir}/{graph_name}"
file_path = hf_hub_download(
    repo_id=repository_id,
    filename=filename,
    local_dir='./',
)
print(f"File downloaded at: {file_path}")

# From here on `graph_name` holds the full relative path rather than the bare name.
graph_name = filename
G = nx.read_graphml(graph_name)

In [None]:
# Pickle file with the node embeddings, stored under `data_dir`.
embedding_file = 'BioGraph_embeddings_ge-large-en-v1.5.pkl'

# Flip to True to recompute the embeddings from G instead of downloading them.
generate_new_embeddings = False

if not generate_new_embeddings:
    # Default path: download the stored embeddings and load them.
    filename = f"{data_dir}/{embedding_file}"
    file_path = hf_hub_download(
        repo_id=repository_id,
        filename=filename,
        local_dir='./',
    )
    print(f"File downloaded at: {file_path}")

    node_embeddings = load_embeddings(filename)
else:
    # Recompute the embeddings for the nodes of G and save them for later runs.
    node_embeddings = generate_node_embeddings(G, embedding_tokenizer, embedding_model)
    save_embeddings(node_embeddings, f'{data_dir}/{embedding_file}')

### Load LLM: BioMixtral

In [None]:
import llama_cpp
from llama_cpp import Llama

# NOTE: this rebinds `repository_id` and `filename`, which earlier cells used
# for the graph repository; they now refer to the language-model weights.
repository_id = 'lamm-mit/BioinspiredMixtral'
filename = 'ggml-model-q5_K_M.gguf'
file_path = hf_hub_download(
    repo_id=repository_id,
    filename=filename,
    local_dir='./models/',
)

chat_format = "mistral-instruct"

# Load the GGUF model from the downloaded file.
llm = Llama(
    model_path=file_path,
    chat_format=chat_format,
    n_ctx=10000,       # context window size in tokens
    n_gpu_layers=-1,   # per the llama-cpp-python docs, -1 offloads all layers to the GPU
    main_gpu=0,
    verbose=True,
)

In [None]:
# Cell output: the local path returned by the most recent hf_hub_download call.
file_path

In [None]:
def generate_BioMixtral(system_prompt='You are a biomaterials scientist.',
                        prompt="What is spider silk?", temperature=0.333,
                        max_tokens=10000,
                        ):
    """Run one chat completion on the module-level ``llm`` and return its text.

    Args:
        system_prompt: Content of the system message. Pass ``None`` to send
            only the user message.
        prompt: Content of the user message.
        temperature: Sampling temperature forwarded to ``create_chat_completion``.
        max_tokens: Token limit forwarded to ``create_chat_completion``.

    Returns:
        The ``content`` field of the first choice's message in the result.
    """
    messages = [{"role": "user", "content": prompt}]

    # Identity check: only a literal None suppresses the system message.
    if system_prompt is not None:
        messages.insert(0, {"role": "system", "content": system_prompt})

    result = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return result['choices'][0]['message']['content']
     

In [None]:
# `time` is not imported explicitly in any earlier cell shown here, so import
# it locally instead of relying on it leaking in through a star import.
import time

q = '''What is graphene?'''

start_time = time.time()
res = generate_BioMixtral(
    system_prompt='You design materials.',
    prompt=q,
    max_tokens=1024,
    temperature=0.3,
)
# Stop the clock before producing any output so only generation is timed.
deltat = time.time() - start_time

# Raw text first, then the timing, then the same answer rendered as Markdown.
print (res)
print("--- %s seconds ---" % deltat)
display (Markdown(res))

In [None]:
# Query the graph for a connection between two keywords and have the LLM
# reason over it. All results are unpacked into notebook-level names.
(
    response,
    (best_node_1, best_similarity_1, best_node_2, best_similarity_2),
    path,
    path_graph,
    shortest_path_length,
    fname,
    graph_GraphML,
) = find_path_and_reason(
    G,
    node_embeddings,
    embedding_tokenizer,
    embedding_model,
    generate_BioMixtral,
    # The two keywords to connect and the task posed about them.
    keyword_1="collagen",
    keyword_2="copper",
    instruction='Develop a new research idea around collagen and copper.',
    # Prompt assembly: text placed before the instruction and before the analysis.
    inst_prepend='### ',
    prepend='''You are given a set of information from a graph that describes the relationship 
               between materials, structure, properties, and properties. You analyze these logically 
               through reasoning.\n\n''',
    keywords_separator=', ',
    # Graph analysis settings.
    include_keywords_as_nodes=True,  # keywords take part in the graph analysis
    graph_analysis_type='nodes and relations',
    N_limit=9999,  # cap on keywords, triplets, etc.
    temperature=0.3,
    # Output and display options.
    data_dir=data_dir_output,
    visualize_paths_as_graph=True,  # draw the paths as a graph
    display_graph=True,  # show the graph in the notebook
    verbatim=True,
)
display(Markdown(response))

In [None]:
# Cell output: the response text, the best_node/best_similarity values for both
# keywords, and the path, as unpacked from the find_path_and_reason call.
response, (best_node_1, best_similarity_1, best_node_2, best_similarity_2), path