# GraphReasoning: Scientific Discovery through Knowledge Extraction and Multimodal Graph-based Representation and Reasoning

Markus J. Buehler, MIT, 2024 mbuehler@MIT.EDU

### Example: GraphReasoning: Loading graph and graph analysis

In [None]:
import os
# Make only GPU index 0 visible to CUDA; this is set before the GPU-using
# libraries in the following cells are imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# device='cuda'

In [None]:
# !pip install llama-index-embeddings-huggingface

In [None]:
from tqdm.notebook import tqdm
from IPython.display import display, Markdown
from huggingface_hub import hf_hub_download
from GraphReasoning import *

### Load graph and embeddings 

In [None]:
# data_dir holds the local model checkpoint (and the cached embeddings used in
# the next cell); data_dir_output holds the graph file and receives outputs.
data_dir = './GRAPHDATA_TSMC/'
data_dir_output = './GRAPHDATA_TSMC_OUTPUT/'

# Hugging Face repo
# repository_id = "lamm-mit/GraphReasoning"
# graph_name='BioGraph.graphml'

# make_dir_if_needed(data_dir)
# make_dir_if_needed(data_dir_output)

model = "Mistral-7B-Instruct-v0.3"

# Tokenizer and model must be loaded from the same checkpoint directory.
# The tokenizer was previously loaded from f'/{model}', i.e. the filesystem
# root, which did not match the model's location under data_dir.
embedding_model_path = f'{data_dir}/{model}'
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_path)
embedding_model = AutoModel.from_pretrained(embedding_model_path)

# filename = f"{data_dir}/{graph_name}"
# file_path = hf_hub_download(repo_id=repository_id, filename=filename,  local_dir='./')
# print(f"File downloaded at: {file_path}")

# Read the knowledge graph (GraphML) from the output directory.
graph_name = 'graph_30_augmented_graphML_integrated.graphml'
graph_path = f'{data_dir_output}{graph_name}'

G = nx.read_graphml(graph_path)

In [None]:
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

embedding_file = 'TSMC_embeddings.pkl'
# Single source of truth for the cache location (was repeated four times).
embedding_path = f'{data_dir}/{embedding_file}'

# Generate embeddings only when no cached file exists on disk.
generate_new_embeddings = not os.path.exists(embedding_path)

if generate_new_embeddings:
    node_embeddings = generate_node_embeddings(G, embedding_tokenizer, embedding_model)
    save_embeddings(node_embeddings, embedding_path)
else:
    # file_path = hf_hub_download(repo_id=repository_id, filename=filename, local_dir='./')
    # print(f"File downloaded at: {file_path}")
    node_embeddings = load_embeddings(embedding_path)


### Graph statistics and properties

In [None]:
# Visualize the node embeddings with 10 clusters and 10 samples (per the
# arguments); data_dir points at the output directory.
visualize_embeddings_2d_pretty_and_sample(
    node_embeddings,
    n_clusters=10,
    n_samples=10,
    data_dir=data_dir_output,
    alpha=0.7,
)

In [None]:
# Community description for G with N=6; data_dir points at the output directory.
describe_communities_with_plots_complex(
    G,
    N=6,
    data_dir=data_dir_output,
)

In [None]:
# Graph statistics for G, with centrality and the graph plot switched off.
graph_statistics_and_plots_for_large_graphs(
    G,
    data_dir=data_dir_output,
    include_centrality=False,
    make_graph_plot=False,
)

In [None]:
# Scale-free check on G; data_dir points at the output directory.
is_scale_free(
    G,
    data_dir=data_dir_output,
)

### Working with the graph

#### Find best fitting node

In [None]:
# Best-fitting nodes for the keyword "semiconductor" (last argument: 5).
find_best_fitting_node_list(
    "semiconductor",
    node_embeddings,
    embedding_tokenizer,
    embedding_model,
    5,
)

In [None]:
# Best-fitting nodes for the keyword "etching" (last argument: 5).
find_best_fitting_node_list(
    "etching",
    node_embeddings,
    embedding_tokenizer,
    embedding_model,
    5,
)

#### Find path in graph based on two keywords

In [None]:
# Find a path in G between the nodes matched to two keywords; the result is
# unpacked into the matched nodes/similarities, the path, its subgraph, the
# path length, and two further values (fname, graph_GraphML).
(
    (best_node_1, best_similarity_1, best_node_2, best_similarity_2),
    path,
    path_graph,
    shortest_path_length,
    fname,
    graph_GraphML,
) = find_path(
    G,
    node_embeddings,
    embedding_tokenizer,
    embedding_model,
    second_hop=False,
    data_dir=data_dir_output,
    keyword_1="semiconductor",
    keyword_2="graphene",
    similarity_fit_ID_node_1=0,
    similarity_fit_ID_node_2=0,
)



In [None]:
# Display the path returned by find_path in the previous cell.
path

In [None]:
# Convert the path to a list form and a string form, then display both.
path_list, path_string = print_path_with_edges_as_list(G, path)
(path_list, path_string)

In [None]:
# Draw the single path; 'knowledge_graph_paths.svg' is the file name argument.
visualize_paths_pretty(
    [path_list],
    'knowledge_graph_paths.svg',
    display_graph=True,
    data_dir=data_dir_output,
    scale=0.75,
)

In [None]:
# Collect the triplets of the path subgraph returned by find_path.
triplets = find_all_triplets(path_graph)

In [None]:
# Display the triplets computed in the previous cell.
triplets

### Load LLM: clean Mistral 7B

In [None]:
# filename = f"{data_dir}/{graph_name}"
# file_path = hf_hub_download(repo_id=repository_id, filename=filename,  local_dir='./')
# print(f"File downloaded at: {file_path}")

# graph_name=f'{data_dir}/{graph_name}'
# G = nx.read_graphml(graph_name)

# Hugging Face Hub repository and GGUF file name of the LLM to download.
repository_id='MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF'
filename='Mistral-7B-Instruct-v0.3.Q8_0.gguf'

# repository_id='bartowski/Mistral-7B-Instruct-v0.3-GGUF'
# filename='Mistral-7B-Instruct-v0.3-Q8_0.gguf'

# NOTE(review): local_dir is a hard-coded, user-specific absolute path;
# adjust it for the machine this notebook runs on.
# file_path is passed to Llama(model_path=...) in the next cell.
file_path = hf_hub_download(repo_id=repository_id, filename=filename,  local_dir='/home/mkychsu/pool/llm')
# file_path = f'{model}/'

In [None]:
from llama_cpp import Llama
import llama_cpp  # NOTE(review): not referenced in the visible cells

# Load the GGUF file downloaded in the previous cell (file_path).
# NOTE(review): n_ctx=10000 equals generate_Mistral's default max_tokens, so
# prompt plus a full-length completion presumably cannot both fit in the
# context window - confirm against the llama-cpp-python documentation.
llm = Llama(model_path=file_path,
             n_gpu_layers=-1,verbose= True, #False,#False,
             n_ctx=10000,
             main_gpu=0,
             # chat_format='mistral-instruct',
             )


In [None]:
def generate_Mistral(system_prompt='You are a semiconductor engineer. Try to find the clear relationship in the provided information',
                     prompt="How to make silicon into chip?", temperature=0.333,
                     max_tokens=10000,
                     ):
    """Run one chat completion on the module-level ``llm`` and return its text.

    Args:
        system_prompt: Content of the system message. Pass ``None`` to send
            only the user message.
        prompt: Content of the user message.
        temperature: Forwarded to ``llm.create_chat_completion``.
        max_tokens: Forwarded to ``llm.create_chat_completion``.

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    # Build the conversation; the system message is optional.
    messages = []
    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    result = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return result['choices'][0]['message']['content']
     

In [None]:
# Passed as verbatim= to find_path_and_reason in a later cell.
verbatim=True

In [None]:
# Ask the LLM directly (no graph path supplied) and render the answer as Markdown.
response = generate_Mistral(
    prompt='Develop a new research idea and very detail steps which improve the current manufacturing process of semiconductors.',
    system_prompt='You are given a set of information from a graph that describes the relationship, between materials and manufacturing process. You analyze these logically through reasoning.',
)
rendered_response = Markdown(response)
display(rendered_response)

In [None]:
# Find a path between the two keywords in G and have generate_Mistral answer
# the instruction; the response is rendered as Markdown below.
# The whitespace inside the multi-line `prepend` literal is part of the string
# passed to find_path_and_reason.
response, (best_node_1, best_similarity_1, best_node_2, best_similarity_2), path, path_graph, shortest_path_length, fname, graph_GraphML = find_path_and_reason(
    G, 
    node_embeddings,
    embedding_tokenizer, 
    embedding_model, 
    generate_Mistral, 
    data_dir=data_dir_output,
    verbatim=verbatim,
    include_keywords_as_nodes=True,  # Include keywords in the graph analysis
    keyword_1="semiconductor",
    keyword_2="manufacture",
    N_limit=9999,  # The limit for keywords, triplets, etc.
    instruction='Develop a new research idea and very detail steps which improve the current manufacturing process of semiconductors.',
    keywords_separator=', ',
    graph_analysis_type='nodes and relations',
    temperature=0.3, 
    inst_prepend='### ',  # Instruction prepend text
    prepend='''You are given a set of information from a graph that describes the relationship 
               between materials and manufacturing process. You analyze these logically 
               through reasoning.\n\n''',  # Prepend text for analysis
    visualize_paths_as_graph=True,  # Whether to visualize paths as a graph
    display_graph=True,  # Whether to display the graph
)
display(Markdown(response))

In [None]:
from pyvis.network import Network

# Static drawing of the full graph.
nx.draw(G)

# Interactive HTML rendering of G with pyvis.
# notebook=True: show() is used inside a notebook here; per the pyvis docs the
# network should be created in notebook mode for that - TODO confirm against
# the installed pyvis version.
nt = Network('500px', '500px', notebook=True)
# from_nx needs the NetworkX graph to convert; it was previously called with
# no argument, which raises a TypeError.
nt.from_nx(G)
# Use a real file name; the previous target was the bare extension '.html'.
nt.show('graph.html')

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever

# NOTE(review): `storage_context` is not assigned in any visible cell of this
# notebook; unless `from GraphReasoning import *` provides it, this cell raises
# NameError. TODO confirm and construct the storage context explicitly.
graph_rag_retriever = KnowledgeGraphRAGRetriever(
    storage_context=storage_context,
    verbose=True,
)

# Query engine built on top of the retriever above.
query_engine = RetrieverQueryEngine.from_args(
    graph_rag_retriever,
)

In [None]:
# NOTE(review): this query text is unrelated to the semiconductor keywords used
# in the rest of the notebook - presumably left over from a tutorial; replace
# it with a question about the loaded graph.
response = query_engine.query(
    "Tell me about Peter Quill?",
)
# Render the response in bold via Markdown.
display(Markdown(f"<b>{response}</b>"))