# GraphReasoning: Scientific Discovery through Knowledge Extraction and Multimodal Graph-based Representation and Reasoning

Markus J. Buehler, MIT, 2024 mbuehler@MIT.EDU

### Example: GraphReasoning: Loading graph and graph analysis

In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# device='cuda:0'

from tqdm.notebook import tqdm
from IPython.display import display, Markdown
from huggingface_hub import hf_hub_download
from GraphReasoning import *


In [2]:
verbatim=False

### Load dataset

In [3]:
import glob

doc_data_dir = '/home/mkychsu/pool/TSMC/dataset_textbook/'
doc_list=[f'{doc_data_dir}dry-etching-technology-for-semiconductors_compress.pdf',
          f'{doc_data_dir}plasma-etching-an-introduction_compress.pdf',
          f'{doc_data_dir}handbook-of-silicon-wafer-cleaning-technology-third-edition_compress.pdf',
          f'{doc_data_dir}Ultraclean Surface Processing of Silicon Wafers - PDF Free Download.pdf',
          f'{doc_data_dir}Atomic Layer Processing_semiconductor.pdf'   
]

doc_list_all=sorted(glob.glob(f'{doc_data_dir}*.pdf'))

from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

for i, doc in enumerate(doc_list_all):
    if doc in doc_list:
        continue
    try:
        temp_doc = doc_list_all[i+1]
        sim = similar(temp_doc.lower(), doc.lower())
        if sim < 0.9:
            doc_list.append(doc)
        else:
            if abs(os.stat(doc).st_size - os.stat(temp_doc).st_size)/os.stat(doc).st_size < 1e-3:
                print(f'{i}:{sim},\n {doc} \n {temp_doc}')
            else:
                doc_list.append(doc)
    except:
        pass


34:1.0,
 /home/mkychsu/pool/TSMC/dataset_textbook/(Advanced texts in physics) Claus Klingshirn - Semiconductor Optics-Springer (2005).pdf 
 /home/mkychsu/pool/TSMC/dataset_textbook/(Advanced texts in physics) Claus Klingshirn - Semiconductor optics-Springer (2005).pdf
92:0.9061224489795918,
 /home/mkychsu/pool/TSMC/dataset_textbook/(Analog Circuits and Signal Processing) Athanasios T. Ramkaj, Marcel J. M. Pelgrom, Michiel S. J. Steyaert, Filip Tavernier - Multi-Gigahertz Nyquist Analog-to-Digital Converters. Architecture and Cir.pdf 
 /home/mkychsu/pool/TSMC/dataset_textbook/(Analog Circuits and Signal Processing) Athanasios T. Ramkaj, Marcel J.M. Pelgrom, Michiel S. J. Steyaert, Filip Tavernier - Multi-Gigahertz Nyquist Analog-to-Digital Converters_ Architecture and Circ.pdf
109:0.9604863221884499,
 /home/mkychsu/pool/TSMC/dataset_textbook/(Artech House Microwave Library) Guillermo Gonzalez - Foundations of Oscillator Circuit Design-Artech House (2007).pdf 
 /home/mkychsu/pool/TSMC/da

In [4]:
# 

In [5]:
# import glob

# doc_data_dir = '/home/mkychsu/pool/TSMC/dataset_textbook_txt/'
# # doc_list = []
# doc_list=[f'{doc_data_dir}dry-etching-technology-for-semiconductors_compress/dry-etching-technology-for-semiconductors_compress.md',
#           f'{doc_data_dir}plasma-etching-an-introduction_compress/plasma-etching-an-introduction_compress.md',
#           f'{doc_data_dir}handbook-of-silicon-wafer-cleaning-technology-third-edition_compress/handbook-of-silicon-wafer-cleaning-technology-third-edition_compress.md',
#           f'{doc_data_dir}Ultraclean Surface Processing of Silicon Wafers - PDF Free Download/Ultraclean Surface Processing of Silicon Wafers - PDF Free Download.md',
#           f'{doc_data_dir}Atomic Layer Processing_semiconductor/Atomic Layer Processing_semiconductor.md'   
# ]

# doc_list_all=sorted(glob.glob(f'{doc_data_dir}*/*.md'))

# from difflib import SequenceMatcher

# def similar(a, b):
#     return SequenceMatcher(None, a, b).ratio()

# for i, doc in enumerate(doc_list_all):
#     if doc in doc_list:
#         continue
#     try:
#         temp_doc = doc_list_all[i+1]
#         sim = similar(temp_doc.lower(), doc.lower())
#         if sim < 0.9:
#             doc_list.append(doc)
#         else:
#             if abs(os.stat(doc).st_size - os.stat(temp_doc).st_size)/os.stat(doc).st_size < 1e-3:
#                 print(f'{i}:{sim},\n {doc} \n {temp_doc}')
#             else:
#                 doc_list.append(doc)
#     except:
#         pass
# print(len(doc_list),doc_list[0])

### Load the LLM and the tokenizer

In [6]:
#Hugging Face repo
# repository_id = "lamm-mit/GraphReasoning"
data_dir='./GRAPHDATA_TSMC'    
data_dir_output='./GRAPHDATA_TSMC_OUTPUT/'

# data_dir_output='./GRAPHDATA_OUTPUT/'
# graph_name='BioGraph.graphml'

# make_dir_if_needed(data_dir)
# make_dir_if_needed(data_dir_output)

tokenizer_model="BAAI/bge-large-en-v1.5"
# tokenizer_model="f'/home/mkychsu/pool/llm/Mistral-7B-Instruct-v0.3/tokenizer.json"

embedding_tokenizer = AutoTokenizer.from_pretrained(tokenizer_model, ) 
embedding_model = AutoModel.from_pretrained(tokenizer_model, )
# embedding_model.to('cuda:0')



In [7]:
# filename = f"{data_dir}/{graph_name}"
# file_path = hf_hub_download(repo_id=repository_id, filename=filename,  local_dir='./')
# print(f"File downloaded at: {file_path}")

# graph_name=f'{data_dir}/{graph_name}'
# G = nx.read_graphml(graph_name)


# repository_id='TheBloke/Mistral-7B-Instruct-v0.1-GGUF'
filename='mistral-7b-instruct-v0.1.Q8_0.gguf'

# repository_id='bartowski/Meta-Llama-3.1-8B-Instruct-GGUF'
# filename='Meta-Llama-3.1-8B-Instruct-Q4_K_L.gguf'

# file_path=hf_hub_download(repo_id=repository_id, filename=filename,  local_dir='/home/mkychsu/pool/llm')
file_path = f'/home/mkychsu/pool/llm/{filename}'

### Load LLM: clean Mistral 7B

In [8]:
from llama_cpp import Llama
# import llama_cpp

llm = Llama(model_path=file_path,
             n_gpu_layers=-1,verbose= True, #False,#False,
             n_ctx=8192,
             main_gpu=0,
             n_threads= 8 ,
             n_threads_batch=32,
             # chat_format='mistral-instruct',
             )


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/mkychsu/pool/llm/mistral-7b-instruct-v0.1.Q8_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:      

In [9]:
# llm.verbose = False

In [10]:
def generate_Mistral (system_prompt='You are a semiconductor engineer. Try to find the clear relationship in the provided information', 
                         prompt="How to make silicon into chip?",temperature=0.333,
                         max_tokens=8192, stream = True
                         ):     
    try:
        if system_prompt==None:
            messages=[
                {"role": "user", "content": prompt},
            ]

        else:
            messages=[
                {"role": "system",  "content": system_prompt},
                {"role": "user", "content": prompt},
            ]
        # result=llm.create_chat_completion_openai_v1(
        result=llm.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
        # return result.choices[0].message.content
        return result['choices'][0]['message']['content']
    except:
        return ''
        # return generate_Mistral( system_prompt=system_prompt, prompt=prompt[:len(prompt)//2+100], temperature=temperature, max_tokens=max_tokens) + \
        #       generate_Mistral( system_prompt=system_prompt, prompt=prompt[len(prompt)//2-100:], temperature=temperature, max_tokens=max_tokens)


In [12]:
import time
q="What are the recent methods to do dry etching?"
start_time = time.time()
res=generate_Mistral( system_prompt='You are an expert in semiconductor fields. Try to find the clear relation in the provided information. Skip the authorship information if it is not relevant', 
         prompt=q, max_tokens=1024, temperature=0.3,  )

deltat=time.time() - start_time
print("--- %s seconds ---" % deltat)
display (Markdown(res))

Llama.generate: 59 prefix-match hit, remaining 1 prompt tokens to eval

llama_print_timings:        load time =      90.39 ms
llama_print_timings:      sample time =       8.96 ms /   282 runs   (    0.03 ms per token, 31469.70 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (    -nan ms per token,     -nan tokens per second)
llama_print_timings:        eval time =    3181.46 ms /   282 runs   (   11.28 ms per token,    88.64 tokens per second)
llama_print_timings:       total time =    3318.27 ms /   282 tokens


--- 3.3243534564971924 seconds ---


 Dry etching is a process used in semiconductor manufacturing to remove layers of material from a substrate without the use of water or other liquids. There are several methods used for dry etching, including:

1. Reactive ion etching (RIE): This method uses a plasma to generate reactive ions that can etch the material. The plasma is generated by applying a high voltage to a gas, which ionizes the gas and creates a plasma.
2. Chemical vapor etching (CVE): This method uses a gas to etch the material. The gas is typically a mixture of hydrogen and fluorine, which react with the material to be etched.
3. Deep UV photolithography: This method uses light to etch the material. The light is typically generated by a deep UV laser, which can be focused on the material to be etched.
4. Electrical discharge etching (EDE): This method uses an electrical discharge to etch the material. The discharge is generated by applying a high voltage to a gas, which creates a plasma that can etch the material.

These are some of the recent methods used for dry etching in semiconductor manufacturing. The choice of method depends on the material being etched and the desired etch rate and depth.

In [13]:
# graph_HTML, graph_GraphML, G, net, output_pdf = make_graph_from_text(res, generate_Mistral,
#                                                                      chunk_size=1000,chunk_overlap=200,
#                                                                      do_distill=True, data_dir='temp', verbatim=True,
#                                                                      repeat_refine=0)

In [55]:
os.environ['TOKENIZERS_PARALLELISM']='true'

embedding_file='TSMC_KG_mistral_instruct_v0.3.pkl'
generate_new_embeddings=True

if os.path.exists(f'{data_dir}/{embedding_file}'):
    generate_new_embeddings=False

if generate_new_embeddings:
    node_embeddings = generate_node_embeddings(G, embedding_tokenizer, embedding_model, )
    save_embeddings(node_embeddings, f'{data_dir}/{embedding_file}')
    
else:
    filename = f"{data_dir}/{embedding_file}"
    # file_path = hf_hub_download(repo_id=repository_id, filename=filename, local_dir='./')
    # print(f"File downloaded at: {file_path}")
    node_embeddings = load_embeddings(f'{data_dir}/{embedding_file}')


In [56]:
doc_list[:5]

['/home/mkychsu/pool/TSMC/dataset_textbook/dry-etching-technology-for-semiconductors_compress.pdf',
 '/home/mkychsu/pool/TSMC/dataset_textbook/plasma-etching-an-introduction_compress.pdf',
 '/home/mkychsu/pool/TSMC/dataset_textbook/handbook-of-silicon-wafer-cleaning-technology-third-edition_compress.pdf',
 '/home/mkychsu/pool/TSMC/dataset_textbook/Ultraclean Surface Processing of Silicon Wafers - PDF Free Download.pdf',
 '/home/mkychsu/pool/TSMC/dataset_textbook/Atomic Layer Processing_semiconductor.pdf']

In [None]:
for i, doc in enumerate(doc_list):

    title = doc.split('/')[-1].split('.pdf')[0]
    doc = doc.split('/')
    doc[-2]+=f'_txt'
    doc[-1]=title+f'/{title}.md'
    doc='/'.join(doc)
    
    graph_root = f'{title}'
    
    _graph_GraphML= f'{data_dir_output}/{graph_root}_augmented_graphML_integrated.graphml'
    txt=''
    print(f'{doc}')
    if os.path.exists(_graph_GraphML):
        G = nx.read_graphml(_graph_GraphML)
        print(f'Main KG loaded: {_graph_GraphML}, {G}')
        continue
        
    
    if os.path.exists(f'{title}_err.txt'):
        print(f'No. {i}: {title} got something wrong.')
        continue

    elif os.path.exists(f'{data_dir}/{graph_root}_graph.graphml'):
        print(f'Found a graph fragment to merge: {graph_root}: {doc}.')
        graph_GraphML = f'{data_dir}/{graph_root}_graph.graphml'
        
        print(f'Merging graph No. {i}: {doc} to the main one')
        try:
            _, G, _, node_embeddings, res = add_new_subgraph_from_text('', generate_Mistral,
                               node_embeddings, embedding_tokenizer, embedding_model,
                               original_graph=G, data_dir_output=data_dir_output, graph_root=graph_root,
                               chunk_size=2000,chunk_overlap=200,
                               do_simplify_graph=True,size_threshold=10,
                               repeat_refine=0,similarity_threshold=0.95,
                               do_Louvain_on_new_graph=True, include_contextual_proximity=False,
                               #whether or not to simplify, uses similiraty_threshold defined above
                               return_only_giant_component=False,
                               save_common_graph=False,G_to_add=None,graph_GraphML_to_add=graph_GraphML,
                               verbatim=True,)

            save_embeddings(node_embeddings, f'{data_dir}/{embedding_file}')
        except:
            pass
        
    else:
        continue
        
#         print(f'Generating a knowledge graph from {doc}')
#         with open(doc, "r") as f:
#             txt = " ".join(f.read().splitlines())  # separate lines with a single space

#         try:
#             _, graph_GraphML, _, _, _ = make_graph_from_text(txt,generate_Mistral,
#                                   include_contextual_proximity=False,
#                                   graph_root=graph_root,
#                                   chunk_size=1000,chunk_overlap=100,
#                                   repeat_refine=0,verbatim=False,
#                                   data_dir=data_dir,
#                                   save_PDF=False,#TO DO
#                                  )
#         except Exception as e:
#             print(f'Something is wrong with No. {i}: {doc}.')
#             f = open(f'{title}_err.txt', 'w')
#             f.write(f'{e}\n{txt}')
#             f.close()          
#             pass

    



/home/mkychsu/pool/TSMC/dataset_textbook_txt/dry-etching-technology-for-semiconductors_compress/dry-etching-technology-for-semiconductors_compress.md
Found a graph fragment to merge: dry-etching-technology-for-semiconductors_compress: /home/mkychsu/pool/TSMC/dataset_textbook_txt/dry-etching-technology-for-semiconductors_compress/dry-etching-technology-for-semiconductors_compress.md.
Merging graph No. 0: /home/mkychsu/pool/TSMC/dataset_textbook_txt/dry-etching-technology-for-semiconductors_compress/dry-etching-technology-for-semiconductors_compress.md to the main one


...

Now create or load new graph...
Loading or using provided graph... Any txt data provided will be ignored...: None ./GRAPHDATA_TSMC/dry-etching-technology-for-semiconductors_compress_graph.graphml
--- 0.05784296989440918 seconds ---
Now grow the existing graph...
Now update node embeddings


100%|██████████| 17540/17540 [00:20<00:00, 840.40it/s] 


Now simplify graph.
Start...


100%|██████████| 17980/17980 [00:00<00:00, 2363543.61it/s]


Node to keep and merge: photolithography <-- lithography
Node to keep and merge: dry etching <-- dry etching performance
Node to keep and merge: dry etching <-- dry etching process
Node to keep and merge: dry etching <-- dry etching processes
Node to keep and merge: dry etching <-- dry etching technique
Node to keep and merge: semiconductor device fabrication <-- semiconductor fabrication
Node to keep and merge: semiconductor device fabrication <-- semiconductor fabrication processes
Node to keep and merge: semiconductor manufacturing <-- semiconductor manufacturing industry
Node to keep and merge: silicon dioxide <-- silicon dioxide (sio2)
Node to keep and merge: temperature <-- temperatures
Node to keep and merge: al multilevel interconnect process <-- process flow for al multilevel interconnects
Node to keep and merge: aluminum alloy stacked metal layer structures <-- al alloy stacked metal layer structures
Node to keep and merge: anisotropic etching <-- anisotropic dry etching
Node

100%|██████████| 144/144 [00:04<00:00, 30.04it/s]


Relcaulated embeddings... 
Now save graph... 
Graph simplified and saved to ./GRAPHDATA_TSMC_OUTPUT//simple_graph_graphML_simplified.graphml
Remove small fragments
using weakly connected components...


100%|██████████| 17141/17141 [00:00<00:00, 936688.18it/s]

Removing embedding for node no longer in graph: table
Removing embedding for node no longer in graph: enhanced
Removing embedding for node no longer in graph: design and optimization
Removing embedding for node no longer in graph: tsvs
Removing embedding for node no longer in graph: o2 concentration
Removing embedding for node no longer in graph: 25 nm
Removing embedding for node no longer in graph: gate length
Removing embedding for node no longer in graph: information presented
Removing embedding for node no longer in graph: knowledge
Removing embedding for node no longer in graph: periphery
Removing embedding for node no longer in graph: discharge
Removing embedding for node no longer in graph: biased
Removing embedding for node no longer in graph: realization
Removing embedding for node no longer in graph: scaling
Removing embedding for node no longer in graph: vg
Removing embedding for node no longer in graph: explanation
Removing embedding for node no longer in graph: optimize
Re




Number of Communities = 77
Communities:  [['chemical solution', 'aerosol opcs', 'bleed and feed method', 'calibration model', 'centrifugal spray machines', 'contact angle mean', 'core vacancy', 'data analysis', 'defect measurement systems', 'defect size', 'detected particles', 'larger sample volumes', 'lbpcs', 'measure', 'molecular light scattering', 'nir spectroscopy', 'pca', 'pls', 'sample volume', 'scattered light', 'standard deviation', 'validation set', 'wet chemical monitoring', 'reagents and reaction products', 'correlation value', 'cross-validation', 'generated', 'rmsep', 'used to calculate', 'batch- or single-wafer dynamic wetcleaning', 'cleaning, rinsing, and spin-drying or ipa vapor drying', 'fluid flow', 'mcgraw-hill (1998)', 'results interpretation', 'accurate scattering models', 'cov', 'defect coordinates', 'hardware components', 'mean measured defect size', 'mean true defect size', 'mean true defect size standard deviation', 'multiple detectors', 'multiple scattering cha

...

Now create or load new graph...
Loading or using provided graph... Any txt data provided will be ignored...: None ./GRAPHDATA_TSMC/plasma-etching-an-introduction_compress_graph.graphml
--- 0.24931979179382324 seconds ---
Now grow the existing graph...
Now update node embeddings


100%|██████████| 22783/22783 [03:03<00:00, 124.32it/s]   


Now simplify graph.
Start...


100%|██████████| 24351/24351 [00:00<00:00, 573112.34it/s]


Node to keep and merge: integrated circuits <-- integrated circuit
Node to keep and merge: integrated circuits <-- integrated circuits (ics)
Node to keep and merge: chemical vapor deposition (cvd) <-- high temperature chemical vapor deposition (cvd)
Node to keep and merge: dry etching <-- used in dry etching
Node to keep and merge: etching <-- etch
Node to keep and merge: high voltage <-- high voltages
Node to keep and merge: materials science <-- material science
Node to keep and merge: reactive ion etching <-- reactive ion etcher
Node to keep and merge: reactive ion etching <-- reactive ion etchers
Node to keep and merge: reactive ion etching <-- plasma etching and reactive ion etching
Node to keep and merge: semiconductor manufacturing <-- semiconductor manufacturing industry
Node to keep and merge: wet etching <-- wet chemical etching
Node to keep and merge: temperature <-- temperatures
Node to keep and merge: aluminum (al) deposition <-- aluminum deposition
Node to keep and merge:

100%|██████████| 564/564 [00:18<00:00, 31.12it/s]


Relcaulated embeddings... 
Now save graph... 
Graph simplified and saved to ./GRAPHDATA_TSMC_OUTPUT//simple_graph_graphML_simplified.graphml
Remove small fragments
using weakly connected components...


  0%|          | 0/20431 [00:00<?, ?it/s]

Generating embedding for new node: h2 plasmas


 40%|████      | 8240/20431 [00:00<00:00, 82394.77it/s]

Generating embedding for new node: plasma etching processes
Generating embedding for new node: fluorine atom etching
Generating embedding for new node: hazardous materials
Generating embedding for new node: ion-sheath density


 87%|████████▋ | 17861/20431 [00:00<00:00, 81602.86it/s]

Generating embedding for new node: ion-sheath width
Generating embedding for new node: plasma oscillations


100%|██████████| 20431/20431 [00:00<00:00, 57728.40it/s]


Generating embedding for new node: single-wafer etcher
Generating embedding for new node: plasma density control
Removing embedding for node no longer in graph: / . appl. phys. 36, 2363 (1965)
Removing embedding for node no longer in graph: / . appl. phys. 37, 180 (1966)
Removing embedding for node no longer in graph: / . electrochem. soc, sol. state sci. and technol. 130, 648 (1983)
Removing embedding for node no longer in graph: / . ieee trans, on plasma science ps-14, 137 (1986)
Removing embedding for node no longer in graph: / . j. appl. phys. 41, 2117 (1970)
Removing embedding for node no longer in graph: / . j. vac. sci. technol. al , 845 (1983)
Removing embedding for node no longer in graph: / . journal of physics e: scientific instruments 5, 595 (1972)
Removing embedding for node no longer in graph: / . nucl. mater. 121, 41 (1984)
Removing embedding for node no longer in graph: / . nucl. mater. il l & 112, 165 (1982)
Removing embedding for node no longer in graph: / . phys. d10

...

Now create or load new graph...
Loading or using provided graph... Any txt data provided will be ignored...: None ./GRAPHDATA_TSMC/handbook-of-silicon-wafer-cleaning-technology-third-edition_compress_graph.graphml
--- 0.3489723205566406 seconds ---
Now grow the existing graph...
Now update node embeddings


100%|██████████| 25338/25338 [02:44<00:00, 153.64it/s]   


Now simplify graph.
Start...


100%|██████████| 29400/29400 [00:00<00:00, 1108915.73it/s]


Node to keep and merge: photolithography <-- lithography
Node to keep and merge: silicon wafers <-- si wafers
Node to keep and merge: silicon wafers <-- silicon wafer
Node to keep and merge: photoresist <-- photoresists
Node to keep and merge: chemical solution <-- chemical solutions
Node to keep and merge: dry etching <-- dry etch
Node to keep and merge: fluorine <-- fluorine (f)
Node to keep and merge: materials science <-- material science
Node to keep and merge: semiconductor device fabrication <-- semiconductor fabrication
Node to keep and merge: semiconductor device fabrication <-- semiconductor fabrication processes
Node to keep and merge: semiconductor manufacturing <-- semiconductor device manufacturing
Node to keep and merge: semiconductor manufacturing <-- semiconductor manufacturing industry
Node to keep and merge: semiconductor manufacturing process <-- semiconductor manufacturing processes
Node to keep and merge: silicon nitride <-- silicon nitride (si3n4)
Node to keep an

100%|██████████| 1084/1084 [00:36<00:00, 29.84it/s]


Relcaulated embeddings... 
Now save graph... 
Graph simplified and saved to ./GRAPHDATA_TSMC_OUTPUT//simple_graph_graphML_simplified.graphml
Remove small fragments
using weakly connected components...


  0%|          | 0/20864 [00:00<?, ?it/s]

Generating embedding for new node: thin film processing


 89%|████████▊ | 18479/20864 [00:00<00:00, 88903.12it/s]

Generating embedding for new node: semiconductor wafer cleaning science and technology


100%|██████████| 20864/20864 [00:00<00:00, 82295.08it/s]


Removing embedding for node no longer in graph: 0.1 mm
Removing embedding for node no longer in graph: 1 h
Removing embedding for node no longer in graph: 1-108 cm2
Removing embedding for node no longer in graph: 1/2 feature size criterion
Removing embedding for node no longer in graph: 10
Removing embedding for node no longer in graph: 10 h
Removing embedding for node no longer in graph: 10 vol% h2e90 vol% n2
Removing embedding for node no longer in graph: 100-c
Removing embedding for node no longer in graph: 1070 psia
Removing embedding for node no longer in graph: 11
Removing embedding for node no longer in graph: 11.5 atm
Removing embedding for node no longer in graph: 12
Removing embedding for node no longer in graph: 13
Removing embedding for node no longer in graph: 13-year period from 1993 to 2006
Removing embedding for node no longer in graph: 15
Removing embedding for node no longer in graph: 15%-20%
Removing embedding for node no longer in graph: 17%
Removing embedding for n

...

Now create or load new graph...
Loading or using provided graph... Any txt data provided will be ignored...: None ./GRAPHDATA_TSMC/Ultraclean Surface Processing of Silicon Wafers - PDF Free Download_graph.graphml
--- 0.3402535915374756 seconds ---
Now grow the existing graph...
Now update node embeddings


100%|██████████| 23645/23645 [01:32<00:00, 255.91it/s]   


Now simplify graph.
Start...


100%|██████████| 26253/26253 [00:00<00:00, 479952.33it/s]

Node to keep and merge: diameter <-- diameters
Node to keep and merge: impurities <-- impurity
Node to keep and merge: photolithography <-- lithography
Node to keep and merge: integrated circuits <-- integrated circuits (ics)
Node to keep and merge: silicon wafers <-- silicon wafer
Node to keep and merge: silicon wafers <-- si wafers
Node to keep and merge: photoresist <-- photoresists
Node to keep and merge: chemical solution <-- chemical solutions
Node to keep and merge: dry etching <-- dry etching process
Node to keep and merge: dry etching <-- dry etching processes
Node to keep and merge: dry etching <-- dry etching is a process
Node to keep and merge: dry etching <-- in dry etching
Node to keep and merge: high voltage <-- high-voltage
Node to keep and merge: materials science <-- material science
Node to keep and merge: semiconductor device fabrication <-- semiconductor fabrication
Node to keep and merge: semiconductor device fabrication <-- semiconductor fabrication processes
Nod




New graph generated, nodes relabled. 


100%|██████████| 832/832 [00:27<00:00, 30.42it/s]


Relcaulated embeddings... 
Now save graph... 
Graph simplified and saved to ./GRAPHDATA_TSMC_OUTPUT//simple_graph_graphML_simplified.graphml
Remove small fragments
using weakly connected components...


100%|██████████| 21004/21004 [00:00<00:00, 1080469.50it/s]


Removing embedding for node no longer in graph: -60 mv
Removing embedding for node no longer in graph: 0.1 flm
Removing embedding for node no longer in graph: 0.2 flm
Removing embedding for node no longer in graph: 1 mm
Removing embedding for node no longer in graph: 1/1 + l/w
Removing embedding for node no longer in graph: 1m drams
Removing embedding for node no longer in graph: 2099.1 cm-1
Removing embedding for node no longer in graph: 500°c
Removing embedding for node no longer in graph: 600
Removing embedding for node no longer in graph: 700°c
Removing embedding for node no longer in graph: 8c-1 cleaning
Removing embedding for node no longer in graph: ::=si-h
Removing embedding for node no longer in graph: =si=h3
Removing embedding for node no longer in graph: a hot wall cvd reactor is required to suppress by-product formation.
Removing embedding for node no longer in graph: acid injection systems
Removing embedding for node no longer in graph: acrylic acid resin box
Removing embe

...

Now create or load new graph...
Loading or using provided graph... Any txt data provided will be ignored...: None ./GRAPHDATA_TSMC/Atomic Layer Processing_semiconductor_graph.graphml
--- 0.17118215560913086 seconds ---
Now grow the existing graph...
Now update node embeddings


100%|██████████| 22919/22919 [01:04<00:00, 355.18it/s]   


Now simplify graph.
Start...


100%|██████████| 24457/24457 [00:00<00:00, 1604393.28it/s]


Node to keep and merge: photolithography <-- lithography
Node to keep and merge: photoresist <-- photoresists
Node to keep and merge: chemical vapor deposition (cvd) <-- cvd (chemical vapor deposition)
Node to keep and merge: chemical vapor deposition (cvd) <-- chemical vapor deposition (cvd) process
Node to keep and merge: dry etching <-- dry etching technique
Node to keep and merge: dry etching <-- used in dry etching
Node to keep and merge: materials science <-- material science
Node to keep and merge: reactive ion etching <-- by reactive ion etching
Node to keep and merge: semiconductor manufacturing <-- semiconductor device manufacturing
Node to keep and merge: substrate material <-- substrate materials
Node to keep and merge: pattern <-- patterns
Node to keep and merge: substrate <-- substrates
Node to keep and merge: pressure <-- pressures
Node to keep and merge: temperature <-- temperatures
Node to keep and merge: bcl 3 <-- bcl3
Node to keep and merge: capacitively coupled plas

100%|██████████| 486/486 [00:16<00:00, 29.68it/s]


Relcaulated embeddings... 
Now save graph... 
Graph simplified and saved to ./GRAPHDATA_TSMC_OUTPUT//simple_graph_graphML_simplified.graphml
Remove small fragments
using weakly connected components...


100%|██████████| 21076/21076 [00:00<00:00, 2805520.68it/s]

Removing embedding for node no longer in graph: 1 2 ta2o5
Removing embedding for node no longer in graph: 1 2 ta2o5 + 5 4 sf4
Removing embedding for node no longer in graph: 193 nm wavelength lasers
Removing embedding for node no longer in graph: 1990s
Removing embedding for node no longer in graph: 19th century
Removing embedding for node no longer in graph: 2000s
Removing embedding for node no longer in graph: 209th odd-even effect
Removing embedding for node no longer in graph: 252nd odd-even effect
Removing embedding for node no longer in graph: 3171-3175
Removing embedding for node no longer in graph: 3d nand channel hole etching
Removing embedding for node no longer in graph: acetylacetonate surface coverage
Removing embedding for node no longer in graph: activation energy barriers
Removing embedding for node no longer in graph: addition of argon
Removing embedding for node no longer in graph: additional etching mechanisms
Removing embedding for node no longer in graph: advanced 




Number of Communities = 84
Colors:                                                      node    color  group
0                                      chemical solution  #7edb57      1
1                                               am clean  #7edb57      1
2                               back-end-of-line process  #7edb57      1
3                              batch immersion processes  #7edb57      1
4                                  bleed and feed method  #7edb57      1
...                                                  ...      ...    ...
21071                                monolayer per cycle  #d057db     84
21072                                  finite step times  #d057db     84
21073           lower and upper limits of the ale window  #d057db     84
21074  lowest temperature with appreciable desorption...  #d057db     84
21075    disappearance of an appreciable adsorption rate  #d057db     84

[21076 rows x 3 columns]
Done, assigned colors and groups...
Done Louvain...
Done updat

...

Now create or load new graph...
Loading or using provided graph... Any txt data provided will be ignored...: None ./GRAPHDATA_TSMC/(Analog Circuits and Signal Processing) Pui-In Mak, Rui Paulo Martins (auth.) - High-_Mixed-Voltage Analog and RF Circuit Techniques for Nanoscale CMOS-Springer-Verlag New York (2012)_graph.graphml
--- 0.03495311737060547 seconds ---
Now grow the existing graph...
Now update node embeddings


100%|██████████| 23055/23055 [01:06<00:00, 345.26it/s]   


Now simplify graph.
Start...


100%|██████████| 23501/23501 [00:00<00:00, 910504.79it/s]


Node to keep and merge: integrated circuits <-- integrated circuit
Node to keep and merge: high temperature <-- high temperatures
Node to keep and merge: reducing <-- reduces
Node to keep and merge: power supplies <-- power supply
Node to keep and merge: time-dependent dielectric breakdown <-- time-dependent dielectric breakdown (tddb)
Node to keep and merge: time-dependent dielectric breakdown <-- time dependent dielectric breakdown
Node to keep and merge: technique <-- techniques
Node to keep and merge: properties of materials <-- materials properties
Node to keep and merge: damage to devices <-- device damage
Node to keep and merge: other components <-- additional components
Node to keep and merge: need be considered <-- must be considered
Node to keep and merge: radio frequency <-- radio frequency (rf)
Node to keep and merge: thick oxide <-- thick-oxide
Node to keep and merge: channel length modulation <-- channel-length modulation
Node to keep and merge: oxide film breakdown volta

100%|██████████| 142/142 [00:04<00:00, 29.31it/s]


Relcaulated embeddings... 
Now save graph... 
Graph simplified and saved to ./GRAPHDATA_TSMC_OUTPUT//simple_graph_graphML_simplified.graphml
Remove small fragments
using weakly connected components...


  0%|          | 0/22070 [00:00<?, ?it/s]

Generating embedding for new node: analog and rf circuits
Generating embedding for new node: nanoscale cmos


100%|██████████| 22070/22070 [00:00<00:00, 263045.35it/s]


Removing embedding for node no longer in graph: (1.1)
Removing embedding for node no longer in graph: (1.2)
Removing embedding for node no longer in graph: (1.3)
Removing embedding for node no longer in graph: (1.4)
Removing embedding for node no longer in graph: (1.5)
Removing embedding for node no longer in graph: +32/3.4-dbm out-of-channel iip2/iip3
Removing embedding for node no longer in graph: 0.12mm cmos dvb-t tuner
Removing embedding for node no longer in graph: 0.46 mm2
Removing embedding for node no longer in graph: 0.7-db passband roll-off
Removing embedding for node no longer in graph: 1-db icp
Removing embedding for node no longer in graph: 1-db input-referred compression point (icp)
Removing embedding for node no longer in graph: 1.27-to1.92-ghz pll + vco
Removing embedding for node no longer in graph: 1.8-o
Removing embedding for node no longer in graph: 170-to-240 mhz
Removing embedding for node no longer in graph: 18 db
Removing embedding for node no longer in graph: 1

...

Now create or load new graph...
Loading or using provided graph... Any txt data provided will be ignored...: None ./GRAPHDATA_TSMC/(Devices circuits & systems series) Chrzanowska-Jeske, Malgorzata_ Weide-Zaage, Kirsten - Semiconductor devices in harsh conditions-Taylor & Francis, CRC Press (2017)_graph.graphml
--- 0.40149474143981934 seconds ---
Now grow the existing graph...
Now update node embeddings


100%|██████████| 26310/26310 [02:20<00:00, 186.80it/s]   


Now simplify graph.
Start...


100%|██████████| 27474/27474 [00:00<00:00, 1880762.33it/s]


Node to keep and merge: photolithography <-- lithography
Node to keep and merge: photolithography <-- photolithography techniques
Node to keep and merge: silicon wafers <-- silicon wafer
Node to keep and merge: high voltage <-- high-voltage
Node to keep and merge: materials science <-- material science
Node to keep and merge: substrate material <-- substrate materials
Node to keep and merge: wet etching <-- novel wet etching
Node to keep and merge: temperature <-- temperatures
Node to keep and merge: modern electronic devices <-- modern electronics
Node to keep and merge: si substrates <-- silicon substrates
Node to keep and merge: high temperature <-- high-temperature
Node to keep and merge: high temperature <-- high temperatures
Node to keep and merge: increase <-- increases
Node to keep and merge: mechanism <-- mechanisms
Node to keep and merge: miniaturization <-- miniaturisation
Node to keep and merge: low pressure <-- low pressures
Node to keep and merge: occurs <-- occur
Node to

100%|██████████| 382/382 [00:12<00:00, 29.78it/s]


Relcaulated embeddings... 
Now save graph... 
Graph simplified and saved to ./GRAPHDATA_TSMC_OUTPUT//simple_graph_graphML_simplified.graphml
Remove small fragments
using weakly connected components...


  0%|          | 0/24414 [00:00<?, ?it/s]

Generating embedding for new node: advanced iii-n materials


 91%|█████████▏| 22305/24414 [00:00<00:00, 196321.26it/s]

Generating embedding for new node: design diversity
Generating embedding for new node: self-healing calibration
Generating embedding for new node: digital self-healing calibration scheme
Generating embedding for new node: harsh environment packaging
Generating embedding for new node: heavy-ion impact
Generating embedding for new node: changing rate of interfacial sliding


100%|██████████| 24414/24414 [00:00<00:00, 73169.48it/s] 

Generating embedding for new node: single event effects (see)
Removing embedding for node no longer in graph: 1.5 hours
Removing embedding for node no longer in graph: 100s of kilo-rad with large fins
Removing embedding for node no longer in graph: 1995 design automation conference scholarship award
Removing embedding for node no longer in graph: 2008 donald o. pederson best paper award
Removing embedding for node no longer in graph: 210-ghz inaln/gan hemts
Removing embedding for node no longer in graph: 232th and 238u
Removing embedding for node no longer in graph: 3d circuit geometry information
Removing embedding for node no longer in graph: 43% less
Removing embedding for node no longer in graph: 5% nacl spray
Removing embedding for node no longer in graph: 500w push-pull algan/gan hemt amplifier
Removing embedding for node no longer in graph: 512 mib ddr-ii dram memory
Removing embedding for node no longer in graph: 90% metal powder
Removing embedding for node no longer in graph: 




Number of Communities = 94
Colors:                                                      node    color  group
0                                      chemical solution  #c0db57      1
1                                          hafnium oxide  #c0db57      1
2                                                hfsio 2  #c0db57      1
3                                                 hfsion  #c0db57      1
4                                            high-κ film  #c0db57      1
...                                                  ...      ...    ...
24409                          portland state university  #57db97     94
24410                                          professor  #57db97     94
24411  research has been supported by the national sc...  #57db97     94
24412                                 research interests  #57db97     94
24413  vlsi & emerging technology design automation l...  #57db97     94

[24414 rows x 3 columns]
Done, assigned colors and groups...
Done Louvain...
Done updat

...

Now create or load new graph...
Loading or using provided graph... Any txt data provided will be ignored...: None ./GRAPHDATA_TSMC/(Devices circuits and systems)  - Electrostatic discharge protection of semiconductor devices and integrated circuits _ short-term static and dynamic loading conditions-Crc Press (2015)_graph.graphml
--- 0.09291744232177734 seconds ---
Now grow the existing graph...
Now update node embeddings


100%|██████████| 29087/29087 [02:34<00:00, 187.94it/s]   


Now simplify graph.
Start...


100%|██████████| 30509/30509 [00:00<00:00, 722487.07it/s]

Node to keep and merge: photolithography <-- lithography
Node to keep and merge: integrated circuits <-- integrated circuits (ic)
Node to keep and merge: integrated circuits <-- integrated circuits (ics)
Node to keep and merge: integrated circuits <-- integrated circuit
Node to keep and merge: high voltage <-- high voltage (hv)
Node to keep and merge: high voltage <-- high voltages
Node to keep and merge: bias voltage <-- voltage bias
Node to keep and merge: equivalent oxide thickness (eot) <-- effective oxide thickness (eot)
Node to keep and merge: three-dimensional integrated circuits (3d ic) <-- three-dimensional ic designs
Node to keep and merge: high temperature <-- high temperatures
Node to keep and merge: mechanism <-- mechanisms
Node to keep and merge: complex phenomena <-- complex phenomenon
Node to keep and merge: complex structures <-- complex structure
Node to keep and merge: mos devices <-- mos device
Node to keep and merge: occurs <-- occur
Node to keep and merge: alterna




New graph generated, nodes relabled. 


100%|██████████| 430/430 [00:14<00:00, 29.30it/s]


Relcaulated embeddings... 
Now save graph... 
Graph simplified and saved to ./GRAPHDATA_TSMC_OUTPUT//simple_graph_graphML_simplified.graphml
Remove small fragments
using weakly connected components...


  0%|          | 0/27000 [00:00<?, ?it/s]

Generating embedding for new node: esd protection circuits
Generating embedding for new node: esd protection requirements


100%|██████████| 27000/27000 [00:00<00:00, 177700.20it/s]


Generating embedding for new node: reducing the voltage drop across the mos capacitor (i)
Removing embedding for node no longer in graph: 1 t
Removing embedding for node no longer in graph: 1 v/µs
Removing embedding for node no longer in graph: 100 pf shunt capacitor to ground
Removing embedding for node no longer in graph: 13 chapters
Removing embedding for node no longer in graph: 1500 ω resistor in series with the device under test (dut) path
Removing embedding for node no longer in graph: 1900 v
Removing embedding for node no longer in graph: 281 dc leakage result
Removing embedding for node no longer in graph: 285 extension/halo implant
Removing embedding for node no longer in graph: 3.3 v or lower-rated nmos
Removing embedding for node no longer in graph: 3.5 a
Removing embedding for node no longer in graph: 30
Removing embedding for node no longer in graph: 44-48
Removing embedding for node no longer in graph: 5 v-rated nmos
Removing embedding for node no longer in graph: 7 a
Re

...

Now create or load new graph...
Loading or using provided graph... Any txt data provided will be ignored...: None ./GRAPHDATA_TSMC/(Devices, Circuits, & Systems) Santosh K. Kurinec, Krzysztof Iniewski - Nanoscale Semiconductor Memories_ Technology and Applications-CRC Press (2013)_graph.graphml
--- 0.1541128158569336 seconds ---
Now grow the existing graph...
Now update node embeddings


 96%|█████████▌| 31757/33083 [02:39<00:06, 198.96it/s]   

/home/mkychsu/pool/TSMC/dataset_textbook_txt/(Devices, Circuits, and Systems) Iniewski, Krzysztof_ Reza, Salim - Semiconductor Radiation Detectors_ Technology and Applications-CRC Press (2018)/(Devices, Circuits, and Systems) Iniewski, Krzysztof_ Reza, Salim - Semiconductor Radiation Detectors_ Technology and Applications-CRC Press (2018).md
/home/mkychsu/pool/TSMC/dataset_textbook_txt/(Devices, circuits, and systems) Artur Balasinki - Semiconductors _ integrated circuit design for manufacturability-CRC (2011)/(Devices, circuits, and systems) Artur Balasinki - Semiconductors _ integrated circuit design for manufacturability-CRC (2011).md
/home/mkychsu/pool/TSMC/dataset_textbook_txt/(Devices, circuits, and systems) Krzysztof Iniewski - Semiconductor radiation detection systems-CRC Press_Taylor & Francis (2010)/(Devices, circuits, and systems) Krzysztof Iniewski - Semiconductor radiation detection systems-CRC Press_Taylor & Francis (2010).md
/home/mkychsu/pool/TSMC/dataset_textbook_txt/(




...

Now create or load new graph...
Loading or using provided graph... Any txt data provided will be ignored...: None ./GRAPHDATA_TSMC/(Diffusion and defect data., Pt. B,, Solid state phenomena _, v. 145-146) Paul Mertens_ Marc Meuris_ Marc Heyns - Ultra clean processing of semiconductor surfaces IX (UCPSS 2008) _ 9th International S_graph.graphml
--- 0.14722919464111328 seconds ---
Now grow the existing graph...
Now update node embeddings


100%|██████████| 33569/33569 [03:39<00:00, 153.19it/s]   


Now simplify graph.
Start...


100%|██████████| 36105/36105 [00:00<00:00, 615851.20it/s]

Node to keep and merge: defects <-- defect
Node to keep and merge: photolithography <-- lithography
Node to keep and merge: silicon wafers <-- silicon wafer
Node to keep and merge: silicon wafers <-- si wafers
Node to keep and merge: dry etching <-- dry-etching
Node to keep and merge: semiconductor device fabrication <-- semiconductor fabrication
Node to keep and merge: silicon oxide <-- silicon oxides
Node to keep and merge: wet etching <-- wet etch
Node to keep and merge: wet etching <-- wet etching method
Node to keep and merge: wet etching <-- wet etching process
Node to keep and merge: unwanted material <-- unwanted materials
Node to keep and merge: substrate <-- substrates
Node to keep and merge: improved performance <-- better performance
Node to keep and merge: pressure <-- pressures
Node to keep and merge: 300-mm wafers <-- 300 mm wafers
Node to keep and merge: anisotropic etching <-- anisotropic etchant
Node to keep and merge: etch residues <-- etch residue
Node to keep and m




New graph generated, nodes relabled. 


 46%|████▌     | 383/834 [00:12<00:14, 30.38it/s]

In [1]:
graph_GraphML

NameError: name 'graph_GraphML' is not defined

In [None]:
import numpy as np

while doc_list != []:
    doc = np.random.choice(doc_list)   
    i = doc_list.index(doc)
    
    title = doc.split('/')[-1].split('.pdf')[0]
    doc = doc.split('/')
    doc[-2]+=f'_txt'
    doc[-1]=title+f'/{title}.md'
    doc='/'.join(doc)
    
    title = doc.split('/')[-1].split('.md')[0]
    graph_root = f'{title}'
    print(f'{doc}')
    if os.path.exists(f'{title}.txt'):
        print(f'No. {i}: {title} has been read')
        doc_list.pop(i)
        continue
    
    if os.path.exists(f'{title}_err.txt'):
        print(f'No. {i}: {title} got something wrong.')
        doc_list.pop(i)
        continue
    with open(doc, "r") as f:
        txt = " ".join(f.read().splitlines())  # separate lines with a single space

    try:
        _, graph_GraphML, _, _, _ = make_graph_from_text(txt,generate_Mistral,
                              include_contextual_proximity=False,
                              graph_root=graph_root,
                              chunk_size=2000,chunk_overlap=200,
                              repeat_refine=0,verbatim=False,
                              data_dir=data_dir,
                              save_PDF=False,#TO DO
                             )
    except Exception as e:
        print(f'Something is wrong with No. {i}: {title}.')
        f = open(f'{title}_err.txt', 'w')
        f.write(f'{e}\n{txt}')
        f.close()          
        continue




In [14]:
# doc = doc_list[0]
# title = doc.split('/')[-1].split('.pdf')[0]
# graph_root = f'{title}'

G = nx.read_graphml(f'{data_dir_output}/4books_integrated.graphml')
print(f'KG loaded: {G}')
# node_embeddings = generate_node_embeddings(G, embedding_tokenizer, embedding_model, )

KG loaded: DiGraph with 17015 nodes and 26185 edges


In [54]:
node_sorted=sorted(list(G.nodes), key= lambda x: -len(x.split()))
node_sorted[0]

'j.m. rosamilia, t. boone, j. sapjeta, k. raghavachari, g.s. higashi, q. liu, in: g. higashi, m. hirose, s. raghavan, s. verhaverbeke (eds.), symposium proceedings, science technology for semiconductor surface preparations, 477, materials research society, pittsburgh, pa, 1997, p. 181'

In [52]:
G.in_edges(node_sorted[0]).

InEdgeDataView([('surface preparation for semiconductor manufacturing', 'j.m. rosamilia, t. boone, j. sapjeta, k. raghavachari, g.s. higashi, q. liu, in: g. higashi, m. hirose, s. raghavan, s. verhaverbeke (eds.), symposium proceedings, science technology for semiconductor surface preparations, 477, materials research society, pittsburgh, pa, 1997, p. 181')])

In [31]:
len(node_sorted)

17015

▪What are the knobs that can change the uniformity in radical sietching process?
 
▪How to increase the selectivity ratio (gas/power/pressure) of si to oxide in ICP (Inductively Coupled Plasma) etching?
 
▪How to reduce the particle in the dechuck step?
 
▪How to improve the cleaning or etching ability of Al particles?

In [14]:
# def split_documents_into_chunks(documents, chunk_size=600, overlap_size=100):
#     chunks = []
#     for document in documents:
#         for i in range(0, len(document), chunk_size - overlap_size):
#             chunk = document[i:i + chunk_size]
#             chunks.append(chunk)
#     return chunks

# def extract_elements_from_chunks(chunks):
#     elements = []
#     for index, chunk in enumerate(chunks):
#         response = client.chat.completions.create(
#             model="gpt-4",
#             messages=[
#                 {"role": "system", "content": "Extract entities and relationships from the following text."},
#                 {"role": "user", "content": chunk}
#             ]
#         )
#         entities_and_relations = response.choices[0].message.content
#         elements.append(entities_and_relations)
#     return elements

# def summarize_elements(elements):
#     summaries = []
#     for index, element in enumerate(elements):
#         response = client.chat.completions.create(
#             model="gpt-4",
#             messages=[
#                 {"role": "system", "content": "Summarize the following entities and relationships in a structured format. Use \"->\" to represent relationships, after the \"Relationships:\" word."},
#                 {"role": "user", "content": element}
#             ]
#         )
#         summary = response.choices[0].message.content
#         summaries.append(summary)
#     return summaries

# def build_graph_from_summaries(summaries):
#     G = nx.Graph()
#     for summary in summaries:
#         lines = summary.split("\n")
#         entities_section = False
#         relationships_section = False
#         entities = []
#         for line in lines:
#             if line.startswith("### Entities:") or line.startswith("**Entities:**"):
#                 entities_section = True
#                 relationships_section = False
#                 continue
#             elif line.startswith("### Relationships:") or line.startswith("**Relationships:**"):
#                 entities_section = False
#                 relationships_section = True
#                 continue
#             if entities_section and line.strip():
#                 entity = line.split(".", 1)[1].strip() if line[0].isdigit() and line[1] == "." else line.strip()
#                 entity = entity.replace("**", "")
#                 entities.append(entity)
#                 G.add_node(entity)
#             elif relationships_section and line.strip():
#                 parts = line.split("->")
#                 if len(parts) >= 2:
#                     source = parts[0].strip()
#                     target = parts[-1].strip()
#                     relation = " -> ".join(parts[1:-1]).strip()
#                     G.add_edge(source, target, label=relation)
#     return G

def detect_communities(graph):
    # communities = []
#     for component in nx.weakly_connected_components(graph):
#         subgraph = graph.subgraph(component)
#         if len(subgraph.nodes) > 1:
#             try:
#                 # sub_communities = algorithms.leiden(subgraph)
#                 sub_communities = nx.community.girvan_newman(subgraph)
                
#                 # for community in sub_communities.communities:
#                 for community in tqdm(sub_communities):
#                     communities.append(list(community))
       
#                 communities = sorted(map(sorted, next_level_communities))
#             except Exception as e:
#                 print(f"Error processing community: {e}")
#         else:
#             communities.append(list(subgraph.nodes))

    communities_generator = nx.community.girvan_newman(G)
    next_level_communities = next(communities_generator)
    # next_level_communities = next(communities_generator)
    communities = sorted(map(sorted, next_level_communities), key = lambda x: -len(x) )
    return communities

def summarize_communities(communities, graph, generate):
    community_summaries = []
    for index, community in tqdm(enumerate(communities)):
        subgraph = graph.subgraph(community)
        nodes = list(subgraph.nodes)
        edges = list(subgraph.edges(data=True))
        description = "Entities: " + ", ".join(nodes) + "\nRelationships: "
        relationships = []
        for edge in edges:
            relationships.append(
                f"{edge[0]} -> {edge[2]['title']} -> {edge[1]}")
        description += ", ".join(relationships)
        # try:
        response = generate(system_prompt= "Summarize the following community of entities and relationships.",
                                       prompt= description)
        # response = client.chat.completions.create(
        #     model="gpt-4",
        #     messages=[
        #         {"role": "system", "content": "Summarize the following community of entities and relationships."},
        #         {"role": "user", "content": description}
        #     ]
        # )
        # summary = response.choices[0].message.content.strip()
        # except:
        print(description)
        summary = response.strip()
        community_summaries.append(summary)
    return community_summaries

def generate_answers_from_communities(community_summaries, generate, query):
    intermediate_answers = []
    for summary in tqdm(community_summaries):
        try:
            response = generate(system_prompt= "Answer the following query based on the provided summary.",
                                       prompt=f"Query: {query} Summary: {summary}")
            # response = client.chat.completions.create(
            #     model="gpt-4",
            #     messages=[
            #         {"role": "system", "content": "Answer the following query based on the provided summary."},
            #         {"role": "user", "content": f"Query: {query} Summary: {summary}"}
            #     ]
            # )
            intermediate_answers.append(response)
        except:
            print(f'TL;DR: {summary[0:100]}...{summary[-100:]}')
            return 0
    final_response = generate(system_prompt= "Combine these answers into a final, concise response.",
                                prompt=f"Intermediate answers: {' '.join(intermediate_answers)}")

    # final_response = client.chat.completions.create(
    #     model="gpt-4",
    #     messages=[
    #         {"role": "system", "content": "Combine these answers into a final, concise response."},
    #         {"role": "user", "content": }
    #     ]
    # )
    # final_answer = final_response.choices[0].message.content
    return final_response

# def graph_rag_pipeline(documents, query, chunk_size=600, overlap_size=100):
def graph_rag_pipeline(graph, generate, query):
    # chunks = split_documents_into_chunks(documents, chunk_size, overlap_size)
    # elements = extract_elements_from_chunks(chunks)
    # summaries = summarize_elements(elements)
    # graph = build_graph_from_summaries(summaries)
    
    communities = detect_communities(graph)
    if verbatim:
        print("Number of Communities = ", len(communities))
    community_summaries = summarize_communities(communities, graph, generate)
    final_answer = generate_answers_from_communities(community_summaries, generate, query)
    return final_answer



In [16]:
graph=G
generate = generate_Mistral
communities = detect_communities(graph)

In [17]:
community_summaries = summarize_communities(communities, graph, generate)

1it [00:36, 36.10s/it]

Entities: chemical solution, chemical vapor deposition (cvd), chemicals, dry etching, etching, evolution, fabrication, fast process, fluorine, gas, high voltage, high-quality semiconductor devices, light, mask, materials science, new technologies, passivation, photolithography, photoresist, physical vapor deposition (pvd), plasma, precise control, reactive ion etching, relatively slow process, semiconductor device, semiconductor device fabrication, semiconductor devices, semiconductor fabrication process, semiconductor manufacturing, semiconductor manufacturing process, semiconductor physics, silicon, silicon nitride, silicon oxide, slow process, substrate material, vapor, wet etching, cvd, acidic solutions, pattern, plasma or ionizing radiation, unwanted material, fabrication process, semiconducting materials, substrate, pvd, exposure, corrosion, other forms of damage, protection layer, silicon dioxide, processes and techniques, computers, other electronic devices, smartphones, advanc


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =     188.08 ms /  6437 runs   (    0.03 ms per token, 34224.80 tokens per second)
llama_print_timings: prompt eval time =     622.09 ms /  1755 tokens (    0.35 ms per token,  2821.14 tokens per second)
llama_print_timings:        eval time =  100596.48 ms /  6436 runs   (   15.63 ms per token,    63.98 tokens per second)
llama_print_timings:       total time =  131662.81 ms /  8191 tokens
2it [02:47, 92.32s/it]Llama.generate: 27 prefix-match hit, remaining 1627 prompt tokens to eval


Entities: j. electrochem. soc. 144, l253, schwartz and schaible (1983), c.d. fung, j.p. joly, t.q. hurd et al., k. barla, l.j. olmer, reynolds et al. (1999), a. lippert et al., w. lee, r.s. srinivasan, proceedings of 2nd international symposium on ultra clean processing of silicon surfaces, raoux et al. (2008), raley et al. (2018), l. loewenstein, a.m. papon, l. li, j. alay, p. w. mertens, m. meuris, w. vandervorst, and m. m. heyns, o.d. patterson, discussion of future, extended abstracts of the 1991 international conference on solid state devices and materials, g.w. gale et al., l.a. cheema, posseme et al. (2014), oostende, belgium, b. bowman, m.f. charpin, semiconductor wafer bonding: science, technology, and applications 1992, m.b. burns, s. d. hossain pas, m. f. s. hossain, and m. a. s. hossain, j. lauerhaas, rudolph et al. (2004), shaqfeh and jurgensen (1989), s.s. lopez, 1995 ieee/semi advanced semiconductor manufacturing conference, w. yang, t. bearda, c. paillet, a. skumanich, 


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =     188.47 ms /  6538 runs   (    0.03 ms per token, 34690.06 tokens per second)
llama_print_timings: prompt eval time =     564.12 ms /  1627 tokens (    0.35 ms per token,  2884.14 tokens per second)
llama_print_timings:        eval time =  101890.20 ms /  6537 runs   (   15.59 ms per token,    64.16 tokens per second)
llama_print_timings:       total time =  132239.17 ms /  8164 tokens
3it [05:00, 110.55s/it]Llama.generate: 27 prefix-match hit, remaining 359 prompt tokens to eval


Entities: x. fu, d. landolt, mat. res. soc. proc. 566 (2000) 97, thin film applications in electronics, c.r. chang, j. bakke, a. karagoz, in: interconnect technology conference/advanced metallization conference (iitc/amc), 2016, p. 108, r. sreenivasan, g.w. rubloff, n. breil, e. zafiriou, y. xu, t. gougousi, ecs trans. 50 (39) (2013) 3, u. mahajan, h. iravani, k. xu, t. sahin, y. lei, g.b. basim, j.t. sears, k. mikhaylich, k.h. chang, j. kidder jr., s.h. shen, i. carlsson, ecs j. solid state sci. technol. 5 (6) (2016) p361, u.s. patent 5,672,543, g. jian, in: camp 3rd annual international symposium on cmp, lake placid, new york, 1998, r.k. singh, m. bielman, b.j. palla, z. ozdemir, http://electroiq.com/blog/2016/08/lam-research-enables-next-generation-memory-withindustrys-first-ald-process-for-low-fluorine-tungsten-fill/, j. electron. mater. 25 (9) (1996) 1531, s. mischler, p.t. chu, j. fung, t.y. liu, k. daito, e. rosset, d.o. shah, t.m. peng, o.h. gokce
Relationships: thin film appli


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       4.97 ms /   165 runs   (    0.03 ms per token, 33225.94 tokens per second)
llama_print_timings: prompt eval time =     119.77 ms /   359 tokens (    0.33 ms per token,  2997.44 tokens per second)
llama_print_timings:        eval time =    1895.81 ms /   164 runs   (   11.56 ms per token,    86.51 tokens per second)
llama_print_timings:       total time =    2084.74 ms /   523 tokens
4it [05:02, 67.73s/it] Llama.generate: 27 prefix-match hit, remaining 2038 prompt tokens to eval


Entities: style of communication, personal opinions or interpretations, informed decisions, john w. campbell, complex systems or processes, guide for students and researchers, academic writing, matter-of-fact language, chen, j., & zhang, y. (2018)., research findings, emotive language or personal anecdotes, 2018, accurate and reliable, scientific writing, scientific phenomena, oxford university press, facts and data, unbiased manner
Relationships: matter-of-fact language -> often used -> academic writing, matter-of-fact language -> important in scientific communication,important in scientific communication -> accurate and reliable, matter-of-fact language -> clear and concise manner -> complex systems or processes, matter-of-fact language -> avoid -> emotive language or personal anecdotes, matter-of-fact language -> conveys -> facts and data, matter-of-fact language -> crucial for making -> informed decisions, matter-of-fact language -> avoid -> personal opinions or interpretations, ma


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       5.65 ms /   189 runs   (    0.03 ms per token, 33439.49 tokens per second)
llama_print_timings: prompt eval time =     670.64 ms /  2038 tokens (    0.33 ms per token,  3038.87 tokens per second)
llama_print_timings:        eval time =    2465.50 ms /   188 runs   (   13.11 ms per token,    76.25 tokens per second)
llama_print_timings:       total time =    3223.98 ms /  2226 tokens
5it [05:05, 44.47s/it]Llama.generate: 27 prefix-match hit, remaining 751 prompt tokens to eval


Entities: f. shadman, r. governal, a. bonner, in: proceedings of institute of environmental sciences, annual technical meeting, 1990, p. 221, e. terrell, in: semiconductor pure water chemicals conference (spwcc) upw track, 2005, p. 91, a. tonti, in: extended abstracts, the electrochemical society fall meeting 91-2, the electrochemical society, pennington, nj, 1991, p. 758, l.m. loewenstein, f. charpin, p.w. mertens, j. electrochem. soc. 146 (2) (1999) 719, d. jan, i. ali, s. raghavan, in: proceedings of the institute of environmental sciences, annual technical meeting, 1991, p. 849, state phenom. 76 and 77 (2001) 43, surface preparation for semiconductor manufacturing, w.c. krussell, d.i. golland, in: j. ruzyllo, r.e. novak (eds.), first international symposium on cleaning technology in semiconductor device manufacturing, 90-9, the electrochemical society, pennington, nj, 1990, p. 23, r. chiarello, r. parker, m. tritapoe, micro 18 (6) (2000) 111, s. kasi, m. liehr, appl. phys. lett. 57


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       9.89 ms /   330 runs   (    0.03 ms per token, 33377.16 tokens per second)
llama_print_timings: prompt eval time =     251.43 ms /   751 tokens (    0.33 ms per token,  2986.89 tokens per second)
llama_print_timings:        eval time =    3932.89 ms /   329 runs   (   11.95 ms per token,    83.65 tokens per second)
llama_print_timings:       total time =    4356.27 ms /  1080 tokens
6it [05:09, 30.84s/it]Llama.generate: 27 prefix-match hit, remaining 1379 prompt tokens to eval


Entities: p. grutter, a. kikukawa, extended abstract '91 autumn meeting, jpn. soc. appl. phys. no.2 (1991) 426, j. appl. phys. 69, 668 (1991), t. hasegawa, j. vac. sci. technol. a8, 3386 (1990), k.ltaya, jpn. j. appl. phys. 31, 908 (1992) [in japanese], s. akamine, appl. phys. lett. 53, 487 (1988), s. hosaka, t. r. albrecht, appl. phys. lett. 53, 1045 (1998), g. meyer and n. m. amer, j. vac. sci. technol. all, 3092 (1993)
Relationships: a. kikukawa -> developed scanning capacitance microscopy. -> j. vac. sci. technol. all, 3092 (1993), a. kikukawa -> developed scanning capacitance microscopy. -> s. hosaka, k.ltaya -> developed Kerbin force microscopy. -> j. vac. sci. technol. a8, 3386 (1990), s. hosaka -> developed scanning capacitance microscopy. -> appl. phys. lett. 53, 487 (1988), s. hosaka -> developed scanning capacitance microscopy.,developed scanning capacitance microscopy. -> extended abstract '91 autumn meeting, jpn. soc. appl. phys. no.2 (1991) 426, s. hosaka -> developed sca


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =      16.64 ms /   565 runs   (    0.03 ms per token, 33958.41 tokens per second)
llama_print_timings: prompt eval time =     453.66 ms /  1379 tokens (    0.33 ms per token,  3039.75 tokens per second)
llama_print_timings:        eval time =    7148.99 ms /   564 runs   (   12.68 ms per token,    78.89 tokens per second)
llama_print_timings:       total time =    7980.16 ms /  1943 tokens
7it [05:17, 23.37s/it]Llama.generate: 27 prefix-match hit, remaining 317 prompt tokens to eval


Entities: h.b. profijt, s.e. potts, s.m.c.m. van de, w.m.m. kessels, j. vac. sci. technol. a29 (2011) 050801, h.b.r. lee, h. kim, electrochem. solid state lett. 9 (2006) g323, electrochemical monitoring, b.c. peethala, h.p. amanapu, u.r.k. lagudu, s.v. babu, j. electrochem. soc. 159 (2012) h582, j. yoon, h.b.r. lee, d. kim, t. cheon, s.h. kim, h. kim, j. electrochem. soc. 158 (2011) h1179, k. kim, k. lee, s. han, w. jeong, h. jeon, j. electrochem. soc. 154 (2007) h177, surface coatings monitoring, j. park, h.b.r. lee, d. kim, j. yoon, c. lansalot, j. gatineau, h. chevrel, h. kim, j. energy chem. 22 (2013) 403, k. kim, k. lee, s. han, t. park, y. lee, j. kim, s. yeom, h. jeon, jpn. j. appl. phys. 46 (2007) l173, j.m. kim, j.b.r. lee, l. lansalot, c. dussarrat, j. gatineau, h. kim, jpn. j. appl. phys. 49 (05fa10) (2010), j.-h. park, d.-y. moon, d.-s. han, y.-j. kang, s.-r. shin, h.-t. jeon, j.-w. park, surf. coat. technol. 259 (2014) 98
Relationships: electrochemical monitoring -> is a t


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       5.19 ms /   174 runs   (    0.03 ms per token, 33513.10 tokens per second)
llama_print_timings: prompt eval time =     114.39 ms /   317 tokens (    0.36 ms per token,  2771.12 tokens per second)
llama_print_timings:        eval time =    1995.32 ms /   173 runs   (   11.53 ms per token,    86.70 tokens per second)
llama_print_timings:       total time =    2181.98 ms /   490 tokens
8it [05:19, 16.62s/it]Llama.generate: 27 prefix-match hit, remaining 271 prompt tokens to eval


Entities: independent of t when diffusion length is small, measurable using high intensity of 10191020 photons per cm2, accurate measurement requires saturated photopotential, measurable by making surface flat, deduced from well-known function q8c(v), surface barrier height, measurable by making band flat, estimable using linear curve, measurable using gaaias laser diode, measurement of l, deduced from q8c(v)
Relationships: surface barrier height -> requires photopotential in saturated state for -> accurate measurement requires saturated photopotential, surface barrier height -> can be deduced from -> deduced from q8c(v), surface barrier height -> can be deduced from -> deduced from well-known function q8c(v), surface barrier height -> can be estimated using -> estimable using linear curve, surface barrier height -> when diffusion length is small, becomes independent of -> independent of t when diffusion length is small, surface barrier height -> can be measured by -> measurable by mak


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       4.92 ms /   164 runs   (    0.03 ms per token, 33340.11 tokens per second)
llama_print_timings: prompt eval time =     111.17 ms /   271 tokens (    0.41 ms per token,  2437.60 tokens per second)
llama_print_timings:        eval time =    1870.26 ms /   163 runs   (   11.47 ms per token,    87.15 tokens per second)
llama_print_timings:       total time =    2046.63 ms /   434 tokens
9it [05:21, 12.07s/it]Llama.generate: 27 prefix-match hit, remaining 253 prompt tokens to eval


Entities: principle of ecr plasma etching, microwave generated, magnetic field formed, independent control, structure and operation of an ecr plasma etcher, microwave introduced, etch chamber with quartz window, electrons go into cyclotron motion, of ion energy and plasma discharge, ecr increases probability of collisions
Relationships: principle of ecr plasma etching -> and generates high-density plasma -> ecr increases probability of collisions, principle of ecr plasma etching -> under the electric field -> electrons go into cyclotron motion, principle of ecr plasma etching -> orthogonally to the electric field -> magnetic field formed, principle of ecr plasma etching -> by a magnetron -> microwave generated, independent control -> and independent control of -> of ion energy and plasma discharge, structure and operation of an ecr plasma etcher -> under electric and magnetic fields -> electrons go into cyclotron motion, structure and operation of an ecr plasma etcher -> and electromag


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       5.13 ms /   170 runs   (    0.03 ms per token, 33119.03 tokens per second)
llama_print_timings: prompt eval time =      92.37 ms /   253 tokens (    0.37 ms per token,  2739.10 tokens per second)
llama_print_timings:        eval time =    1935.48 ms /   169 runs   (   11.45 ms per token,    87.32 tokens per second)
llama_print_timings:       total time =    2094.98 ms /   422 tokens
10it [05:24,  8.99s/it]Llama.generate: 27 prefix-match hit, remaining 214 prompt tokens to eval


Entities: comparison of different cvd techniques, due to the use of plasma etching, metal cvd vs. low pressure cvd, less controlled, reducing particle contamination, improving cooling efficiency, plasma cvd vs. metal cvd, higher particle contamination, atmospheric pressure cvd vs. low pressure cvd, due to the use of metal precursors
Relationships: comparison of different cvd techniques -> has -> atmospheric pressure cvd vs. low pressure cvd, comparison of different cvd techniques -> has -> plasma cvd vs. metal cvd, metal cvd vs. low pressure cvd -> has -> due to the use of metal precursors, metal cvd vs. low pressure cvd -> has -> higher particle contamination, improving cooling efficiency -> are necessary for future high-current systems -> reducing particle contamination, plasma cvd vs. metal cvd -> has -> due to the use of plasma etching, plasma cvd vs. metal cvd -> has -> reducing particle contamination, atmospheric pressure cvd vs. low pressure cvd -> has -> higher particle contami


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       5.53 ms /   182 runs   (    0.03 ms per token, 32917.34 tokens per second)
llama_print_timings: prompt eval time =      87.38 ms /   214 tokens (    0.41 ms per token,  2448.96 tokens per second)
llama_print_timings:        eval time =    2066.50 ms /   181 runs   (   11.42 ms per token,    87.59 tokens per second)
llama_print_timings:       total time =    2228.06 ms /   395 tokens
11it [05:26, 29.66s/it]

Entities: atomistic simulation, graves and brault, molecular dynamics simulation, si vertical walls, gate etching processes, damage formation, cl+ etching, hamaguchi et al., gou et al. (2010), computational technique
Relationships: graves and brault -> used -> molecular dynamics simulation, molecular dynamics simulation -> used to simulate the behavior of atoms and molecules under the influence of an ion beam -> computational technique, molecular dynamics simulation -> models -> damage formation, molecular dynamics simulation -> models -> si vertical walls, molecular dynamics simulation -> is related to -> cl+ etching, gate etching processes -> causes -> damage formation, hamaguchi et al. -> conducted atomistic simulations of plasma-surface interactions for ALD and ALE processes -> atomistic simulation, hamaguchi et al. -> used MD simulation to analyze surface reactions during SiO2 ALE -> molecular dynamics simulation, gou et al. (2010) -> conducted -> molecular dynamics simulation





In [18]:
# query = "What are the main techniques to make semiconductors?"
query = "What are the recent methods to do dry etching?"

last_response=''
for i, summary in tqdm(enumerate(community_summaries)):
    response = generate(system_prompt= "Answer the query detailedly based on the collected information and the combined with the last thought you have. ",
                               prompt=f"Query: {query} Collected information: {summary} You last thought: {last_response}")
    last_response=response
    print(last_response)


0it [00:00, ?it/s]Llama.generate: 8 prefix-match hit, remaining 54 prompt tokens to eval

llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       2.61 ms /    77 runs   (    0.03 ms per token, 29479.33 tokens per second)
llama_print_timings: prompt eval time =      42.97 ms /    54 tokens (    0.80 ms per token,  1256.69 tokens per second)
llama_print_timings:        eval time =     854.88 ms /    76 runs   (   11.25 ms per token,    88.90 tokens per second)
llama_print_timings:       total time =     927.54 ms /   130 tokens
1it [00:00,  1.07it/s]Llama.generate: 52 prefix-match hit, remaining 6522 prompt tokens to eval


 Recent methods for dry etching include plasma etching and reactive ion etching. Plasma etching uses a plasma to selectively remove material from a substrate, while reactive ion etching uses a gas plasma to selectively remove material from a substrate. Both methods are commonly used in the semiconductor industry for the fabrication of microelectronic devices.



llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =      47.00 ms /  1618 runs   (    0.03 ms per token, 34422.60 tokens per second)
llama_print_timings: prompt eval time =    2744.69 ms /  6522 tokens (    0.42 ms per token,  2376.23 tokens per second)
llama_print_timings:        eval time =   28682.74 ms /  1617 runs   (   17.74 ms per token,    56.38 tokens per second)
llama_print_timings:       total time =   33737.59 ms /  8139 tokens
2it [00:34, 20.24s/it]Llama.generate: 58 prefix-match hit, remaining 167 prompt tokens to eval


 1.discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses", "discusses


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       4.84 ms /   161 runs   (    0.03 ms per token, 33236.99 tokens per second)
llama_print_timings: prompt eval time =      82.19 ms /   167 tokens (    0.49 ms per token,  2031.80 tokens per second)
llama_print_timings:        eval time =    1822.90 ms /   160 runs   (   11.39 ms per token,    87.77 tokens per second)
llama_print_timings:       total time =    1969.69 ms /   327 tokens
4it [00:36,  8.17s/it]Llama.generate: 61 prefix-match hit, remaining 348 prompt tokens to eval


 Recent methods for dry etching include plasma etching, reactive ion etching, and chemisorption etching. These methods use different types of energy sources and chemical reactions to selectively remove material from a substrate surface. Plasma etching uses a plasma to break down the chemical bonds between atoms, while reactive ion etching uses a gas plasma to ionize the gas and create highly reactive ions that can etch the substrate. Chemisorption etching involves the adsorption of a chemical species onto the substrate surface, followed by the removal of the adsorbed species, which can etch the substrate. These methods are commonly used in microfabrication and semiconductor manufacturing to create complex patterns and structures on substrate materials.



llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       5.05 ms /   171 runs   (    0.03 ms per token, 33847.98 tokens per second)
llama_print_timings: prompt eval time =     118.54 ms /   348 tokens (    0.34 ms per token,  2935.72 tokens per second)
llama_print_timings:        eval time =    1968.29 ms /   170 runs   (   11.58 ms per token,    86.37 tokens per second)
llama_print_timings:       total time =    2156.17 ms /   518 tokens
5it [00:38,  6.31s/it]Llama.generate: 58 prefix-match hit, remaining 502 prompt tokens to eval


 Based on the collected information and the last thought, recent methods for dry etching include plasma etching, reactive ion etching, and chemisorption etching. These methods use different types of energy sources and chemical reactions to selectively remove material from a substrate surface. Plasma etching uses a plasma to break down the chemical bonds between atoms, while reactive ion etching uses a gas plasma to ionize the gas and create highly reactive ions that can etch the substrate. Chemisorption etching involves the adsorption of a chemical species onto the substrate surface, followed by the removal of the adsorbed species, which can etch the substrate. These methods are commonly used in microfabrication and semiconductor manufacturing to create complex patterns and structures on substrate materials.



llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       5.04 ms /   171 runs   (    0.03 ms per token, 33908.39 tokens per second)
llama_print_timings: prompt eval time =     150.81 ms /   502 tokens (    0.30 ms per token,  3328.60 tokens per second)
llama_print_timings:        eval time =    1995.42 ms /   170 runs   (   11.74 ms per token,    85.20 tokens per second)
llama_print_timings:       total time =    2217.18 ms /   672 tokens
6it [00:41,  5.06s/it]Llama.generate: 60 prefix-match hit, remaining 735 prompt tokens to eval


 Based on the collected information and the last thought, recent methods for dry etching include plasma etching, reactive ion etching, and chemisorption etching. These methods use different types of energy sources and chemical reactions to selectively remove material from a substrate surface. Plasma etching uses a plasma to break down the chemical bonds between atoms, while reactive ion etching uses a gas plasma to ionize the gas and create highly reactive ions that can etch the substrate. Chemisorption etching involves the adsorption of a chemical species onto the substrate surface, followed by the removal of the adsorbed species, which can etch the substrate. These methods are commonly used in microfabrication and semiconductor manufacturing to create complex patterns and structures on substrate materials.



llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       5.01 ms /   171 runs   (    0.03 ms per token, 34131.74 tokens per second)
llama_print_timings: prompt eval time =     250.17 ms /   735 tokens (    0.34 ms per token,  2937.96 tokens per second)
llama_print_timings:        eval time =    2027.42 ms /   170 runs   (   11.93 ms per token,    83.85 tokens per second)
llama_print_timings:       total time =    2348.40 ms /   905 tokens
7it [00:43,  4.24s/it]Llama.generate: 58 prefix-match hit, remaining 346 prompt tokens to eval


 Based on the collected information and the last thought, recent methods for dry etching include plasma etching, reactive ion etching, and chemisorption etching. These methods use different types of energy sources and chemical reactions to selectively remove material from a substrate surface. Plasma etching uses a plasma to break down the chemical bonds between atoms, while reactive ion etching uses a gas plasma to ionize the gas and create highly reactive ions that can etch the substrate. Chemisorption etching involves the adsorption of a chemical species onto the substrate surface, followed by the removal of the adsorbed species, which can etch the substrate. These methods are commonly used in microfabrication and semiconductor manufacturing to create complex patterns and structures on substrate materials.



llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       4.98 ms /   171 runs   (    0.03 ms per token, 34344.25 tokens per second)
llama_print_timings: prompt eval time =     118.97 ms /   346 tokens (    0.34 ms per token,  2908.37 tokens per second)
llama_print_timings:        eval time =    1972.57 ms /   170 runs   (   11.60 ms per token,    86.18 tokens per second)
llama_print_timings:       total time =    2161.18 ms /   516 tokens
8it [00:45,  3.61s/it]Llama.generate: 58 prefix-match hit, remaining 336 prompt tokens to eval


 Based on the collected information and the last thought, recent methods for dry etching include plasma etching, reactive ion etching, and chemisorption etching. These methods use different types of energy sources and chemical reactions to selectively remove material from a substrate surface. Plasma etching uses a plasma to break down the chemical bonds between atoms, while reactive ion etching uses a gas plasma to ionize the gas and create highly reactive ions that can etch the substrate. Chemisorption etching involves the adsorption of a chemical species onto the substrate surface, followed by the removal of the adsorbed species, which can etch the substrate. These methods are commonly used in microfabrication and semiconductor manufacturing to create complex patterns and structures on substrate materials.



llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       8.16 ms /   274 runs   (    0.03 ms per token, 33582.55 tokens per second)
llama_print_timings: prompt eval time =     117.37 ms /   336 tokens (    0.35 ms per token,  2862.81 tokens per second)
llama_print_timings:        eval time =    3167.52 ms /   273 runs   (   11.60 ms per token,    86.19 tokens per second)
llama_print_timings:       total time =    3410.27 ms /   609 tokens
9it [00:49,  3.55s/it]Llama.generate: 58 prefix-match hit, remaining 445 prompt tokens to eval


 Recent methods to do dry etching include plasma etching, reactive ion etching, and chemisorption etching. Plasma etching uses a plasma to break down the chemical bonds between atoms, while reactive ion etching uses a gas plasma to ionize the gas and create highly reactive ions that can etch the substrate. Chemisorption etching involves the adsorption of a chemical species onto the substrate surface, followed by the removal of the adsorbed species, which can etch the substrate. These methods are commonly used in microfabrication and semiconductor manufacturing to create complex patterns and structures on substrate materials. The ECR plasma etcher is one of the most commonly used plasma etching systems, which generates a high-density plasma, increasing the probability of collisions. Electrons go into cyclotron motion under the electric field, and the magnetic field is formed orthogonally to the electric field. The ECR plasma etcher is generated by a magnetron, and the microwave is intro


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       8.07 ms /   274 runs   (    0.03 ms per token, 33936.09 tokens per second)
llama_print_timings: prompt eval time =     138.90 ms /   445 tokens (    0.31 ms per token,  3203.79 tokens per second)
llama_print_timings:        eval time =    3197.52 ms /   273 runs   (   11.71 ms per token,    85.38 tokens per second)
llama_print_timings:       total time =    3463.20 ms /   718 tokens
10it [00:52,  3.53s/it]Llama.generate: 56 prefix-match hit, remaining 459 prompt tokens to eval


 Recent methods to do dry etching include plasma etching, reactive ion etching, and chemisorption etching. Plasma etching uses a plasma to break down the chemical bonds between atoms, while reactive ion etching uses a gas plasma to ionize the gas and create highly reactive ions that can etch the substrate. Chemisorption etching involves the adsorption of a chemical species onto the substrate surface, followed by the removal of the adsorbed species, which can etch the substrate. These methods are commonly used in microfabrication and semiconductor manufacturing to create complex patterns and structures on substrate materials. The ECR plasma etcher is one of the most commonly used plasma etching systems, which generates a high-density plasma, increasing the probability of collisions. Electrons go into cyclotron motion under the electric field, and the magnetic field is formed orthogonally to the electric field. The ECR plasma etcher is generated by a magnetron, and the microwave is intro


llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       8.12 ms /   274 runs   (    0.03 ms per token, 33752.16 tokens per second)
llama_print_timings: prompt eval time =     145.10 ms /   459 tokens (    0.32 ms per token,  3163.27 tokens per second)
llama_print_timings:        eval time =    3200.71 ms /   273 runs   (   11.72 ms per token,    85.29 tokens per second)
llama_print_timings:       total time =    3473.53 ms /   732 tokens
11it [00:55,  5.09s/it]

 Recent methods to do dry etching include plasma etching, reactive ion etching, and chemisorption etching. Plasma etching uses a plasma to break down the chemical bonds between atoms, while reactive ion etching uses a gas plasma to ionize the gas and create highly reactive ions that can etch the substrate. Chemisorption etching involves the adsorption of a chemical species onto the substrate surface, followed by the removal of the adsorbed species, which can etch the substrate. These methods are commonly used in microfabrication and semiconductor manufacturing to create complex patterns and structures on substrate materials. The ECR plasma etcher is one of the most commonly used plasma etching systems, which generates a high-density plasma, increasing the probability of collisions. Electrons go into cyclotron motion under the electric field, and the magnetic field is formed orthogonally to the electric field. The ECR plasma etcher is generated by a magnetron, and the microwave is intro




In [19]:
last_response

' Recent methods to do dry etching include plasma etching, reactive ion etching, and chemisorption etching. Plasma etching uses a plasma to break down the chemical bonds between atoms, while reactive ion etching uses a gas plasma to ionize the gas and create highly reactive ions that can etch the substrate. Chemisorption etching involves the adsorption of a chemical species onto the substrate surface, followed by the removal of the adsorbed species, which can etch the substrate. These methods are commonly used in microfabrication and semiconductor manufacturing to create complex patterns and structures on substrate materials. The ECR plasma etcher is one of the most commonly used plasma etching systems, which generates a high-density plasma, increasing the probability of collisions. Electrons go into cyclotron motion under the electric field, and the magnetic field is formed orthogonally to the electric field. The ECR plasma etcher is generated by a magnetron, and the microwave is intr

In [20]:
final_response = generate(system_prompt= "Combine these answers into a final, concise response.",
                            prompt=f" answers: {last_response}")


Llama.generate: 8 prefix-match hit, remaining 298 prompt tokens to eval

llama_print_timings:        load time =     187.56 ms
llama_print_timings:      sample time =       4.21 ms /   143 runs   (    0.03 ms per token, 33934.50 tokens per second)
llama_print_timings: prompt eval time =     112.75 ms /   298 tokens (    0.38 ms per token,  2642.92 tokens per second)
llama_print_timings:        eval time =    1628.65 ms /   142 runs   (   11.47 ms per token,    87.19 tokens per second)
llama_print_timings:       total time =    1796.89 ms /   440 tokens


In [None]:
# query = "What are the main techniques to make semiconductors?"
query = "What are the recent methods to do dry etching?"

last_response=''
for i, summary in tqdm(enumerate(community_summaries)):
    response = generate(system_prompt= "Answer the query detailedly based on the collected information and the combined with the last thought you have. ",
                               prompt=f"Query: {query} Collected information: {summary} You last thought: {last_response}")
    last_response=response
    print(last_response)

In [None]:
response, (best_node_1, best_similarity_1, best_node_2, best_similarity_2), path, path_graph, shortest_path_length, fname, graph_GraphML = find_path_and_reason(
    G, 
    node_embeddings,
    embedding_tokenizer, 
    embedding_model, 
    generate_Mistral, 
    data_dir=data_dir_output,
    verbatim=verbatim,
    include_keywords_as_nodes=True,  # Include keywords in the graph analysis
    keyword_1="Temperature",
    keyword_2="Semiconductors",
    N_limit=9999,  # The limit for keywords, triplets, etc.
    instruction='What is the best temperature when manufacturing semiconductors.',
    keywords_separator=', ',
    graph_analysis_type='nodes and relations',
    temperature=0.3, 
    inst_prepend='### ',  # Instruction prepend text
    prepend='''You are given a set of information from a graph that describes the relationship 
               between materials and manufacturing process. You analyze these logically 
               through reasoning.\n\n''',  # Prepend text for analysis
    visualize_paths_as_graph=True,  # Whether to visualize paths as a graph
    display_graph=True,  # Whether to display the graph
)
display(Markdown(response))

In [None]:
path

In [None]:
visualize_embeddings_2d_pretty_and_sample(node_embeddings, n_clusters=10, n_samples=10, data_dir=data_dir_output, alpha=.7)


In [None]:
# describe_communities_with_plots_complex(G, N=6, data_dir=data_dir_output)


In [None]:
# graph_statistics_and_plots_for_large_graphs(G, data_dir=data_dir_output,include_centrality=False,
                                               # make_graph_plot=False,)

In [None]:
is_scale_free (G, data_dir=data_dir_output)

In [None]:
# find_best_fitting_node_list("semiconductor", node_embeddings, embedding_tokenizer, embedding_model, 5)

In [None]:
# find_best_fitting_node_list("better manufactoring process for semiconductor", node_embeddings , embedding_tokenizer, embedding_model, 5)

In [20]:
(best_node_1, best_similarity_1, best_node_2, best_similarity_2), path, path_graph, shortest_path_length, fname, graph_GraphML=find_path( G, node_embeddings,
                                embedding_tokenizer, embedding_model , second_hop=False, data_dir=data_dir_output,
                                  keyword_1 = "new materials", keyword_2 = "semiconductor",
                                      similarity_fit_ID_node_1=0, similarity_fit_ID_node_2=0,
                                       )



0nth best fitting node for 'new materials': 'deposited materials' with similarity: 0.7707762399971468
0nth best fitting node for 'semiconductor': 'semiconductor device' with similarity: 0.8970911224120532


NodeNotFound: Either source deposited materials or target semiconductor device is not in G

In [None]:
# path

In [None]:
# path_list, path_string=print_path_with_edges_as_list(G , path)
# path_list,path_string

In [None]:
# visualize_paths_pretty([path_list], 'knowledge_graph_paths.svg', display_graph=True,data_dir=data_dir_output, scale=0.75)

In [None]:
# triplets=find_all_triplets(path_graph) 

In [None]:
# triplets