# Get the model from huggingface 
HOWTO: Get tokenizer, model, load in 4 bits, make a pipeline for the tokenizer+model

In [1]:
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, pipeline
from transformers import BitsAndBytesConfig
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

quantization_config = BitsAndBytesConfig( # To load model in 4 bit quantization
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Get the model from huggingface
model_path="google/flan-t5-xxl" # This is text2text (encoder-decoder) model - good for translation, extraction etc
# model_path="mistralai/Mistral-7B-v0.1" # This is text generation (decoder only) model - good for chatbot
# model_path="tiiuae/falcon-7b-instruct"

config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                         padding='longest')#, padding_side='left')
# model = AutoModelForCausalLM.from_pretrained(model_path,  # Use with text generation model
model = T5ForConditionalGeneration.from_pretrained(model_path, 
                                             quantization_config=quantization_config,
                                             # load_in_4bit=True, 
                                             device_map="cuda:0")

# Transformers pipeline
pipeline = pipeline(
        # "text-generation",
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        # use_cache=True,
        device_map="cuda:0",
        max_length=1024, # Experiment with this 
        do_sample=True, 
        top_k=5,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
# # Test the model
# input_text = "Describe the solar system."
# input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
# outputs = model.generate(input_ids, max_length=1024)
# print(tokenizer.decode(outputs[0]))

# Test the pipeline
print(pipeline("Describe the solar system in detail")[0])

print(pipeline("Describe the solar system in brief")[0])

{'generated_text': 'Describe the sun'}
{'generated_text': 'The sun is the largest object in the solar system and it is located in the constellation of Cancer. This constellation is also the name given to the group of objects which includes the Moon, planets and comets.'}


# Load pipeline into LangChain 
Advantages: Langchain tools - like prompt templates, memory, etc 

In [3]:
# Load into langchain
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

llm = HuggingFacePipeline(pipeline=pipeline)

#### Prompt
template = """<s>[INST] You are a helpful, respectful and honest assistant. Answer exactly in few words from the context
Answer the question below from context below :
Context: {context}
Question: {question} [/INST] </s>
"""

# template = ''' <s>[INST] 
# Context: {context}
# Question: {question} [/INST] </s>
# '''

question_p = """What is the date for announcement"""
context_p = """ On August 10 said that its arm JSW Neo Energy has agreed to buy a portfolio of 1753 mega watt renewable energy generation capacity from Mytrah Energy India Pvt Ltd for Rs 10,530 crore."""
prompt = PromptTemplate(template=template, input_variables=["question","context"])
# llm_chain = LLMChain(prompt=prompt, llm=llm)
llm_chain = prompt | llm
response = llm_chain.invoke({"question":question_p,"context":context_p})
response

'On August 10 said that its arm JSW Neo Energy has agreed to buy a portfolio of 1753 mega watt renewable energy generation capacity'

# Tring out some different propmts

In [4]:
llm(""" <s> [INST] 
Context: On August 10 said that its arm JSW Neo Energy has agreed to buy a portfolio of 1753 mega watt renewable energy generation capacity from Mytrah Energy India Pvt Ltd for Rs 10,530 crore.
Give the date for announcement. Let's think step by step.[/INST] </s>""")

'On August 10, JSW Steel said that its arm JSW Neo Energy has agreed to buy a portfolio of 1753 mega watt renewable energy generation capacity from Mytrah Energy India Pvt Ltd for Rs 10,530 crore. The answer: August 10.'

In [5]:
print(llm('''
I went to the market and bought 10 apples. I gave 2 apples to the neighbor and 2 to the repairman. I then went and bought 5 more apples and ate 1. How many apples did I remain with?
Let's think step by step.'''))

I had 10 apples and gave out 2 and 2 = 4 apples. I then bought 5 and ate 1 = 4 apples. I was left with 4 apples after giving away the neighbor and repairman. The answer: 4.


In [6]:
print(llm('''
Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done,
there will be 21 trees. How many trees did the grove workers plant today?
A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted.
So, they must have planted 21 - 15 = 6 trees. The answer is 6.

Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.

Q: When I was 6 my sister was half my age. Now I’m 70 how old is my sister?
A:'''))

When I was six my sister was half my age so she was 6 / 2 = 3 years old. Now that I am 70 my sister is 70 - 30 = 30 years old. The answer is 30.


Okay these models are pretty bad 

# Okay, enough play. Getting back to materials extraction

In [7]:
llm('''
Extract the alloy composition, preperation method, phase and property analysed from given context. If not present, answer 'unknown".

Context: High-entropy alloys are newly developed alloys that are composed, by deﬁnition, of at least ﬁve principal elements with concentrations in the range of 5–35 at.%. Therefore, the alloying behavior of any given principal element is signiﬁcantly affected by all the other principal elements present. In order to elucidate this further, the inﬂuence of iron, silver and gold addition on the microstructure and hardness of AlCoCrCuNi-based equimolar alloys has been examined. The as-cast AlCoCrCuNi base alloy is found to have a dendritic structure, of which only solid solution FCC and BCC phases can be observed. The BCC dendrite has a chemical composition close to that of the nominal alloy, with a deﬁciency in copper however, which is found to segregate and form a FCC Cu-rich interdendrite. The microstructure of the iron containing alloys is similar to that of the base alloy. It is found that both of these aforementioned alloys have hardnesses of about 420 HV, which is equated to their similar microstructures.
''')

'Al-Co-Cr-Cu-Ni-Al, AlCoCoCrCuNi-Al-Co-Co-Cr-Ni-Al-CoCoNi-AlCoCoCrCu-AlCoCoCrCuNi-AlCoCoCoCrCuNi-AlCoCoCoCrCuNi, AlCoCoCoCoCrCuCuNi, AlCoCoCoCoCrCuNi-AlCoCoCoCrCuNi-AlCoCoCoCuNi, AlCoCrCuNi-AlCoCrCuNi-based, AlCoCoCoCrCuNi-based, AlCoCoCoCrCuNi-based'

#### Trying output from a reference text file

In [46]:
# First we need a prompt template and chain
# template = """Extract the alloy compositions, preperation method, phase and property analysed from given context. If not present, answer 'unknown".
template = """Extract the alloy compositions, preperation method, phase, property analysed from given context. Also extract values for these as well, if available. If any information is not present, answer 'unknown".

Context: {context}
"""
prompt = PromptTemplate(template=template, input_variables=["context"])
llm_chain = prompt | llm

# Now,...........
lines = open("reference_txts/reference_1.txt", 'r').readlines()
answers = []
for line in lines: 
    answers.append(llm_chain.invoke({"context":line}))

# To run as a batch
# answers = llm_chain.batch([{"context":line} for line in lines])



In [47]:
answers

['Unknown',
 'AlCoCrCuNi, as-cast, FCC, BCC, 420 HV',
 'Ag, Cu, Pd, Pt, Cu, Pb, Sn, Sb, Au, Pd, Pc, Cu, Pt, Pb, Pc, Sb, Sr, Pd, Pt, Pd, Cu, Pt, Pb, Cu, Pd',
 'The spherical particles were produced from 0.5 m diameter particles of the alumino-silicate glass powders by sonication, then the spherical powder was dried by heat treatment, then the powder was smelted in a crucible at 1300 °C for 1 hour and rolled into sheets.',
 'Unknown',
 'Unknown',
 'unknown',
 'AlCoCrCuNi, , microstructure, hardness',
 'Unknown',
 'AlCoCrCuNi, AlCoCrCuFeNi, AgAlCoCrCuNi, AlAuCoCrCuNi, hardness',
 'Unknown',
 'AlCoCrCuNi, XRD, phase, BCC, FCC, property, volume fraction, undefined',
 'Cast',
 'AlxCoCrCuFeNi, preparation method, phase, ID, respectively',
 'AlCoCrCuNi base alloy, casting, unknown, phase, unknown, property analysed',
 'unknown',
 'AlCoCrCuNi: AlCoCrCuFeNi: AlCoCrCuFeNi%, %',
 'cast alloy, composition, interdendrite region, majority dendrite, Cu, Co, Cr, Ni, Ni',
 'AlCoCuNiFe AlCuNiFeAlCrCuNiFe

In [50]:
answers = list(filter(lambda x: " " in x, answers))
answers

['AlCoCrCuNi, as-cast, FCC, BCC, 420 HV',
 'Ag, Cu, Pd, Pt, Cu, Pb, Sn, Sb, Au, Pd, Pc, Cu, Pt, Pb, Pc, Sb, Sr, Pd, Pt, Pd, Cu, Pt, Pb, Cu, Pd',
 'The spherical particles were produced from 0.5 m diameter particles of the alumino-silicate glass powders by sonication, then the spherical powder was dried by heat treatment, then the powder was smelted in a crucible at 1300 °C for 1 hour and rolled into sheets.',
 'AlCoCrCuNi, , microstructure, hardness',
 'AlCoCrCuNi, AlCoCrCuFeNi, AgAlCoCrCuNi, AlAuCoCrCuNi, hardness',
 'AlCoCrCuNi, XRD, phase, BCC, FCC, property, volume fraction, undefined',
 'AlxCoCrCuFeNi, preparation method, phase, ID, respectively',
 'AlCoCrCuNi base alloy, casting, unknown, phase, unknown, property analysed',
 'AlCoCrCuNi: AlCoCrCuFeNi: AlCoCrCuFeNi%, %',
 'cast alloy, composition, interdendrite region, majority dendrite, Cu, Co, Cr, Ni, Ni',
 'AlCoCuNiFe AlCuNiFeAlCrCuNiFeAlCrCuNiFe',
 'AlCoCrCuFeNi, cast, crystal structure, crystal structure and microstructure',


In [60]:
string = ' '.join(answers)
string = f"Extract all the alloy multicompositions like 'AlCoCrCuFe' from the context.\n\nContext: {string}"
print(string)
print("#"*500)
print(llm(string))

Extract all the alloy multicompositions like 'AlCoCrCuFe' from the context.

Context: The alloy composition was 0.02% by mass Fe and 0.8% Cu and the microstructure is amorphous. Al-Co-Cr-Cu-Ni-based alloys, %, Al-Co-Cr-Cu-Ni-based alloys, %, FCC, BCC, Cu-rich, Cu, %, Cu-rich %Silver, %Cu, %Cu, Cu, Ag, Ag, Cu, Ag, Cu, Cu, Ag, Cu, Cu, Cu, Cu In this study, the effect of the addition of niobium and titanium on the microstructures and mechanical properties of the reconstituted tungsten-nickel-iron-nickel alloy (RNI) has been investigated. Thermogravitational energy (G) of the system. m = 1 wt.%, s = 0.035 wt.% Inclusions: Inclusions, %, %, %, %, %, %, % AlCoCrCuNi, Fe, Cu, Ni, Ag, Au, Ag, AgCu, NiCu, NiCu, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo The alloy compositions were: -0.1% -0.25 -0.28 -0.5% -0.4% -0.6% -0.8% -0.25% -0.25 -0.28% -0.5% -0.25% -0.52% AlCoCrCuNi, AlCoCrCuFeNi, AgAlCoCrCuNi, AlAuCoCrCuNi, hardness The results of tens



AlCoCrCuNi AlCoCrCuFeNi


In [53]:
llm_chain.invoke({"context":string})



'Cu-Ni alloy, casting, unknown, phase, unknown, undefined, property analysed AlCoCrCuNi, AgAlCoCrCuNi, AlAuCoCrCuNi, Ag, Co, Cr, Cu, Ni, unknown, AgAlCoCrCuNi, unknown Ag-Cu, Ag-Cu -Cu, Ag-Cu, Cu, -Cu, Cu AlCoCrCuNi, AlCoCrCuNi-Fe, Fe-containing AlCoCrCuNi AlAcCoCrCuNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuFeNi, AlxCoCrCuNi, undefined, %, % cast alloy, composition, interdendrite region, majority dendrite, Ag, Cu, -Ag, Ag, -Ag, -Ag, Ag, n-Ag, n-Ag, n-Ag, n-Ni, n-Ag, n-Ni, n-Ni, n-Ni, n-Ni, n-Ni, n-Ni, n-Ni, n-Ni, -Cr, n-Ni'

#### Trying out summary to see if it performs any better

In [57]:
# First we need a prompt template and chain
# template = """Extract the alloy compositions, preperation method, phase and property analysed from given context. If not present, answer 'unknown".
template = """Summarize and keep all relevant information about the alloy compositions, preperation method, phase, property analysed from given context. Also extract values for these as well, if available.

Context: {context}
"""
prompt = PromptTemplate(template=template, input_variables=["context"])
llm_chain = prompt | llm

# Now,...........
lines = open("reference_txts/reference_1.txt", 'r').readlines()
answers = []
for line in lines: 
    answers.append(llm_chain.invoke({"context":line}))

# To run as a batch
# answers = llm_chain.batch([{"context":line} for line in lines])



In [58]:
answers

['The alloy composition was 0.02% by mass Fe and 0.8% Cu and the microstructure is amorphous.',
 'Al-Co-Cr-Cu-Ni-based alloys, %, Al-Co-Cr-Cu-Ni-based alloys, %, FCC, BCC, Cu-rich, Cu, %, Cu-rich',
 '%Silver, %Cu, %Cu, Cu, Ag, Ag, Cu, Ag, Cu, Cu, Ag, Cu, Cu, Cu, Cu',
 'In this study, the effect of the addition of niobium and titanium on the microstructures and mechanical properties of the reconstituted tungsten-nickel-iron-nickel alloy (RNI) has been investigated.',
 'Thermogravitational energy (G) of the system.',
 'm = 1 wt.%, s = 0.035 wt.%',
 'Inclusions: Inclusions, %, %, %, %, %, %, %',
 'AlCoCrCuNi, Fe, Cu, Ni, Ag, Au, Ag, AgCu, NiCu, NiCu, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo, NiCo',
 'The alloy compositions were: -0.1% -0.25 -0.28 -0.5% -0.4% -0.6% -0.8% -0.25% -0.25 -0.28% -0.5% -0.25% -0.52%',
 'AlCoCrCuNi, AlCoCrCuFeNi, AgAlCoCrCuNi, AlAuCoCrCuNi, hardness',
 'The results of tensile tests showed that the tensile strength w