# Mixtral Prompt Engineering

This notebook runs the 4-bit quantized Mixtral 8x7B model locally with `llama_cpp` (llama-cpp-python).

Before you run this notebook, please follow the instructions below to properly set up all the dependencies.  

Run one of the following commands, according to your system:

```shell
# Base llama-cpp-python with no GPU acceleration
pip install llama-cpp-python
# With NVidia CUDA acceleration
CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
# Or with OpenBLAS acceleration
CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
# Or with CLBLast acceleration
CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
# Or with AMD ROCm GPU acceleration (Linux only)
CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
# Or with Metal GPU acceleration for macOS systems only
CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python

# On Windows, set the CMAKE_ARGS variable in PowerShell using this format; e.g. for NVidia CUDA:
$env:CMAKE_ARGS = "-DLLAMA_CUBLAS=on"
pip install llama-cpp-python
```

In [1]:
import os
import time

import pandas as pd
from llama_cpp import Llama

## Model Loading

In [2]:
# Directory that holds the locally downloaded GGUF weights.
# Set the MIXTRAL_MODEL_DIR environment variable to point at your own copy;
# otherwise the original author's LM Studio cache location is used.
# os.path.join(..., "") guarantees a trailing path separator, which matters
# because the model-loading cell appends the file name by plain concatenation.
model_path = os.path.join(
    os.environ.get(
        "MIXTRAL_MODEL_DIR",
        "/Users/haydenchiu/.cache/lm-studio/models/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/",
    ),
    "",
)

In [3]:
# Runtime settings for llama.cpp (also reused as defaults further down).
n_ctx = 10000      # max sequence length; longer sequences need much more resources
n_batch = 512
n_gpu_layers = -1  # layers to offload to the GPU; set to 0 if no GPU acceleration is available

# Load the quantized Mixtral weights. The GGUF file must already be present
# inside `model_path` (download it first).
llm = Llama(
    model_path=model_path + "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf",
    chat_format="llama-2",
    n_ctx=n_ctx,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    n_threads=8,    # CPU threads; tailor to your system and the resulting performance
    f16_kv=True,
    verbose=False,  # switch to True to investigate the logs
)

In [4]:
# Smoke test: a single raw completion with the [INST] ... [/INST] wrapper added by hand.
prompt = "What is the capital city of British Columbia? Answer in 1 sentence"
output = llm(
    prompt="[INST] " + prompt + " [/INST]",
    echo=False,      # do not echo the prompt back in the result
    max_tokens=512,  # generate up to 512 tokens
)
output["choices"][0]["text"]

' Victoria is the capital city of British Columbia, a province on the western coast of Canada.'

In [5]:
# Smoke test for the chat-completion API.
prompt_1 = "What is the capital city of British Columbia? Answer in 1 sentence"
prompt_2 = "What is your favorite city?"

# The conversation is one system message followed by two consecutive user turns.
output = llm.create_chat_completion(
    messages=[
        dict(role="system", content="You are a writing assistant."),
        dict(role="user", content=prompt_1),
        dict(role="user", content=prompt_2),
    ],
)

output["choices"][0]["message"]["content"]

" I don't have personal experiences or feelings, so I don't have a favorite city. But I can tell you that the capital city of British Columbia, Canada is Victoria."

In [6]:
# Plain text completion of the bare question (no [INST] wrapper this time).
output = llm.create_completion(prompt=prompt, max_tokens=1000)
output["choices"][0]["text"]

'.\n\nVictoria is the capital city of British Columbia, located on the southern end of Vancouver Island.'

# Data Loading

In [7]:
# --- Mini development split (used for the prompt experiments below) ---
mini_dev_filepath = "../../data/mini_dataset/"
mini_dev_elife_filename = "eLife_val_mini_milestone3.jsonl"
mini_dev_plos_filename = "PLOS_val_mini_milestone3.jsonl"

# --- Full development split (currently disabled) ---
# full_dev_filepath = "/content/drive/MyDrive/BioLaySumm2024_main/data/full_dev_dataset/"
# full_dev_plos_filename = "PLOS_val.jsonl"
# full_dev_elife_filename = "eLife_val.jsonl"

# --- Test split ---
test_filepath = "../../milestone1/data/biolaysumm2024_data/"
test_elife_filename = "eLife_test.jsonl"
test_plos_filename = "PLOS_test.jsonl"


def read_jsonl(filepath, filename):
    """Load a JSON Lines file into a pandas DataFrame.

    Parameters
    ----------
    filepath : str
        Directory prefix. It is concatenated directly with ``filename``, so it
        must already end with a path separator.
    filename : str
        Name of the JSON Lines file inside ``filepath``.

    Returns
    -------
    pandas.DataFrame
        The parsed records, one row per line of the file.
    """
    full_path = filepath + filename
    return pd.read_json(full_path, lines=True, orient="records")

In [8]:
# Mini development data.
mini_plos_df = read_jsonl(filepath=mini_dev_filepath, filename=mini_dev_plos_filename)
mini_elife_df = read_jsonl(filepath=mini_dev_filepath, filename=mini_dev_elife_filename)

# Test data.
test_plos_df = read_jsonl(filepath=test_filepath, filename=test_plos_filename)
test_elife_df = read_jsonl(filepath=test_filepath, filename=test_elife_filename)

# Prompt tuning

In [17]:
def get_gloss(text, llm=llm, max_length=n_ctx, quiet=False):
    """Ask the LLM for a JSON glossary of the technical terms found in ``text``.

    Parameters
    ----------
    text : str
        Article text. Only its first ``max_length`` characters are sent.
    llm : object
        Model exposing ``create_chat_completion``; defaults to the notebook's
        global ``llm``.
    max_length : int
        Character cut-off applied to ``text`` (defaults to ``n_ctx``).
    quiet : bool
        When False, print the elapsed wall-clock time.

    Returns
    -------
    str
        Message content of the first completion choice.
    """
    started = time.perf_counter()

    # NOTE(review): the prompt carries its own [INST] tags while being sent
    # through the chat API - confirm the chat format does not add them twice.
    excerpt = text[:max_length]
    user_prompt = (
        "\n"
        "    [INST] Extract important technical terms and provide their gloss "
        "from the following medical paper to a JSON format:  "
        f"{excerpt} [/INST]\n"
        "    "
    )
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that outputs in JSON."},
        {"role": "user", "content": user_prompt},
    ]

    # Request a JSON object as the response format.
    response = llm.create_chat_completion(
        messages=conversation,
        response_format={"type": "json_object"},
    )

    elapsed = time.perf_counter() - started
    if not quiet:
        print(f"extracted gloss in {elapsed:0.4f} seconds")

    return response["choices"][0]["message"]["content"]

In [18]:
def lay_summarize_with_gloss(text, gloss, llm=llm, max_length=n_ctx, quiet=False):
    """Produce a lay summary of ``text``, giving the LLM a glossary to lean on.

    Parameters
    ----------
    text : str
        Article text. Only its first ``max_length`` characters are sent.
    gloss : str
        Glossary of technical terms, inserted verbatim into the prompt.
    llm : object
        Model exposing ``create_chat_completion``; defaults to the notebook's
        global ``llm``.
    max_length : int
        Character cut-off applied to ``text`` (defaults to ``n_ctx``).
    quiet : bool
        When False, print the elapsed wall-clock time.

    Returns
    -------
    str
        Message content of the first completion choice.
    """
    started = time.perf_counter()

    # NOTE(review): the prompt carries its own [INST] tags while being sent
    # through the chat API - confirm the chat format does not add them twice.
    excerpt = text[:max_length]
    user_prompt = (
        "[INST] Use simple English when you can for this task. \n"
        f"    Using the gloss from this dictionary: {gloss}\n"
        "    Simplify and summarize the following medical paper in around 300 words:  "
        f"{excerpt} [/INST]"
    )
    conversation = [
        {
            "role": "system",
            "content": "You are a medical writing assistant who specialize in lay summarization.",
        },
        {"role": "user", "content": user_prompt},
    ]

    response = llm.create_chat_completion(messages=conversation)

    elapsed = time.perf_counter() - started
    if not quiet:
        print(f"extracted summary in {elapsed:0.4f} seconds")
    return response["choices"][0]["message"]["content"]

In [24]:
def lay_summarize_train_of_thought(text, llm=llm, max_length=n_ctx, quiet=False):
    """Summarize ``text`` in two LLM rounds: glossary first, then lay summary.

    Round one asks the model to extract technical terms with their gloss.
    Round two feeds that reply back as a dictionary and asks for a simplified
    summary of around 300 words.

    Parameters
    ----------
    text : str
        Article text. Only its first ``max_length`` characters are sent, in
        both rounds.
    llm : object
        Model exposing ``create_chat_completion``; defaults to the notebook's
        global ``llm``.
    max_length : int
        Character cut-off applied to ``text`` (defaults to ``n_ctx``).
    quiet : bool
        When False, print the total elapsed wall-clock time of both rounds.

    Returns
    -------
    str
        Message content of the first choice of the second completion.
    """
    started = time.perf_counter()
    excerpt = text[:max_length]

    def _ask(user_prompt):
        # One chat round: the fixed system message plus the given user prompt.
        reply = llm.create_chat_completion(
            messages=[
                {
                    "role": "system",
                    "content": "You are a medical writing assistant who specialize in lay summarization.",
                },
                {"role": "user", "content": user_prompt},
            ]
        )
        return reply["choices"][0]["message"]["content"]

    # NOTE(review): both prompts carry their own [INST] tags while being sent
    # through the chat API - confirm the chat format does not add them twice.

    # Round 1: glossary of technical terms.
    gloss = _ask(
        "[INST] Extract important technical terms and provide their gloss "
        "from the following medical paper to a JSON format:  "
        f"{excerpt} [/INST]\n"
        "    "
    )

    # Round 2: lay summary that reuses the glossary from round 1.
    summary = _ask(
        "[INST] Use simple English when you can for this task. \n"
        f"    Using the gloss from this dictionary: {gloss}\n"
        "    Simplify and summarize the following medical paper in around 300 words:  "
        f"{excerpt} [/INST]"
    )

    elapsed = time.perf_counter() - started
    if not quiet:
        print(f"extracted summary in {elapsed:0.4f} seconds")
    return summary

In [19]:
def lay_summarize(text, llm=llm, max_length=n_ctx, quiet=False):
    """Produce a lay summary of ``text`` with a single LLM call (no glossary).

    Parameters
    ----------
    text : str
        Article text. Only its first ``max_length`` characters are sent.
    llm : object
        Model exposing ``create_chat_completion``; defaults to the notebook's
        global ``llm``.
    max_length : int
        Character cut-off applied to ``text`` (defaults to ``n_ctx``).
    quiet : bool
        When False, print the elapsed wall-clock time.

    Returns
    -------
    str
        Message content of the first completion choice.
    """
    started = time.perf_counter()

    # NOTE(review): the prompt carries its own [INST] tags while being sent
    # through the chat API - confirm the chat format does not add them twice.
    excerpt = text[:max_length]
    user_prompt = (
        "[INST] Use simple English when you can for this task. \n"
        "    Simplify and summarize the following medical paper in around 300 words:  "
        f"{excerpt} [/INST]"
    )
    conversation = [
        {
            "role": "system",
            "content": "You are a medical writing assistant who specialize in lay summarization.",
        },
        {"role": "user", "content": user_prompt},
    ]

    response = llm.create_chat_completion(messages=conversation)

    elapsed = time.perf_counter() - started
    if not quiet:
        print(f"extracted summary in {elapsed:0.4f} seconds")

    return response["choices"][0]["message"]["content"]

## Some more testing

In [20]:
# Try the glossary prompt on the first article of the mini eLife split.
test = mini_elife_df.loc[0, "article"]
test_gloss = get_gloss(text=test)
test_gloss

extracted gloss in 60.9494 seconds


'{\n    "technical_terms": {\n        "Neural networks": "A network of interconnected neurons that process information and transmit signals.",\n        "Synaptic and cellular activity patterns": "The synchronized firing of neurons and the resulting electrical signals in the brain.",\n        "Cognition": "The mental process of acquiring knowledge and understanding through thought, experience, and the senses.",\n        "Developmentally regulated": "Controlled or influenced by developmental factors.",\n        "Postnatal time course": "The progression of events after birth.",\n        "High spatiotemporal resolution in vivo electrophysiology": "A research method used to measure the electrical activity of neurons in living organisms with high precision and accuracy.",\n        "Cortical processes": "Relating to the outermost layer of the brain, responsible for higher thought processes.",\n        "Volatile transition period": "A period of rapid and unpredictable change.",\n        "Neura

In [21]:
# Summary of the same article, guided by the glossary extracted above.
summary_w_gloss = lay_summarize_with_gloss(text=test, gloss=test_gloss)
summary_w_gloss

extracted summary in 55.2972 seconds


' The research investigates how neural networks develop and perform complex computations to support cognition. The study uses high-resolution electrophysiology in mice to observe the development of large-scale synaptic and cellular activity patterns. It was found that mature cortical processes emerge rapidly and simultaneously after a quiet period at the beginning of the second postnatal week in rodents. This transition is characterized by relative neural inactivity, followed by spatially distributed, temporally precise, and interconnected neural activity. A similar developmental trajectory was observed in humans, suggesting an evolutionarily conserved mechanism.\n\nThe research also examines the development of neural network properties in the immature brain. It was found that the disappearance of immature activity patterns, such as spindle bursts in rodents and delta brushes in humans, could herald the emergence of advanced neural network properties. During this developmental epoch, c

In [22]:
# Baseline for comparison: summary of the same article without any glossary.
summary_wo_gloss = lay_summarize(text=test)
summary_wo_gloss

extracted summary in 51.4041 seconds


' This study investigates how immature neural networks develop into mature ones that can perform complex computations for cognition. It focuses on the development of large-scale synaptic and cellular activity patterns in mice and humans. In mice, a distinct but volatile transition period was found at the beginning of the second postnatal week. During this time, there is relative neural quiescence followed by spatially distributed, temporally precise, and internally organized activity. A similar developmental trajectory was observed in humans, suggesting an evolutionarily conserved mechanism.\n\nThe study also discusses the role of spindle-like oscillations (10-20 Hz) that occur in immature neural networks. These are triggered by peripheral stimuli and are important for neuronal survival, establishing sensory ensembles, and critical period plasticity. However, they disappear shortly after birth in humans and during the first postnatal week in rodents, indicating their transient role in 

In [25]:
# Two-round variant (glossary, then summary) on the same article.
lay_summarize_train_of_thought(text=test)

extracted summary in 162.2856 seconds


' This study investigates how the brain develops its complex functions during early life. Researchers used high-resolution electrophysiology in mice to find out when and how neural networks start performing advanced computations needed for cognition. They discovered that mature cortical processes, such as spatially distributed, temporally precise, and interconnected activity patterns, quickly emerge after a short transition period at the beginning of the second postnatal week in mice. This transition phase is marked by relative neural quietness before the appearance of coordinated cortical networks.\n\nIn both rodents and humans, this developmental trajectory appears to be evolutionarily conserved. The study suggests that a transient quiescent period might be necessary for the subsequent emergence of coordinated cortical networks. During this time, cortical microcircuits undergo significant changes in anatomical and functional connectivity, as well as an abrupt increase in synaptogenes

# Summary Generation
## train of thought

In [27]:
# Generate a summary for every eLife test article (two LLM calls per article, so this is slow).
elife_articles = test_elife_df["article"]
test_elife_df["mixtral_summary"] = elife_articles.apply(lay_summarize_train_of_thought)
test_elife_df.head()

extracted summary in 191.1340 seconds
extracted summary in 96.1170 seconds
extracted summary in 118.3305 seconds
extracted summary in 118.9475 seconds
extracted summary in 130.9092 seconds
extracted summary in 135.1070 seconds
extracted summary in 101.4791 seconds
extracted summary in 104.5868 seconds
extracted summary in 99.1811 seconds
extracted summary in 94.0425 seconds
extracted summary in 85.4069 seconds
extracted summary in 156.8738 seconds
extracted summary in 130.3400 seconds
extracted summary in 113.9008 seconds
extracted summary in 108.8934 seconds
extracted summary in 110.4087 seconds
extracted summary in 201.5874 seconds
extracted summary in 87.8534 seconds
extracted summary in 145.4825 seconds
extracted summary in 117.8342 seconds
extracted summary in 149.5870 seconds
extracted summary in 121.1372 seconds
extracted summary in 171.5099 seconds
extracted summary in 103.7836 seconds
extracted summary in 111.5956 seconds
extracted summary in 131.6707 seconds
extracted summary

Unnamed: 0,article,headings,keywords,id,mixtral_summary
0,Acylation of diverse carbohydrates occurs acro...,"[Abstract, Introduction, Results and discussio...","[biochemistry and chemical biology, computatio...",elife-81547-v1,The AT3 domain is a type of protein involved ...
1,Honey bee ecology demands they make both rapid...,"[Abstract, Introduction, Results, Discussion, ...",[computational and systems biology],elife-86176-v2,Honey bees have advanced decision-making abil...
2,"Biguanides , including the world’s most prescr...","[Abstract, Introduction, Results, Discussion, ...",[genetics and genomics],elife-82210-v1,"Biguanides, medicines used to treat type 2 di..."
3,Ecological relationships between bacteria medi...,"[Abstract, Introduction, Results, Discussion, ...","[microbiology and infectious disease, ecology]",elife-83152-v2,The gut microbiomes of mammals are made up of...
4,Gamma oscillations are believed to underlie co...,"[Abstract, Introduction, Results, Discussion, ...",[neuroscience],elife-83044-v2,"Gamma oscillations, a type of brain wave, are..."


In [29]:
# Generate a summary for every PLOS test article (two LLM calls per article, so this is slow).
plos_articles = test_plos_df["article"]
test_plos_df["mixtral_summary"] = plos_articles.apply(lay_summarize_train_of_thought)
test_plos_df.head()

extracted summary in 138.4145 seconds
extracted summary in 108.7131 seconds
extracted summary in 122.4606 seconds
extracted summary in 133.5298 seconds
extracted summary in 108.1153 seconds
extracted summary in 98.9468 seconds
extracted summary in 55.9403 seconds
extracted summary in 101.2183 seconds
extracted summary in 130.8996 seconds
extracted summary in 85.4419 seconds
extracted summary in 220.9497 seconds
extracted summary in 161.2667 seconds
extracted summary in 130.4045 seconds
extracted summary in 115.1271 seconds
extracted summary in 130.7792 seconds
extracted summary in 117.3462 seconds
extracted summary in 130.9465 seconds
extracted summary in 90.4805 seconds
extracted summary in 118.9894 seconds
extracted summary in 79.7075 seconds
extracted summary in 89.2487 seconds
extracted summary in 128.3329 seconds
extracted summary in 101.0256 seconds
extracted summary in 104.3307 seconds
extracted summary in 112.2664 seconds
extracted summary in 140.8384 seconds
extracted summary 

Unnamed: 0,article,headings,keywords,id,mixtral_summary
0,Lung-resident ( LR ) mesenchymal stem and stro...,"[Abstract, Introduction, Results, Discussion, ...","[immune system, medical conditions, molecular ...",journal.ppat.1009789,Mesenchymal stem and stromal cells (MSCs) are...
1,Visceral leishmaniasis ( VL ) is endemic in So...,"[Abstract, Introduction, Methods, Results, Dis...","[neonates, clinical laboratory sciences, trans...",journal.pntd.0007992,"Visceral Leishmaniasis (VL), also known as Ka..."
2,A high burden of Salmonella enterica subspecie...,"[Abstract, Introduction, Methods, Results, Dis...","[pathogens, medical conditions, taxonomy, bact...",journal.pntd.0010704,The study looked at 310 samples of Salmonella...
3,Severe Acute Respiratory Syndrome Coronavirus-...,"[Abstract, Introduction, Results, Discussion, ...","[pathogens, amniotes, medical conditions, bind...",journal.ppat.1010691,"The SARS-CoV-2 virus, responsible for the COV..."
4,Many fungal species utilize hydroxyderivatives...,"[Abstract, Introduction, Results and discussio...","[taxonomy, proteins, chemistry, genetics, enzy...",journal.pgen.1009815,The yeast Candida parapsilosis can use hydrox...


# Output

In [28]:
def write_to_txt(df, output_path, output_file_txt):
    """Clean the ``mixtral_summary`` column and write it as one summary per line.

    Each summary is flattened to a single line (every run of whitespace,
    including newlines, becomes one space) and stripped of wrapping double
    quotes and surrounding whitespace. The cleaned values are stored back
    into ``df["mixtral_summary"]`` (the frame is modified in place) and then
    written verbatim, without CSV quoting, to ``output_path + output_file_txt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``mixtral_summary``.
    output_path : str
        Directory prefix. It is concatenated directly with
        ``output_file_txt``, so it should end with a path separator. The
        directory is created if it does not exist yet.
    output_file_txt : str
        Name of the text file to write.

    Returns
    -------
    None
    """

    def _clean(text):
        # Collapse whitespace first so paragraphs are joined by a space rather
        # than glued together, and so quotes next to whitespace get stripped.
        single_line = " ".join(text.split())
        return single_line.strip('"').strip()

    df["mixtral_summary"] = df["mixtral_summary"].apply(_clean)

    target = output_path + output_file_txt
    target_dir = os.path.dirname(target)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Write the lines directly: a CSV writer would quote summaries that
    # contain a double quote, which breaks the one-summary-per-line format.
    with open(target, "w", encoding="utf-8") as handle:
        for summary in df["mixtral_summary"]:
            handle.write(summary + "\n")

    print(f"Output {output_file_txt} completed")

In [30]:
# Write one text file of summaries per dataset.
output_path = './data/'

for summaries_df, output_file_txt in (
    (test_plos_df, "plos.txt"),
    (test_elife_df, "elife.txt"),
):
    write_to_txt(summaries_df, output_path, output_file_txt)

Output plos.txt completed
Output elife.txt completed


In [32]:
# Keep the full frames (articles, metadata and summaries) as CSV files too.
test_plos_df.to_csv(f"{output_path}plos_results.csv")
test_elife_df.to_csv(f"{output_path}elife_results.csv")