In [1]:
import os
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from langchain import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain

# Chroma
import chromadb 
from chromadb.utils import embedding_functions
from langchain.vectorstores import Chroma

# Sentence Transformers
from sentence_transformers import SentenceTransformer
from langchain.embeddings import SentenceTransformerEmbeddings

import time
from IPython.display import display, HTML, clear_output

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_paths = {
        "Dolly": "/home/nathan_2/DL2_Kratos_data-Science/models/Dolly",
        "Dolly_7": "/home/nathan_2/DL2_Kratos_data-Science/models/Dolly_7",
        "Mistral": "/home/nathan_2/DL2_Kratos_data-Science/models/Mistral"
    }

In [3]:
# Load your local model
model_path = "/home/nathan_2/DL2_Kratos_data-Science/models/Mistral"
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", load_in_4bit=True)#torch_dtype=torch.bfloat16)


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:00<00:00, 60.36s/it]


In [5]:
# Check the device of the model
print(model.device)

cuda:0


In [6]:
# Load your local tokenizer and set pad_token_id to eos_token_id
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")

In [7]:
model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'A list of colors: red, blue, green, yellow, orange, purple, pink,'

## Greedy Decoding Version
For greedy decoding, set do_sample to False, and ensure num_beams is set to 1 (or omitted, as 1 is the default value). In greedy decoding, top_p and top_k are not relevant.

This version will generate text using the most probable next word at each step. It's faster and more deterministic but may lack diversity in the generated text.

In [8]:
generate_text_greedy = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    return_full_text=True,
    max_new_tokens=100,
    do_sample=False,  # Greedy decoding
    num_beams=1,  # Explicitly setting to single-beam (greedy) decoding
    pad_token_id=tokenizer.eos_token_id,
    model_kwargs={'load_in_4bit': False}
)


In [9]:
# Measure the start time
start_time_greedy = time.time()

# Generate text using Greedy Decoding
res_greedy = generate_text_greedy("Explain me in detail what is a neural network.")
print(res_greedy[0]["generated_text"])

# Measure the end time and calculate the duration
end_time_greedy = time.time()
duration_greedy = end_time_greedy - start_time_greedy
print(f"Greedy Decoding Time: {duration_greedy} seconds")

Explain me in detail what is a neural network.
A neural network is a type of machine learning algorithm that is designed to model complex relationships between inputs and outputs. It is composed of a series of interconnected nodes, called neurons, that are organized into layers. Each neuron receives input from other neurons in the previous layer, processes the input using a set of mathematical operations, and then sends output to neurons in the next layer.

The input to the neural network is typically a set of features or characteristics of the data that
Greedy Decoding Time: 1.8158848285675049 seconds


## Sample Decoding Version
For sample decoding, do_sample is set to True, and you can utilize top_p and top_k to control the sampling process. num_beams is not relevant in this scenario.

This version introduces randomness into the generation process, resulting in more diverse and less predictable text. The top_p and top_k parameters allow you to fine-tune the balance between randomness and coherence.

In [10]:
generate_text_sample = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    return_full_text=True,
    max_new_tokens=100,
    do_sample=True,  # Sample decoding
    top_p=0.50,  # Controls the randomness in sample decoding
    top_k=50,  # Controls the randomness in sample decoding
    pad_token_id=tokenizer.eos_token_id,
    model_kwargs={'load_in_8bit': False}
)


In [11]:
# Measure the start time
start_time_sample = time.time()

# Generate text using Sample Decoding
res_sample = generate_text_sample("Explain me what is a neural network.")
print(res_sample[0]["generated_text"])

# Measure the end time and calculate the duration
end_time_sample = time.time()
duration_sample = end_time_sample - start_time_sample
print(f"Sample Decoding Time: {duration_sample} seconds")

Explain me what is a neural network.

A neural network is a type of artificial intelligence (AI) model that is designed to simulate the way the human brain works. It is composed of interconnected nodes, called neurons, that work together to process and analyze data.

Neural networks are trained on large amounts of data to recognize patterns and make predictions. The data is fed into the network, which processes it and outputs a result. The network is then adjusted based on the accuracy of the output to improve its performance
Sample Decoding Time: 1.872246265411377 seconds


## Beam Search Explained
Multiple Beams: Instead of just considering the single most probable next word at each step (as in greedy decoding), beam search keeps track of multiple possible sequences (beams) at each time step.
How It Works: At each step in the sequence, for each beam, the model considers multiple next-word options (each word is a possible extension of the beam). It then keeps only the most probable num_beams sequences for the next step.
Trade-offs: Beam search balances between exploring a variety of possible sequences (thus potentially finding a more optimal or coherent overall sequence) and computational efficiency. However, it is more computationally intensive than greedy decoding because it evaluates multiple sequences in parallel.

### Impact of Increasing num_beams
#### Quality of Output:  
Generally, increasing the number of beams can lead to better-quality outputs. The model has the chance to explore and compare more sequence options, potentially leading to more coherent and contextually appropriate text.

#### Computation Time: 
More beams mean more sequences to evaluate at each step, leading to higher computational overhead. This typically results in slower text generation compared to greedy decoding.

#### Balance Between Exploration and Determinism: 
With more beams, the model strikes a balance between the determinism of greedy decoding (which might miss contextually better but less obvious choices) and the randomness of sampling methods (which might be too diverse).

In [12]:
generate_text_beam_search = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    return_full_text=True,
    max_new_tokens=100,
    do_sample=False,  # Using beam search, not greedy decoding
    num_beams=5,  # Increasing beams for beam search
    pad_token_id=tokenizer.eos_token_id,
    model_kwargs={'load_in_8bit': False}
)

In [13]:
# Measure the start time
start_time_sample = time.time()

# Generate text using Sample Decoding
res_sample = generate_text_beam_search("Explain me what is a neural network.")
print(res_sample[0]["generated_text"])

# Measure the end time and calculate the duration
end_time_sample = time.time()
duration_sample = end_time_sample - start_time_sample
print(f"Sample Decoding Time: {duration_sample} seconds")

Explain me what is a neural network.

A neural network is a type of artificial intelligence (AI) system that is modeled after the structure and function of the human brain. It is composed of interconnected nodes, called neurons, that process and transmit information in a way that allows the network to learn and make predictions or decisions based on input data.

Neural networks are used in a variety of applications, including image and speech recognition, natural language processing, and predictive analytics. They are particularly well-suited
Sample Decoding Time: 9.903196096420288 seconds
