## Dependencies

In [None]:
!pip install -q ctransformers>=0.2.24

In [None]:
# Reinstall ctransformers with Metal support.
# The package is removed first, then rebuilt with CT_METAL=1 set in the build
# environment; --no-binary makes pip skip prebuilt wheels for ctransformers.
# NOTE(review): presumably only wanted on Apple hardware — skip this cell elsewhere; confirm.
!pip uninstall ctransformers --yes 
!CT_METAL=1 pip install ctransformers --no-binary ctransformers

In [None]:
!pip3 install -q huggingface-hub>=0.17.1
# !pip list


In [None]:
from ctransformers import AutoModelForCausalLM

In [None]:
import time

def measure_runtime(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: Any callable to be timed.

    Returns:
        A wrapper with the same signature and metadata as ``func`` that
        forwards all arguments, prints the elapsed time in seconds after a
        successful call, and returns ``func``'s result unchanged. If ``func``
        raises, the exception propagates and nothing is printed.
    """
    from functools import wraps  # local import keeps this cell self-contained

    @wraps(func)  # preserve __name__/__doc__ of the decorated function
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        duration = time.perf_counter() - start_time
        print(f"Function {func.__name__} executed in {duration:.4f} seconds\n")
        return result
    return wrapper

def model_config(model):
    """Print the model's path, config, type and context length, one per line."""
    for attribute in ("model_path", "config", "model_type", "context_length"):
        print(getattr(model, attribute))

In [None]:
# Benchmark prompts: a mix of English and Russian questions plus one open-ended completion.
prompts = [
    "What is the capital of Sweden?",
    "Какой город является столицей Швеции?",
    "Почему люди предпочитают весну?",
    "Why people prefer spring?",
    "AI is going to",
    "Напиши рецепт лазаньи?",
    "Что такое python",
    "Белые медведи и пингвины могут встречаться вместе в природе?",
    "Is modest wine drinking not harmful for our health?",
]

@measure_runtime
def printout_prompt(model, prompt):
    """Print the prompt, the model's path, and the model's answer to the prompt.

    Args:
        model: Callable model object exposing a ``model_path`` attribute.
        prompt: Text passed to the model.
    """
    separator = "-" * 80
    print(f"{separator}\nThe prompt: {prompt}\n")
    print(f"The ({model.model_path}) model answer is:\n")
    # Generation happens here, after both headers are already on screen.
    print(model(prompt))



### TheBloke/Llama-2-7b-Chat-GGUF

In [None]:
# Alternative: fetch every file matching the Q4_K_M pattern instead of one named file.
# !huggingface-cli download TheBloke/Llama-2-7b-Chat-GGUF --local-dir . --local-dir-use-symlinks False --include='*Q4_K_M.gguf'

# Download the single Q4_K_M GGUF file into the current working directory.
# NOTE(review): the load cell below reads from a different, absolute directory — confirm the file ends up there.
!huggingface-cli download TheBloke/Llama-2-7b-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False


In [None]:
# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
# NOTE(review): the model directory is an absolute, machine-specific path — adjust before running elsewhere.
llm_7chat = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id="/Users/velo1/SynologyDrive/GIT_syno/data/NLP/models/",
    model_file="llama-2-7b-chat.Q4_K_M.gguf",
    model_type="llama",
    gpu_layers=0,
    local_files_only=True,
)



In [None]:
# Sanity check: report whether the GGUF file is present at the expected location.
import os

model_path = "/Users/velo1/SynologyDrive/GIT_syno/data/NLP/models/llama-2-7b-chat.Q4_K_M.gguf"
print("Path exists!" if os.path.exists(model_path) else "Path does not exist. Check the path.")


In [None]:
# Run every benchmark prompt through the Llama-2-7B chat model (timing is printed per prompt).
for prompt in prompts:
    printout_prompt(llm_7chat, prompt)


### rozek/LLaMA-2-7B-32K-Instruct_GGUF

In [None]:
# Download the Q8_0 GGUF file into the current working directory.
# NOTE(review): the load cell below reads from "/content/" — confirm that is the working directory here.
!huggingface-cli download rozek/LLaMA-2-7B-32K-Instruct_GGUF LLaMA-2-7B-32K-Instruct-Q8_0.gguf --local-dir . --local-dir-use-symlinks False


In [None]:
%%time
# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm_7_32 = AutoModelForCausalLM.from_pretrained(model_path_or_repo_id="/content/", model_file="LLaMA-2-7B-32K-Instruct-Q8_0.gguf", \
                                           model_type="llama", gpu_layers=0, local_files_only=True,\
                                              batch_size=4096,
                                              temperature = 0.4



                                                )


In [None]:
# Run every benchmark prompt through the 32K-context instruct model.
for prompt in prompts:
    printout_prompt(llm_7_32, prompt)

### TheBloke/OpenBuddy-Llama2-13B-v11.1-GGUF

In [None]:
# Download the Q4_K_M GGUF file of the 13B OpenBuddy model into the current working directory.
!huggingface-cli download TheBloke/OpenBuddy-Llama2-13B-v11.1-GGUF openbuddy-llama2-13b-v11.1.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False

In [None]:
# List the working directory to verify the downloaded model files are present.
!ls

In [None]:
%%time
# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm_13 = AutoModelForCausalLM.from_pretrained(model_path_or_repo_id="/content/", model_file="openbuddy-llama2-13b-v11.1.Q4_K_M.gguf", \
                                           model_type="llama", gpu_layers=30, local_files_only=True,
                                              max_new_tokens=100,
                                            #   top_k=50,
                                            #   top_p =1,
                                            #   batch_size=1024,
                                            #   temperature = 0.7
                                              )



In [None]:
# Re-run the 7B chat model on all prompts.
# NOTE(review): this repeats the earlier 7B run right after loading the 13B model —
# presumably a side-by-side baseline for the next cell; confirm it is intentional.
for prompt in prompts:
    printout_prompt(llm_7chat, prompt)

In [None]:
# Run every benchmark prompt through the 13B OpenBuddy model.
for prompt in prompts:
    printout_prompt(llm_13, prompt)

## Tuning

In [None]:
%%time
llm_7chat = AutoModelForCausalLM.from_pretrained(model_path_or_repo_id="/content/", model_file="llama-2-7b-chat.Q4_K_M.gguf", \
                                                                                      model_type="llama", gpu_layers=0, local_files_only=True,\
                                              max_new_tokens=50,
                                              top_k=5,
                                              top_p =1,
                                              batch_size=1024,
                                              temperature = 0.4
)

In [None]:
%%time
model_ = llm_7chat
# model_ = llm_7_32
# model_ = llm_13

# print(model_("Как приготовить суп?"))
print(model_("What is the capital of Sweden?"))
# print(model_("Why people prefer spring?"))
# print(model_("Какой город является столицей Швеции?"))
# print(model_("Какие основные ингридиенты в лазанье?"))
model_config(model_)


In [1]:
from sentence_transformers import SentenceTransformer
import pathlib
from beholder import print_methods
import os


### Download the model

In [2]:
# Load the sentence-embedding model from a local cache folder, downloading and
# saving it there on first use.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
dest_folder = pathlib.Path("data/llm_models")

# Split the model_name using the slash
parts = model_name.split("/")

# If there's a slash in the model_name, use the parts to determine subfolder and model folder
if len(parts) > 1:
    subfolder, model_folder = parts[0], parts[1]
    model_path = dest_folder / subfolder / model_folder
else:
    subfolder = ''
    model_folder = model_name  # keep both names defined on either branch
    model_path = dest_folder / model_name

# Check if the model exists
if model_path.exists():
    print(f"{model_name} is found on the local device")
    # Fixed: a stray, undefined second argument (`ma`) raised NameError here.
    model = SentenceTransformer(str(model_path))
else:
    # Not cached yet: fetch by name, then persist for later runs.
    model = SentenceTransformer(model_name)
    model.save(path=str(model_path))



sentence-transformers/all-MiniLM-L6-v2 is found on the local device


2023-10-15 09:33:47.517390: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# ?model

In [15]:
# Embed two sample sentences and report the embedding matrix shape
# together with the model's current maximum sequence length.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]

# model = SentenceTransformer(model_name)
embeddings = model.encode(sentences)

print(embeddings.shape, model.max_seq_length)


(2, 384) 1000


[model.max_seq_length](https://stackoverflow.com/questions/75901231/max-seq-length-for-transformer-sentence-bert)

In [16]:
# Override the model's maximum sequence length (see the linked discussion above).
model.max_seq_length = 512

In [17]:
# Read the limit back to confirm the assignment took effect (shown as the cell output).
model.get_max_seq_length()

512