In [2]:
from llama_cpp import Llama

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = Llama(
  model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",  # Download the model file first
  n_ctx=32768,  # The max sequence length to use - note that longer sequence lengths require much more resources
  n_threads=8,            # The number of CPU threads to use, tailor to your system and the resulting performance
  n_gpu_layers=48         # The number of layers to offload to GPU, if you have GPU acceleration available
)

# Simple inference example
output = llm(
  "<s>[INST] {prompt} [/INST]", # Prompt
  max_tokens=512,  # Generate up to 512 tokens
  stop=["</s>"],   # Example stop token - not necessarily correct for this specific model! Please check before using.
  echo=True        # Whether to echo the prompt
)

# Chat Completion API

llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2")  # Set chat_format according to the model you are using
llm.create_chat_completion(
    messages = [
        {"role": "system", "content": "You are a story writing assistant."},
        {
            "role": "user",
            "content": "Write a story about llamas."
        }
    ]
)



llama_print_timings:        load time =    1968.82 ms
llama_print_timings:      sample time =      66.02 ms /   480 runs   (    0.14 ms per token,  7270.74 tokens per second)
llama_print_timings: prompt eval time =    1968.76 ms /    32 tokens (   61.52 ms per token,    16.25 tokens per second)
llama_print_timings:        eval time =   74034.84 ms /   479 runs   (  154.56 ms per token,     6.47 tokens per second)
llama_print_timings:       total time =   77301.60 ms /   511 tokens


{'id': 'chatcmpl-0f393bee-2f02-477e-82e9-4e3c8fe84aa1',
 'object': 'chat.completion',
 'created': 1712116230,
 'model': './mistral-7b-instruct-v0.2.Q4_K_M.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': ' Title: The Magical Herd of Llamas\n\nOnce upon a time, in the heart of the Andes Mountains, there was a hidden valley filled with lush greenery and crystal-clear waters. This enchanting place was home to an extraordinary herd of llamas, known as the "Magical Herd." These were no ordinary llamas; they possessed unique abilities that made them the talk of the land.\n\nThe eldest and wisest llama in the herd was named Amaru. He had a coat as golden as the sun and eyes that shone like stars. Amaru was revered by all for his wisdom and kindness. He could communicate with other animals and even with the elements, making him the guardian and protector of his herd.\n\nOne day, as Amaru gazed over his herd, he sensed an impending danger. A dark cloud loomed 

In [3]:
from llama_cpp import Llama

# Initialize the Llama model
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    n_ctx=32768,
    n_threads=8,
    n_gpu_layers=35
)

# Define your prompt
prompt = "What is python?"

# Perform simple inference with the prompt
output = llm(
    f"<s>[INST] {prompt} [/INST]",  # Incorporate the prompt into the input
    max_tokens=512,
    stop=["</s>"],
    echo=True
)

# Print the generated output
print(output)

# If you want to use the Chat Completion API
llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2")

# Define a conversation prompt
conversation = [
    {"role": "system", "content": "You are a story writing assistant."},
    {"role": "user", "content": "Write a story about llamas."}
]

# Create chat completion based on the conversation
chat_output = llm.create_chat_completion(messages=conversation)

# Print the chat completion output
print(chat_output)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ./mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                

{'id': 'cmpl-745426ac-67d8-4135-ad0f-6c2ca11355d0', 'object': 'text_completion', 'created': 1712116501, 'model': './mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'text': "<s>[INST] What is python? [/INST] Python is a high-level, interpreted programming language that was created by Guido van Rossum and first released in 1991. It is known for its clear syntax and readability, making it an excellent choice for beginners to learn programming. Python supports multiple programming paradigms, including procedural, object-oriented, and functional programming.\n\nPython has a large standard library and a vast ecosystem of third-party libraries, allowing developers to build applications in various domains such as web development, data analysis, artificial intelligence, scientific computing, automation, and more. Some popular frameworks and libraries in Python include Django, Flask, NumPy, Pandas, TensorFlow, and Scikit-learn.\n\nPython runs on various operating systems such as Windows, mac

llm_load_tensors:        CPU buffer size =  4165.37 MiB
.................................................................................................
llama_new_context_with_model: n_ctx      = 512
llama_new_context_with_model: n_batch    = 512
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: freq_base  = 1000000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:        CPU KV buffer size =    64.00 MiB
llama_new_context_with_model: KV self size  =   64.00 MiB, K (f16):   32.00 MiB, V (f16):   32.00 MiB
llama_new_context_with_model:        CPU  output buffer size =    62.50 MiB
llama_new_context_with_model:        CPU compute buffer size =    73.00 MiB
llama_new_context_with_model: graph nodes  = 1060
llama_new_context_with_model: graph splits = 1
AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 |

{'id': 'chatcmpl-d2b242f6-0736-4352-937e-b1a8b21a112e', 'object': 'chat.completion', 'created': 1712116540, 'model': './mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': " Title: The Magical Herd of Llamas\n\nOnce upon a time, in the heart of the Andes Mountains, there was a hidden valley nestled between towering peaks. This enchanting valley was home to an extraordinary herd of llamas, known as the Magical Herd of Llamas. These were no ordinary llamas; they possessed unique abilities that brought joy and wonder to the people living nearby.\n\nThe eldest and most revered member of the herd was named Llama Llama, a wise and gentle llama with a flowing white mane and a coat as golden as the sun. He was respected and loved by all the other llamas and held the secret to their magical powers.\n\nOne sunny morning, as the herd grazed peacefully by the shimmering lake, a group of travelers arrived at the edge of the valley. Among them 

In [None]:
from llama_cpp import Llama

# Initialize the Llama model
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    n_ctx=32768,
    n_threads=8,
    n_gpu_layers=35
)

# Define your prompt
prompt = "generate a tweet for buying a new car with impressive emojis."

# Perform simple inference with the prompt
output = llm(
    f"<s>[INST] {prompt} [/INST]",  # Incorporate the prompt into the input
    max_tokens=1000,
    stop=["</s>"],
    echo=True
)

# Print the generated output
print(output)


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
 
# Load model
model_name = "TroyDoesAI/MermaidStable3B"
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
 
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
 
# Define prompt text
prompt_text = "Create the mermaid diagram for the following code: print('Hello, World!')"
 
# Tokenize input text
input_ids = tokenizer.encode(prompt_text, return_tensors="pt")
 
# Generate text
max_length = 100  # Adjust the length of the generated text as needed
output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.7)
 
# Decode and print generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:\n", generated_text)
 

config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


configuration_stablelm_epoch.py:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/TroyDoesAI/MermaidStable3B:
- configuration_stablelm_epoch.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_stablelm_epoch.py:   0%|          | 0.00/38.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/TroyDoesAI/MermaidStable3B:
- modeling_stablelm_epoch.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/29.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/610M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/8.16k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Generated Text:
 Create the mermaid diagram for the following code: print('Hello, World!')

# In[2]:


# Create the mermaid diagram for the following code:

# print('Hello, World!')

# In[3]:


# Create the mermaid diagram for the following code:

# import random

# # Set up the constants:
# NUM_OF_DICE = 2
# MAX_DICE


In [2]:
!nvidia-smi


Wed Apr  3 17:29:07 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.86                 Driver Version: 551.86         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   48C    P8              9W /   95W |     340MiB /   4096MiB |      5%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import torch
torch.cuda.is_available()


True

In [5]:
!CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python

'CMAKE_ARGS' is not recognized as an internal or external command,
operable program or batch file.


In [1]:
import os
from transformers import pipeline
 
# Define the pipeline for text generation
text_generation_pipeline = pipeline("text-generation", model="TroyDoesAI/MermaidMixtral-2x7b")
 
# Generate the text for the diagram including odd-even number checking
prompt_text = """
Check if a number is odd or even:
 
    ```python
    def is_odd_even(num):
        if num % 2 == 0:
            return "Even"
        else:
            return "Odd"
 
    # Test the function
    number = 7
    result = is_odd_even(number)
    print(f"The number {number} is {result}")
    ```
"""
 
# Generate the diagram based on the prompt text
diagram = text_generation_pipeline(prompt_text, max_length=100, do_sample=False)[0]['generated_text']
 
# Print the generated diagram
print(diagram)

: 