In [None]:
#@title Install required packages and Auto - Restart Session

# Every dependency is pinned to an exact version.
!pip install accelerate==0.32.1
# !pip install --upgrade transformers
!pip install transformers==4.44.0
!pip install bitsandbytes==0.43.3
!pip install sentencepiece==0.1.99
!pip install protobuf==3.20.3
!pip install flash-attn==2.6.3
!pip install gradio==4.36.1
from IPython.display import clear_output
# Wipe the pip logs from the cell output.
clear_output()
import time
# Short pause before the process is killed below.
time.sleep(5)
import os
# Send signal 9 to this process. Per the cell title this is what triggers the
# automatic session restart (presumably so the freshly installed packages are
# the ones that get imported -- TODO confirm). Nothing after this line runs.
os.kill(os.getpid(), 9)

In [None]:
#@title <-- Play Audio to keep the Colab Notebook active { display-mode: "form" }

%%html
<b>Press play on the music player to keep the tab alive, then run the cell below</b><br/>
<audio src="https://raw.githubusercontent.com/KoboldAI/KoboldAI-Client/main/colab/silence.m4a" controls></audio>

In [None]:
#@title Import & Download Hermes-3-Llama-3.1-8B Model
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
import bitsandbytes, flash_attn
import gc
import time
import subprocess

def is_gpu_memory_over_limit(limit_gb=14.5):
    """Report whether any GPU currently uses more than ``limit_gb`` of memory.

    Queries ``nvidia-smi`` for the used memory of every GPU, prints the usage
    of each one, and returns as soon as one of them is above the limit.

    Args:
        limit_gb: Threshold in GB (the value reported by ``nvidia-smi``
            divided by 1024).

    Returns:
        True if at least one GPU is above ``limit_gb``. False otherwise,
        including when the usage cannot be determined (``nvidia-smi`` missing,
        failing, or reporting a non-numeric value), so that an unreadable
        measurement never triggers an expensive model reload.
    """
    try:
        # Run nvidia-smi and capture one "memory used" value per GPU.
        result = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader'],
                                stdout=subprocess.PIPE, text=True)
    except OSError as exc:
        # Typically FileNotFoundError: no NVIDIA driver / not a GPU runtime.
        print(f"Could not run nvidia-smi ({exc}); skipping GPU memory check.")
        return False

    if result.returncode != 0:
        print(f"nvidia-smi exited with code {result.returncode}; skipping GPU memory check.")
        return False

    # One line per GPU when several are present.
    for i, memory_used_mb in enumerate(result.stdout.strip().splitlines()):
        memory_used_mb = memory_used_mb.strip()
        try:
            memory_used_gb = int(memory_used_mb) / 1024.0
        except ValueError:
            # nvidia-smi prints placeholders such as "[N/A]" for some devices.
            print(f"GPU {i}: memory usage unavailable ({memory_used_mb!r}); skipping.")
            continue
        print(f"GPU {i}: Current memory allocated: {memory_used_gb:.2f} GB")
        if memory_used_gb > limit_gb:
            print(f"GPU {i} memory usage exceeds {limit_gb} GB.")
            return True

    print("GPU memory usage is within safe limits.")
    return False


# Module-level handles shared by load_Llama, Llama_Chat and run_llama.
tokenizer=None
model=None
def load_Llama():
  """Load (or reload) the Hermes-3-Llama-3.1-8B tokenizer and 4-bit model.

  Any previously loaded tokenizer/model is released first so that a reload
  does not keep two copies of the weights alive. The new objects are stored
  in the module-level ``tokenizer`` / ``model`` globals and also returned.

  Returns:
    A ``(tokenizer, model)`` tuple.
  """
  global tokenizer,model
  # Imported locally so the cell's top-level import line does not need to change.
  from transformers import BitsAndBytesConfig

  model_id = 'NousResearch/Hermes-3-Llama-3.1-8B'

  try:
    # Drop our references, then let Python and the CUDA allocator reclaim them.
    tokenizer = None
    model = None
    gc.collect()
    torch.cuda.empty_cache()
    print("Free GPU memeory")
    time.sleep(2)
  except Exception as exc:
    # Cleanup is best effort, but report the problem instead of hiding it.
    # KeyboardInterrupt / SystemExit are deliberately not caught here.
    print(f"Could not free GPU memory before loading: {exc}")

  # NOTE(review): trust_remote_code=True allows code shipped in the model
  # repository to run locally; presumably not required for this tokenizer --
  # confirm before removing.
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
  model = LlamaForCausalLM.from_pretrained(
      model_id,
      torch_dtype=torch.float16,
      device_map="auto",
      # Replaces the deprecated load_in_8bit=False / load_in_4bit=True kwargs.
      quantization_config=BitsAndBytesConfig(load_in_4bit=True),
      # use_flash_attention_2=True
  )
  return tokenizer,model

import re

def clean_response(text):
    """Flatten a model reply into a single tidy line.

    Line breaks become spaces, every ``*...*`` span (asterisks included) is
    dropped, and runs of whitespace are collapsed to a single space with no
    leading or trailing whitespace.
    """
    # Join the lines first so that a *...* span crossing a line break is
    # still matched by the pattern below.
    single_line = ' '.join(text.split('\n'))
    without_starred = re.sub(r'\*.*?\*', '', single_line)
    # split() with no argument discards all surrounding/repeated whitespace.
    words = without_starred.split()
    return ' '.join(words)

def _build_chatml_prompt(system_role, user_msg):
  """Return the ChatML prompt for one system + user turn, ending with the assistant header."""
  # Built line by line: an indented triple-quoted string would inject the
  # source-code indentation in front of every line of the prompt.
  return (
      f"<|im_start|>system\n{system_role}<|im_end|>\n"
      f"<|im_start|>user\n{user_msg}<|im_end|>\n"
      "<|im_start|>assistant\n"
  )


def Llama_Chat(system_role,user_msg):
  """Generate one assistant reply for a system prompt and a user message.

  Uses the module-level ``tokenizer`` and ``model`` (see ``load_Llama``).

  Args:
    system_role: Text placed in the ChatML ``system`` turn.
    user_msg: Text placed in the ChatML ``user`` turn.

  Returns:
    The newly generated text only (the prompt is stripped), passed through
    ``clean_response``.
  """
  prompt = _build_chatml_prompt(system_role, user_msg)
  # Keep the whole encoding so generate() also receives the attention mask,
  # and follow the model's device instead of assuming "cuda".
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
  prompt_length = inputs["input_ids"].shape[-1]
  generated_ids = model.generate(
      **inputs,
      max_new_tokens=750,
      temperature=0.8,
      repetition_penalty=1.1,
      do_sample=True,
      eos_token_id=tokenizer.eos_token_id,
  )
  # generate() returns prompt + completion; decode only the completion.
  response = tokenizer.decode(
      generated_ids[0][prompt_length:],
      skip_special_tokens=True,
      clean_up_tokenization_spaces=True,
  )
  return clean_response(response)

# Load the tokenizer and model once, when this cell is executed.
tokenizer, model = load_Llama()

def run_llama(system_role, user_msg):
  """Answer one chat turn, reloading the model first if GPU memory is over the limit.

  Args:
    system_role: System prompt forwarded to ``Llama_Chat``.
    user_msg: User message forwarded to ``Llama_Chat``.

  Returns:
    The reply returned by ``Llama_Chat``.
  """
  global tokenizer, model
  needs_reload = is_gpu_memory_over_limit()
  if needs_reload:
    tokenizer, model = load_Llama()
  return Llama_Chat(system_role, user_msg)


# Clear whatever this cell printed so far.
from IPython.display import clear_output
clear_output()




In [None]:
#@title Test Hermes-3-Llama-3.1-8B
# Colab form fields: the system prompt and the user message for one chat turn.
system_role= "You are a helpful Assistant, friendly and fun, providing users with short and concise answers to their requests."  # @param {type: "string"}
user_msg = "Who are you?"  # @param {type: "string"}
# Requires run_llama, i.e. the "Import & Download" cell must have been run first.
llama_reply=run_llama(system_role,user_msg)
from IPython.display import clear_output
# Remove anything printed while generating (e.g. the GPU memory report).
clear_output()
# Bare last expression: the reply is shown as the cell's output.
llama_reply

'I am an AI language model designed to assist and engage in conversation on a wide range of topics. I aim to provide accurate, relevant information in a friendly manner. How can I help you today?'

In [None]:
#@title Gradio API Interface
username = 'admin'  # @param {type: "string"}
password = 'admin'  # @param {type: "string"}
Debug = True  # @param {type: "boolean"}

import gradio as gr

# launch(share=True) publishes the app on a public URL, so the login below is
# the only thing protecting the model. Refuse blank credentials and flag the
# well-known default pair.
if not username or not password:
  raise ValueError("Set a non-empty username and password before launching the public Gradio link.")
if (username, password) == ('admin', 'admin'):
  print("WARNING: the default admin/admin login is being used on a public share link; "
        "change the username/password fields before sharing the URL.")

# One example row: (system role, user message).
gradio_examples = [["You are a helpful Assistant, friendly and fun, providing users with short and concise answers to their requests","Who are you?"]]
# The two inputs map positionally onto run_llama(system_role, user_msg).
gradio_input=[gr.Textbox(label="System Role"),gr.Textbox(label="User Message")]
gradio_output=[gr.Textbox(label="Hermes-3-Llama-3.1-8B Response")]
gradio_interface = gr.Interface(fn=run_llama, inputs=gradio_input,outputs=gradio_output , title="Hermes-3-Llama-3.1-8B",examples=gradio_examples)
gradio_interface.queue().launch(share=True,debug=Debug,auth=(username, password))



In [None]:
#@title Gradio Chat Interface
# import time
# import gradio as gr


# def slow_echo(user_msg, history):
#     system_role= "You are a helpful Assistant, friendly and fun, providing users with short and concise answers to their requests."  # @param {type: "string"}
#     message=Llama_Chat(system_role,user_msg)
#     for i in range(len(message)):
#         time.sleep(0.05)
#         yield "" + message[: i + 1]


# demo = gr.ChatInterface(slow_echo)
# demo.launch(share=True)