<a href="https://colab.research.google.com/github/Nick088Official/Gemma-Google-Colab/blob/main/Gemma_UI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run Google Gemma AI Models
Check out the models: https://blog.google/technology/developers/gemma-open-models/

Made by [Nick088](https://linktr.ee/Nick088)

In [None]:
#@title Install & Load Dependencies, Model
!pip install einops
!pip install accelerate
!pip install huggingface_hub
import torch
from huggingface_hub import login
from IPython.display import clear_output
from huggingface_hub import hf_hub_download

if torch.cuda.is_available():
    device = "cuda"
    print("Using GPU")
else:
    device = "cpu"
    print("Using CPU")

Google_Gemma_Model = "gemma-2b" #@param ['gemma-2b', 'gemma-2b-it', 'gemma-7b', 'gemma-7b-it']

#@markdown The normal version are the official ones, the other ones are GGUF quantized, compressed to consume less ram.

GGUF_Format = True #@param {type:"boolean"}

#@markdown Go to https://huggingface.co/settings/tokens and make a token with the read role, and paste it here.
Hugging_Face_Read_Token = "" #@param {type:"string"}

# Authenticate
login(token=Hugging_Face_Read_Token)

if GGUF_Format == False:
  from transformers import AutoModelForCausalLM, AutoTokenizer
  if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained(f"google/{Google_Gemma_Model}", torch_dtype="auto", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(f"google/{Google_Gemma_Model}", trust_remote_code=True)
  else:
    model = AutoModelForCausalLM.from_pretrained(f"google/{Google_Gemma_Model}", torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(f"google/{Google_Gemma_Model}", trust_remote_code=True)
else:
  if device == "cuda":
    !CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
  else:
    !pip install llama-cpp-python
  from huggingface_hub import hf_hub_download
  from llama_cpp import Llama
  model_name = f"google/{Google_Gemma_Model}"
  model_file = f"{Google_Gemma_Model}.gguf"
  model_path = hf_hub_download(model_name,
                             filename=model_file,
                             local_dir='/content',
                             token=Hugging_Face_Read_Token)
  llm = Llama(model_path=model_path,
            n_gpu_layers=-1)

clear_output()
if GGUF_Format == False:
  print(f"Downloaded {Google_Gemma_Model}")
else:
  print(f"Downloaded {Google_Gemma_Model} in GGUF Format")

In [None]:
#@title Run Gemma UI

def answer(user_prompt, system_prompt, max_new_tokens, repetition_penalty, temperature, top_p, top_k, seed):
    """Generate a completion for the given prompts with the loaded backend.

    Uses the notebook-level ``llm`` (llama-cpp) when ``GGUF_Format`` is set,
    otherwise the notebook-level ``model`` and ``tokenizer`` (transformers).

    Args:
        user_prompt: Text typed by the user.
        system_prompt: Text placed directly in front of ``user_prompt``.
        max_new_tokens: Upper bound on the number of generated tokens.
        repetition_penalty: Penalty applied to already generated tokens.
        temperature: Sampling temperature; ``0`` selects greedy decoding in
            the transformers branch.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens kept for sampling.
        seed: Seed for the random number generator.

    Returns:
        The generated continuation as a string; the prompt itself is not
        echoed back in either branch.
    """
    # UI number widgets may deliver floats; these options must be integers.
    max_new_tokens = int(max_new_tokens)
    top_k = int(top_k)
    seed = int(seed)

    # Concatenate system and user prompts
    full_prompt = system_prompt + user_prompt

    if GGUF_Format:
        response = llm(
            full_prompt,
            max_tokens=max_new_tokens,
            repeat_penalty=repetition_penalty,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            seed=seed,
        )
        return response['choices'][0]['text']

    torch.manual_seed(seed)

    # Tokenize the combined prompt and put the tensors on the model's device,
    # wherever the model was loaded.
    inputs = tokenizer(full_prompt, return_tensors="pt", return_attention_mask=False).to(model.device)

    generate_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }
    if temperature > 0:
        generate_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
        )
    else:
        # Sampling requires a strictly positive temperature; the UI slider can
        # reach 0, so fall back to deterministic greedy decoding there.
        generate_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **generate_kwargs)

    # Decode only the tokens produced after the prompt. Slicing the decoded
    # string by len(system_prompt) is unreliable because special tokens shift
    # the character offsets.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return completion.strip()

# Page title: the model name, tagged when the GGUF build is in use
UI_Title = Google_Gemma_Model if not GGUF_Format else f"{Google_Gemma_Model} GGUF Format"

# Input widgets, listed in the same order as the parameters of `answer`
prompt_box = gr.Textbox(label="Prompt", interactive=True)
system_prompt_box = gr.Textbox(label="System Prompt", interactive=True)
max_tokens_slider = gr.Slider(
    label="Max new tokens",
    value=2048,
    minimum=0,
    maximum=8192,
    step=1,
    interactive=True,
    info="The maximum numbers of new tokens, controls how long is the output",
)
repetition_slider = gr.Slider(
    label="Repetition penalty",
    value=1.2,
    minimum=1.0,
    maximum=2.0,
    step=0.05,
    interactive=True,
    info="Penalize repeated tokens, making the AI repeat less itself",
)
temperature_slider = gr.Slider(
    label="Temperature",
    value=0.9,
    minimum=0.0,
    maximum=1.0,
    step=0.05,
    interactive=True,
    info="Higher values produce more diverse outputs",
)
top_p_slider = gr.Slider(
    label="Top-p (nucleus sampling)",
    value=0.90,
    minimum=0.0,
    maximum=1,
    step=0.05,
    interactive=True,
    info="Higher values sample more low-probability tokens",
)
top_k_slider = gr.Slider(
    label="Top-k",
    value=1,
    minimum=0,
    maximum=100,
    step=1,
    interactive=True,
    info="Higher k means more diverse outputs by considering a range of tokens",
)
seed_field = gr.Number(
    label="Seed",
    value=42,
    interactive=True,
    info="A starting point to initiate the generation process",
)

# Wire the widgets to `answer`; the result is rendered as plain text
interface = gr.Interface(
    fn=answer,
    inputs=[
        prompt_box,
        system_prompt_box,
        max_tokens_slider,
        repetition_slider,
        temperature_slider,
        top_p_slider,
        top_k_slider,
        seed_field,
    ],
    outputs="text",
    title=UI_Title,
)

# Start the app with a public share link and blocking debug output
interface.launch(share=True, debug=True)