<a href="https://colab.research.google.com/github/Nick088Official/zephyr-7b-gemma-v0.1_Google_Colab/blob/main/zephyr-7b-gemma-v0.1_Manual.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run the zephyr-7b-gemma-v0.1 AI Model (a Fine-Tuned Version of Google Gemma 7B) — No Web UI

Model Used: https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1

Google Colab Made by [Nick088](https://linktr.ee/Nick088)

In [None]:
#@title Install & Load Dependencies, Model

#@markdown If you wanna use GPU (faster, max 12 free hours daily limit): Set the Video Card from Edit -> Notebook Settings -> T4 GPU OR Any other GPUs based on your Google Colab Subscription
!pip install einops
!pip install accelerate
import torch
from IPython.display import clear_output

if torch.cuda.is_available():
    device = "cuda"
    print("Using GPU")
else:
    device = "cpu"
    print("Detected CPU, please change to GPU or it won't work.")

# Now you can use 'device' for your PyTorch operations

torch.set_default_device(device)

#@markdown The normal version is the official one (PAID GPU REQUIRED), the other ones are GGUF quantized (WORK FOR FREE COLAB TOO), compressed to consume less ram, see more info about them in the [Hugging Face repo of zephyr-7b-gemma-v0.1 GGUF](https://huggingface.co/LoneStriker/zephyr-7b-gemma-v0.1-GGUF)

model_version = "Q3_K_L" #@param ['normal', 'Q3_K_L', 'Q5_K_M', 'Q6_K', 'Q8_0']

if model_version == "normal":
  from transformers import AutoModelForCausalLM, AutoTokenizer
  if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-gemma-v0.1", torch_dtype="auto", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-gemma-v0.1", trust_remote_code=True)
  else:
    model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-gemma-v0.1", torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-gemma-v0.1", trust_remote_code=True)
else:
  !nvidia-smi
  if device == "cuda":
    !CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
  else:
    !pip install llama-cpp-python
  from huggingface_hub import hf_hub_download
  from llama_cpp import Llama
  model_name = "LoneStriker/zephyr-7b-gemma-v0.1-GGUF"
  model_file = f"zephyr-7b-gemma-v0.1-{model_version}.gguf"
  model_path = hf_hub_download(model_name,
                             filename=model_file,
                             local_dir='/content')
  llm = Llama(model_path=model_path,
            n_gpu_layers=-1)

clear_output()
print(f"Downladed zephyr-7b-gemma-v0.1 {model_version}")

In [None]:
#@title Run Inference
#@markdown Your Prompt
user_prompt = "What is finetuning?" #@param {type:"string"}

# Additional inputs
#@markdown Add your system prompt (prompt to personalize the AI) here or leave it empty if you wanna use the AI normally
system_prompt = "You are ShortAI, write short but concise responses"  #@param {type:"string"}
#@markdown The maximum number of tokens that the model will generate in response to your input
max_new_tokens = 3099 #@param {type:"slider", min:256, max:8192, step:1}
# The slider starts at 0.1 because transformers rejects a non-positive repetition penalty.
#@markdown Penalize repeated tokens, so make the AI repeat less of itself
repetition_penalty = 1.2 #@param {type:"slider", min:0.1, max:2, step:0.1}

# Combine the system and user prompts, separated by a blank line so the two
# texts do not run together; an empty system prompt is simply skipped.
# NOTE(review): the model's chat template is not applied here — confirm whether
# plain-text prompting is intended.
full_prompt = "\n\n".join(
    part for part in (system_prompt.strip(), user_prompt.strip()) if part
)

if model_version == "normal":
  # Tokenize the combined prompt and place the tensors on the model's device.
  inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

  # Generate a continuation of the prompt.
  outputs = model.generate(
      **inputs,
      max_new_tokens=max_new_tokens,
      repetition_penalty=repetition_penalty
      )

  # The returned sequence starts with the prompt tokens. Keep only the tokens
  # produced after them and drop special tokens, so the printed text is just
  # the model's answer (slicing the decoded string by character count is
  # unreliable because of special tokens and the user prompt).
  prompt_length = inputs["input_ids"].shape[-1]
  text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
  clear_output()
  print(text)
else:
  # GGUF path: the completion text is read from the first choice of the response.
  response = llm(full_prompt, max_tokens=max_new_tokens, repeat_penalty=repetition_penalty)
  clear_output()
  print(response['choices'][0]['text'])