<a href="https://colab.research.google.com/github/Nick088Official/Phi-2-Super-Google-Colab/blob/main/Phi2_Super_UI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phi2 Super AI Microsoft Model WEB UI
 Demo Made by Simone Rizzo & Modified by Nick088

This notebook is a demo for testing [Phi 2 Super](https://huggingface.co/abacaj/phi-2-super), a modified version of Microsoft's new Small Language Model Phi-2 (see the [Phi-2 article](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/)) that was trained on more data.

Demo Made by **Simone Rizzo**:
- [Youtube](https://www.youtube.com/channel/UCbMlkb79E12CwveGAtdFj-A)
- [Linkedin](https://www.linkedin.com/in/simone-rizzo-9851b7147/)
- [TikTok](https://www.tiktok.com/@simonerizzo98)

Modified by **Nick088**:
- [Youtube](https://www.youtube.com/@Nick088Official)
- [TikTok](https://www.tiktok.com/@forgotforever)
- [Reddit](https://www.reddit.com/user/Nick088Real)
- [Twitter](https://twitter.com/Nick088Official)
- [Discord](https://discord.com/channels/@me/911742715019001897)

Follow and leave a like on my socials 😜

In [None]:
#@title Install & Load Dependencies, Model

#@markdown If you wanna use CPU (slower, no daily limit): Set the CPU from Edit -> Notebook Settings -> CPU

#@markdown Use GPU (faster, max 12 free hours daily limit): Set the Video Card from Edit -> Notebook Settings -> T4 GPU OR Any other GPUs based on your Google Colab Subscription

!pip install einops
!pip install accelerate
import torch
from IPython.display import clear_output

if torch.cuda.is_available():
    device = "cuda"
    print("Using GPU")
else:
    device = "cpu"
    print("Using CPU")

# Now you can use 'device' for your PyTorch operations

torch.set_default_device(device)

#@markdown The normal version is the official one from Microsoft, the other ones are GGUF quantized (GPU REQUIRED FOR THOSE), compressed to consume less ram, see more info about them [in the Hugging Face repo of Phi 2 Super GGUF](https://huggingface.co/sayhan/phi-2-super-GGUF)

phi_2_super_model_version = "normal" #@param ['normal', 'Q2_K', 'Q3_K_S', 'Q3_K_M', 'Q3_K_L', 'Q4_0', 'Q4_K_M', 'Q4_K_S', 'Q5_0', 'Q5_K_S', 'Q5_K_M', 'Q6_K', 'Q8_0', 'FP16']

if phi_2_super_model_version == "normal":
  from transformers import AutoModelForCausalLM, AutoTokenizer
  if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained("abacaj/phi-2-super", model_file="phi-2-super.Q5_0.gguf", torch_dtype="auto", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("abacaj/phi-2-super", trust_remote_code=True)
  else:
    model = AutoModelForCausalLM.from_pretrained("abacaj/phi-2-super", torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("abacaj/phi-2-super", trust_remote_code=True)
else:
  !nvidia-smi
  !CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
  from huggingface_hub import hf_hub_download
  from llama_cpp import Llama
  model_name = "sayhan/phi-2-super-GGUF"
  model_file = f"phi-2-super.{phi_2_super_model_version}.gguf"
  model_path = hf_hub_download(model_name,
                               filename=model_file,
                               local_dir='/content')
  from llama_cpp import Llama
  llm = Llama(model_path=model_path,
            n_gpu_layers=-1)


clear_output()
print(f"Done downloading Phi 2 Super {phi_2_super_model_version}!")

In [None]:
#@title Run Phi 2 Super UI

def answer(user_prompt, system_prompt, max_new_tokens, repetition_penalty):
  """Generate a reply for the Gradio UI using whichever backend was loaded.

  Relies on notebook globals set by the setup cell:
  `phi_2_super_model_version`, plus `model`/`tokenizer` for the "normal"
  (transformers) version or `llm` for the GGUF (llama-cpp) versions.

  Args:
    user_prompt: Text typed by the user.
    system_prompt: Text prepended to the user prompt.
    max_new_tokens: Maximum number of tokens to generate.
    repetition_penalty: Repetition penalty forwarded to the backend.

  Returns:
    For the "normal" version, the decoded output with the leading system
    prompt cut off (the user prompt remains in the text). For GGUF versions,
    the text of the first completion choice.
  """
  # Treat a missing textbox value as an empty string.
  system_prompt = system_prompt or ""
  user_prompt = user_prompt or ""

  # UI sliders may deliver ints or floats; normalise them for the backends
  # (transformers requires repetition_penalty to be a float).
  max_new_tokens = int(max_new_tokens)
  repetition_penalty = float(repetition_penalty)

  # Build the prompt before branching so both backends can use it.
  full_prompt = system_prompt + user_prompt

  if phi_2_super_model_version == "normal":
    # Tokenize the combined prompt
    inputs = tokenizer(full_prompt, return_tensors="pt", return_attention_mask=False)

    # Generate text based on the combined prompt and additional inputs
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty
    )

    # The decoded text starts with the prompt; drop the system prompt part.
    text = tokenizer.batch_decode(outputs)[0]
    text = text[len(system_prompt):].strip()
    return text
  else:
    response = llm(full_prompt, max_tokens=max_new_tokens, repeat_penalty=repetition_penalty)
    return response['choices'][0]['text']

# Gradio is not imported anywhere else in the notebook, so bring it into scope
# here (the package must be installed in the runtime).
import gradio as gr

# Define the gradio interface: the inputs are passed positionally to `answer`
# as (user_prompt, system_prompt, max_new_tokens, repetition_penalty).
interface = gr.Interface(
    fn=answer,
    inputs=[
        gr.Textbox(lines=2, label="Your Prompt"),
        gr.Textbox(lines=2, label="System Prompt"),
        gr.Slider(value=512, minimum=256, maximum=2048, step=1, interactive=True, label="Max Tokens"),
        # The minimum is above zero because transformers rejects a repetition
        # penalty that is not strictly positive.
        gr.Slider(value=1.2, minimum=0.1, maximum=2.0, step=0.1, interactive=True, label="Repetition Penalty"),
    ],
    outputs="text",
    title="Phi 2 Super SLM",
)


# Launch the gradio interface with a public share link; debug=True keeps the
# cell running and prints errors in the notebook output.
interface.launch(share=True, debug=True)