In [None]:
# 1. Install the dependencies (run only once in Colab)
!pip install unsloth flask bitsandbytes accelerate pyngrok -q

# 2. Imports
from unsloth import FastLanguageModel
from transformers import BitsAndBytesConfig, AutoTokenizer
import torch
from flask import Flask, request, jsonify
from pyngrok import ngrok
import threading


# 4. BitsAndBytes configuration: 4-bit NF4 quantization with double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True  # CPU offload enabled
)

# 5. Model name (Hugging Face Hub identifier passed to the loader below)
model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B"

# 6. Load the model and its tokenizer.
# NOTE(review): load_in_4bit=True is passed here in addition to
# quantization_config=bnb_config; presumably one of the two is redundant —
# confirm how the installed Unsloth version combines them.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=1024,
    dtype=None,
    load_in_4bit=True,
    device_map="auto",
    quantization_config=bnb_config
)

# Switch the model to evaluation mode; the code in this notebook only
# generates text with it.
model.eval()


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.6/146.6 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.5/31.5 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((409

In [None]:
from flask import Flask, request, jsonify

# 7. Create the Flask application
app = Flask(__name__)


@app.route("/", methods=["GET"])
def index():
    """Landing route: report that the API is up and point to POST /predict."""
    status_message = "✅ L’API est active. Utilisez POST /predict pour envoyer des prompts."
    return status_message


# Literal chat-role markers removed from the decoded text before it is returned.
_CHAT_TAGS = ("<|assistant|>", "<|endoftext|>", "<|system|>", "<|user|>")


def _strip_chat_tags(text):
    """Remove every marker listed in ``_CHAT_TAGS`` and trim outer whitespace."""
    for tag in _CHAT_TAGS:
        text = text.replace(tag, "")
    return text.strip()


@app.route("/predict", methods=["POST"])
def predict():
    """Generate text for the ``prompt`` field of a JSON request body.

    Expects a JSON object such as ``{"prompt": "..."}``.

    Returns:
        * 200 and ``{"response": <text>}`` on success. The text is the full
          decoded sequence returned by ``model.generate`` with chat markers
          stripped.
        * 400 and ``{"error": <message>}`` when the body is not a JSON object
          or ``prompt`` is missing, empty, or not a string.
        * 500 and ``{"error": <message>}`` when tokenization or generation
          raises.
    """
    # silent=True returns None instead of raising on a missing or malformed
    # JSON body, so client mistakes are reported as 400 rather than 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    prompt = data.get("prompt", "")
    if not isinstance(prompt, str) or not prompt.strip():
        return jsonify({"error": "Field 'prompt' must be a non-empty string."}), 400

    try:
        # Encode the prompt and move the tensors to the model's device.
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Sampling-based generation; gradients are not needed for inference.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=300,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )

        decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Top-level boundary of the request: keep the traceback in the server
        # log and return the same error payload shape as before.
        app.logger.exception("Text generation failed")
        return jsonify({"error": str(e)}), 500

    return jsonify({"response": _strip_chat_tags(decoded_output)})


In [None]:
from pyngrok import ngrok
import threading

import os

# The ngrok auth token is a credential and must not be committed with the
# notebook: read it from the environment instead. Any token that was
# previously hard-coded here should be revoked in the ngrok dashboard.
_ngrok_token = os.environ.get("NGROK_AUTH_TOKEN")
if not _ngrok_token:
    raise RuntimeError(
        "Set the NGROK_AUTH_TOKEN environment variable before starting the tunnel."
    )
ngrok.set_auth_token(_ngrok_token)

# 8. Start the Flask server and expose it through ngrok
port = 5000
public_url = ngrok.connect(port)
print(f"🚀 Serveur accessible publiquement à l’adresse : {public_url}")


def run():
    """Run the Flask application on ``port``."""
    app.run(port=port)


# Serve from a separate thread so the notebook cell returns immediately.
threading.Thread(target=run).start()

🚀 Serveur accessible publiquement à l’adresse : NgrokTunnel: "https://ae59-34-87-18-149.ngrok-free.app" -> "http://localhost:5000"
