In [1]:
!pip install transformers
!pip install faiss-cpu
!pip install sentence-transformers




In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from retriever import initialize_retriever, retrieve_top_chunks  # Import functions from retriever.py

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the causal LM for the chosen checkpoint. The weights
# are loaded in bfloat16 and then moved onto `device`.
model_name = "speakleash/Bielik-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)

# File locations of the FAISS index and its metadata, handed to the retriever
# initializer which returns the index, the metadata and the retriever model.
faiss_path = "/content/130_len_faiss_index.bin"
metadata_path = "/content/130_len_metadata.json"
index, metadata, retriever_model = initialize_retriever(
    faiss_path,
    metadata_path,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

FAISS index loaded successfully.
Metadata loaded successfully.
Retriever model initialized successfully.




In [3]:
def generate_response(query, index, metadata, retriever_model, initial_top_k=50, final_top_k=10,
                      max_context_tokens=400, max_new_tokens=100):
    """Answer ``query`` using retrieved chunks as grounding context.

    Relies on the module-level ``tokenizer``, ``model`` and ``device`` objects
    and on ``retrieve_top_chunks`` imported from ``retriever``.

    Args:
        query: The user's question.
        index, metadata, retriever_model: Retriever state, forwarded unchanged
            to ``retrieve_top_chunks``.
        initial_top_k: Forwarded to ``retrieve_top_chunks``.
        final_top_k: Forwarded to ``retrieve_top_chunks``.
        max_context_tokens: Upper bound on the number of tokens of retrieved
            text placed in the prompt. Only the context is shortened, so the
            question and the chat template's closing markers are never cut off.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The generated answer only (the prompt is not echoed back), or a fixed
        Polish fallback message when no context text was retrieved.
    """
    retrieved_chunks = retrieve_top_chunks(query, index, metadata, retriever_model, initial_top_k, final_top_k)

    # Join the chunk texts into one context string; bail out early if empty.
    context = " ".join(chunk['text'] for chunk in (retrieved_chunks or [])).strip()
    if not context:
        return "Nie udało się znaleźć odpowiednich informacji."

    # Shorten the context *before* templating. Truncating the fully templated
    # prompt would cut from the end and could drop the question or the
    # instruction-closing marker.
    context_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
    if len(context_ids) > max_context_tokens:
        context = tokenizer.decode(context_ids[:max_context_tokens], skip_special_tokens=True)

    # The retrieved text belongs in the user turn as grounding material.
    # Sending it as an "assistant" turn would present it to the model as an
    # answer it has already given.
    messages = [
        {"role": "system", "content": "Odpowiadaj krótko, precyzyjnie i wyłącznie w języku polskim."},
        {"role": "user", "content": f"Kontekst:\n{context}\n\nPytanie: {query}"},
    ]

    # return_dict=True yields both input_ids and the tokenizer's own
    # attention_mask. Deriving the mask from `input_ids != pad_token_id` is
    # wrong when the pad token aliases EOS, because real EOS tokens get masked.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)

    # Fall back to EOS for padding without mutating the shared tokenizer.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    generated_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=False,  # Deterministic (greedy) generation
        pad_token_id=pad_token_id,
    )

    # generate() returns prompt + continuation; keep only the new tokens so the
    # caller receives the answer rather than the whole prompt.
    prompt_length = inputs["input_ids"].shape[-1]
    answer_ids = generated_ids[0][prompt_length:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

In [None]:
# Smoke-test the generator: ask each sample question and print the
# question/answer pair.
questions = [
    "Co jest najważniejsze dla pracowników?",
    "Dlaczego ważna jest odpowiedzialność w biznesie?",
]

for question in questions:
    answer = generate_response(
        query=question,
        index=index,
        metadata=metadata,
        retriever_model=retriever_model,
    )
    print(f"Pytanie: {question}\nOdpowiedź: {answer}\n")

Query: co jest najważniejsze dla pracowników
Retrieved in 0.0844 seconds
Average similarity distance of results: 1.0523
Results count after filtering: 10

