In [1]:
import torch
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#model_id = "meta-llama/Llama-3.2-1B"
model_id = "meta-llama/Llama-3.2-1B-Instruct"


Data Ingestion

In [3]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    ServiceContext,
    load_index_from_storage
)
reader = SimpleDirectoryReader(input_files=["D:/LLama3/documents/test_2.txt"])
documents = reader.load_data()


In [4]:
len(documents)

1

In [5]:
documents[0].metadata

{'file_path': 'D:\\LLama3\\documents\\test_2.txt',
 'file_name': 'test_2.txt',
 'file_type': 'text/plain',
 'file_size': 189,
 'creation_date': '2024-10-24',
 'last_modified_date': '2024-10-24'}

In [6]:
documents[0]

Document(id_='c467ba90-ae31-4b51-b6eb-bee7e4e1e9a5', embedding=None, metadata={'file_path': 'D:\\LLama3\\documents\\test_2.txt', 'file_name': 'test_2.txt', 'file_type': 'text/plain', 'file_size': 189, 'creation_date': '2024-10-24', 'last_modified_date': '2024-10-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Wenn du mit der Rakete zu spät bist, dann kannst du 3 Monate Haft und eine hohe Geldstrafe erhalten.\r\nDu wirst ebenfalls in einen Hochsicherheitstrakt gebracht und muss nur Brokkoli essen.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

Chunking

In [7]:
from llama_index.core.node_parser import SentenceSplitter


In [8]:
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
nodes = text_splitter.get_nodes_from_documents(documents, show_progress=True)

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 999.60it/s]


In [9]:
len(nodes)

1

In [10]:
nodes[0].metadata


{'file_path': 'D:\\LLama3\\documents\\test_2.txt',
 'file_name': 'test_2.txt',
 'file_type': 'text/plain',
 'file_size': 189,
 'creation_date': '2024-10-24',
 'last_modified_date': '2024-10-24'}

Embedding model (modell für embeddings verwendet, z.b. vektorisieren der daten)

In [11]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


In [12]:
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")


LLM Model

In [13]:

#from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.llms.ollama import Ollama

In [14]:
service_context_llm = Ollama(model="local:" + model_id)



In [15]:
#llm = HuggingFaceLLM(
#    context_window=4096,
#    max_new_tokens=2048,
#    #generate_kwargs={"temperature": 0.0, "do_sample": False},
#    #system_prompt=system_prompt,
#    #tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
#    model_name=model_id,
#    device_map="auto",
#    # loading model in 8bit for reducing memory
#    model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True}
#)

configure service context

In [16]:
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=service_context_llm)


  service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=service_context_llm)


Create Vector Store Index


In [17]:
vector_index = VectorStoreIndex.from_documents(documents, show_progress=True, service_context=service_context, node_parser=nodes)


Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 999.36it/s]
Generating embeddings: 100%|██████████| 1/1 [00:07<00:00,  7.28s/it]


Persist/Save Index


In [18]:
vector_index.storage_context.persist(persist_dir="D:/LLama3/vector/storage_mini")


Define Storage Context


In [19]:
storage_context = StorageContext.from_defaults(persist_dir="D:/LLama3/vector/storage_mini")


Load Index


In [20]:
index = load_index_from_storage(storage_context, service_context=service_context)


Define Query Engine


In [21]:
query_engine = index.as_query_engine(service_context=service_context)


Feed in user query


In [22]:
query = "Explain market bonds?"
resp = query_engine.query(query)

ConnectError: [WinError 10061] Es konnte keine Verbindung hergestellt werden, da der Zielcomputer die Verbindung verweigerte

Nur mit Model reden, ohne Data Ingestion

In [23]:
#model_id = "meta-llama/Llama-3.2-1B"
model_id = "meta-llama/Llama-3.2-1B-Instruct"


In [24]:
pipe = pipeline(
    "text-generation", 
    model=model_id, 
    torch_dtype=torch.bfloat16, 
    #device="cuda"
    device_map="auto"
)

In [None]:
messages = [
    {"role": "system", "content": "Du bist ein Anwalt, der immer nach deutschen Gesetzen korrekt antwortet!"},

    {"role": "user", "content": "Ich bin heute nicht mit meiner Rakete zum Mond gekommen. Mit was für Konsequenzen muss ich rechnen?"},
]
outputs = pipe(
    messages,
    max_new_tokens=2048,
)
print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
