In [None]:
def split_text(text, max_length=1000):
    """Cut ``text`` into consecutive slices of at most ``max_length`` items.

    Slices start at offsets 0, max_length, 2 * max_length, ... so a boundary
    can fall in the middle of a word. The last slice holds the remainder and
    may be shorter. An empty ``text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), max_length):
        pieces.append(text[start:start + max_length])
    return pieces


In [None]:
def summarize_chunk(chunk):
    """Ask the completion model for a summary of ``chunk``.

    Sends one completion request through the global ``client`` (expected to
    be an ``openai.OpenAI`` instance created elsewhere in the notebook) and
    returns the text of the first choice with surrounding whitespace removed.

    Args:
        chunk: Text to summarize.

    Returns:
        The stripped completion text of the first returned choice.
    """
    prompt = f"""
    Please summarize the main points discussed in the following text in Polish:
    {chunk}
    """

    # An `OpenAI()` client instance exposes this endpoint as
    # `client.completions.create` and selects the model with `model=`.
    # `client.Completion` / `engine=` are pre-1.0 module-level names and are
    # not available on the instance.
    response = client.completions.create(
        model="mistral-7b",
        prompt=prompt,
        max_tokens=150,
        n=1,
        stop=None,
        temperature=0.7,
    )

    return response.choices[0].text.strip()


In [None]:
def summarize_text(text):
    """Summarize a long text in two passes.

    The text is split with ``split_text``, each chunk is summarized with
    ``summarize_chunk``, and the space-joined chunk summaries are summarized
    once more to produce the final result.

    Args:
        text: The text to summarize.

    Returns:
        The final summary, or ``""`` when ``text`` is empty or whitespace
        only (no model call is made in that case).
    """
    # Nothing to summarize: do not send an empty prompt to the model.
    if not text or not text.strip():
        return ""

    chunks = split_text(text)
    summaries = [summarize_chunk(chunk) for chunk in chunks]
    combined_summaries = " ".join(summaries)
    return summarize_chunk(combined_summaries)

# Usage example: summarize a placeholder text and print the result.
# NOTE(review): in the visible part of the notebook `client` is only created
# in a later cell, so this call presumably fails when cells are run top to
# bottom on a fresh kernel — confirm.
text = "Twój bardzo długi tekst..."
final_summary = summarize_text(text)
print(final_summary)


In [None]:
import os
import ssl

import openai
from openai import OpenAI

# Location of the CA bundle used to verify the server certificate.
# expanduser("~") replaces os.environ.get("HOME") so that an unset HOME
# variable cannot turn the concatenation into `None + str` (TypeError).
HOME = os.path.expanduser("~")
CAFILE = os.path.join(HOME, "cacert.pem")
API = "https://patagonia-aigen-srv:8080/llm-large/v1"

# OpenAI client configuration.
# Legacy module-level settings, kept for any code that still reads them.
openai.api_base = API
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.verify_ssl_certs = True
openai.ca_certs = CAFILE

# An `OpenAI()` instance is configured through its constructor (or the
# OPENAI_* environment variables), not through the module attributes above,
# so the endpoint, key and CA bundle are passed explicitly.
# ssl.create_default_context raises immediately if CAFILE cannot be loaded.
client = OpenAI(
    base_url=API,
    api_key=os.getenv("OPENAI_API_KEY"),
    http_client=openai.DefaultHttpxClient(
        verify=ssl.create_default_context(cafile=CAFILE),
    ),
)

def split_text(text, max_length=1000):
    """Return ``text`` divided into back-to-back pieces of ``max_length`` items.

    Pieces are taken at fixed offsets without regard to word boundaries; the
    final piece may be shorter than ``max_length``.
    """
    offsets = range(0, len(text), max_length)
    return [text[offset:offset + max_length] for offset in offsets]

def summarize_chunk(chunk):
    """Request a summary of ``chunk`` from the completion model.

    Uses the global ``client`` to send a single completion request and
    returns the first choice's text, stripped of surrounding whitespace.

    Args:
        chunk: Text to summarize.

    Returns:
        The stripped completion text of the first returned choice.
    """
    prompt = f"""
    Please summarize the main points discussed in the following text in Polish:
    {chunk}
    """

    # On an `OpenAI()` client instance the completions endpoint is
    # `client.completions.create` and the model is chosen with `model=`.
    # `client.Completion` and `engine=` come from the pre-1.0 module API.
    response = client.completions.create(
        model="mistral-7b",
        prompt=prompt,
        max_tokens=150,
        n=1,
        stop=None,
        temperature=0.7,
    )

    return response.choices[0].text.strip()

def summarize_text(text):
    """Produce one summary for ``text`` via per-chunk summaries.

    Each chunk from ``split_text`` is summarized with ``summarize_chunk``;
    the chunk summaries are joined with spaces and summarized again.

    Args:
        text: The text to summarize.

    Returns:
        The final summary. Empty or whitespace-only input returns ``""``
        without contacting the model.
    """
    # Guard against sending a prompt that contains no text at all.
    if not text or not text.strip():
        return ""

    partial_summaries = [summarize_chunk(piece) for piece in split_text(text)]
    return summarize_chunk(" ".join(partial_summaries))

# Usage example: summarize a placeholder text and print the result.
text = "Twój bardzo długi tekst..."
final_summary = summarize_text(text)
print(final_summary)


## Vector embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Load the SBERT sentence-embedding model used by the embedding helpers.
embed_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [None]:
from sentence_transformers import SentenceTransformer

# Load the SBERT sentence-embedding model.
# NOTE(review): this cell repeats the preceding cell verbatim; one of the two
# can presumably be removed.
embed_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [None]:
def split_text(text, max_length=1000):
    """Split ``text`` into fixed-size windows of ``max_length`` items.

    Windows do not overlap and follow each other directly; only the last one
    can be shorter. Word boundaries are not taken into account.
    """
    def window(offset):
        return text[offset:offset + max_length]

    return [window(offset) for offset in range(0, len(text), max_length)]


In [None]:
def split_text(text, max_length=1000):
    """Break ``text`` into successive slices no longer than ``max_length``.

    Slicing is purely positional (every ``max_length`` items), so a slice can
    end in the middle of a word. Empty input gives an empty list.
    """
    return list(
        text[pos:pos + max_length]
        for pos in range(0, len(text), max_length)
    )


In [None]:
def embed_chunks(chunks):
    """Encode ``chunks`` with the global ``embed_model``.

    The encoder is called with ``convert_to_tensor=True`` and its result is
    returned unchanged.
    """
    return embed_model.encode(chunks, convert_to_tensor=True)


In [None]:
def summarize_embeddings(embeddings):
    """Return the mean of ``embeddings`` taken along axis 0.

    ``embeddings`` must provide a ``mean(axis=...)`` method.
    """
    return embeddings.mean(axis=0)


In [None]:
def summarize_chunk(chunk, model="mistral-7b"):
    """Ask the completion model for a summary of ``chunk``.

    Args:
        chunk: Text to summarize.
        model: Name of the completion model sent to the API. It is a
            parameter rather than a global lookup because the name ``model``
            is bound to a SentenceTransformer object elsewhere in the
            notebook, which is not a valid model name for the API.

    Returns:
        The text of the first returned choice, stripped of surrounding
        whitespace.
    """
    prompt = f"Please summarize the main points discussed in the following text in Polish:\n{chunk}\n"

    # On an `OpenAI()` client instance the endpoint is
    # `client.completions.create`; `client.Completion` is the pre-1.0 name.
    response = client.completions.create(
        model=model,
        prompt=prompt,
        max_tokens=150,
        n=1,
        stop=None,
        temperature=0.7,
    )

    return response.choices[0].text.strip()


In [None]:
def summarize_text(text):
    """Summarize a long text in two passes over its chunks.

    Chunks from ``split_text`` are summarized one by one with
    ``summarize_chunk``; the space-joined chunk summaries are then summarized
    again to give the final result.

    Chunk embeddings are not computed here: the summary is built from the
    chunk texts only, so encoding every chunk would be an unused extra pass.

    Args:
        text: The text to summarize.

    Returns:
        The final summary, or ``""`` for empty or whitespace-only input (no
        model call is made in that case).
    """
    # Nothing to summarize: do not send an empty prompt to the model.
    if not text or not text.strip():
        return ""

    chunks = split_text(text)
    preliminary_summaries = [summarize_chunk(chunk) for chunk in chunks]
    combined_summaries = " ".join(preliminary_summaries)
    return summarize_chunk(combined_summaries)

# Usage example: summarize a placeholder text and print the result.
text = "Twój bardzo długi tekst..."
final_summary = summarize_text(text)
print(final_summary)


### Mistral 7b do osadzania wektorów?

In [None]:
def get_embeddings(text, model="mistral-7b"):
    """Request an embedding vector for ``text`` from the embeddings endpoint.

    Args:
        text: Text to embed. It is sent as-is: an embeddings endpoint encodes
            its input verbatim, so an instruction such as "Generate
            embeddings for ..." would only become part of the embedded
            content.
        model: Model name sent to the API.
            NOTE(review): confirm the server exposes an embedding-capable
            model under this name.

    Returns:
        The embedding of the single input, as returned by the API.
    """
    # On an `OpenAI()` client instance the endpoint is
    # `client.embeddings.create`; `client.Embeddings` is the pre-1.0 name.
    response = client.embeddings.create(
        model=model,
        input=text,
    )

    # The client returns typed response objects, so use attribute access
    # instead of dict-style subscripting.
    return response.data[0].embedding


In [None]:
def split_text(text, max_length=1000):
    """Return the consecutive ``max_length``-sized slices of ``text``.

    The slices cover ``text`` from start to end without overlap; the last one
    may be shorter. Slice boundaries ignore word boundaries.
    """
    starts = range(0, len(text), max_length)
    return list(map(lambda begin: text[begin:begin + max_length], starts))


In [None]:
def embed_chunks(chunks):
    """Return one embedding per chunk, in the order of ``chunks``.

    Each chunk is passed to ``get_embeddings`` individually.
    """
    vectors = []
    for piece in chunks:
        vectors.append(get_embeddings(piece))
    return vectors


In [None]:
import numpy as np

def summarize_embeddings(embeddings):
    """Return the element-wise mean of ``embeddings`` over axis 0 (via ``np.mean``)."""
    return np.mean(embeddings, axis=0)


In [None]:
def summarize_chunk(chunk, model="mistral-7b"):
    """Request a summary of ``chunk`` from the completion model.

    Args:
        chunk: Text to summarize.
        model: Name of the completion model sent to the API. Passed
            explicitly instead of read from a global, because ``model`` is
            bound to a SentenceTransformer object elsewhere in the notebook.

    Returns:
        The first choice's text with surrounding whitespace removed.
    """
    prompt = f"Please summarize the main points discussed in the following text in Polish:\n{chunk}\n"

    # `client.completions.create` is the completions endpoint on an
    # `OpenAI()` client instance; `client.Completion` is the pre-1.0 name.
    response = client.completions.create(
        model=model,
        prompt=prompt,
        max_tokens=150,
        n=1,
        stop=None,
        temperature=0.7,
    )

    return response.choices[0].text.strip()


In [None]:
def summarize_text(text):
    """Produce one summary for ``text`` via per-chunk summaries.

    Each chunk from ``split_text`` is summarized with ``summarize_chunk``;
    the chunk summaries are joined with spaces and summarized again.

    No embeddings are requested here: they play no part in building the
    summary, and requesting one per chunk would add a remote call per chunk
    whose result is discarded.

    Args:
        text: The text to summarize.

    Returns:
        The final summary. Empty or whitespace-only input returns ``""``
        without contacting the model.
    """
    # Guard against sending a prompt that contains no text at all.
    if not text or not text.strip():
        return ""

    preliminary_summaries = [summarize_chunk(piece) for piece in split_text(text)]
    return summarize_chunk(" ".join(preliminary_summaries))

# Usage example: summarize a placeholder text and print the result.
text = "Twój bardzo długi tekst..."
final_summary = summarize_text(text)
print(final_summary)


### wektory osadzone

In [None]:
from sentence_transformers import SentenceTransformer

# Load the SBERT sentence-embedding model (bound to the global name `model`).
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [None]:
# Small in-memory corpus used to demonstrate the vector index.
documents = [
    "This is a document about machine learning.",
    "Another document related to artificial intelligence.",
    "More text about data science and machine learning.",
    "Different topic about natural language processing.",
    "Last document about deep learning."
]

# Generate one embedding per document.
embeddings = model.encode(documents)


In [None]:
import faiss
import numpy as np

# Create the FAISS index.
dimension = embeddings.shape[1]  # Embedding dimensionality
index = faiss.IndexFlatL2(dimension)

# Add the embeddings to the index and report how many vectors it holds.
index.add(embeddings)
print(f"Number of vectors in the index: {index.ntotal}")


In [None]:
# Example query, encoded as a one-element batch.
query = "Information about AI and machine learning."
query_embedding = model.encode([query])

# Find the 3 most similar documents.
k = 3
distances, indices = index.search(query_embedding, k)

# Show the results; row 0 of `indices` / `distances` belongs to the single query.
print(f"Query: {query}")
for i, idx in enumerate(indices[0]):
    print(f"Result {i+1}: {documents[idx]} (Distance: {distances[0][i]})")


FAISS - vector embedding

In [None]:
from sentence_transformers import SentenceTransformer

# Load the SBERT sentence-embedding model (bound to the global name `model`).
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [None]:
# Small in-memory corpus used to demonstrate the vector index.
documents = [
    "This is a document about machine learning.",
    "Another document related to artificial intelligence.",
    "More text about data science and machine learning.",
    "Different topic about natural language processing.",
    "Last document about deep learning."
]

# Generate one embedding per document.
embeddings = model.encode(documents)


In [None]:
import faiss
import numpy as np

# Create the FAISS index.
dimension = embeddings.shape[1]  # Embedding dimensionality
index = faiss.IndexFlatL2(dimension)  # Uses the L2 (Euclidean distance) metric

# Report the index's `is_trained` flag (this does not check whether the index is empty).
print(f"Is index trained? {index.is_trained}")


In [None]:
# Add the embeddings to the index.
index.add(embeddings)

# Report how many vectors the index now holds.
print(f"Number of vectors in the index: {index.ntotal}")


In [None]:
# Example query, encoded as a one-element batch.
query = "Information about AI and machine learning."
query_embedding = model.encode([query])

# Find the 3 most similar documents.
k = 3
distances, indices = index.search(query_embedding, k)

# Show the results; row 0 of `indices` / `distances` belongs to the single query.
print(f"Query: {query}")
for i, idx in enumerate(indices[0]):
    print(f"Result {i+1}: {documents[idx]} (Distance: {distances[0][i]})")


In [None]:
# Save the index to a file in the current working directory.
faiss.write_index(index, "vector_index.faiss")

# Load the index back from that file (rebinds `index`).
index = faiss.read_index("vector_index.faiss")

# Report how many vectors the loaded index holds.
print(f"Number of vectors in the loaded index: {index.ntotal}")


### Vector embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

# Load the SBERT sentence-embedding model (bound to the global name `model`).
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def load_documents_from_folder(folder_path):
    """Read every ``.txt`` file located directly in ``folder_path``.

    File names are processed in sorted order so that a document's position
    in the returned list is the same on every run; ``os.listdir`` itself
    returns entries in arbitrary order. Entries whose name ends with
    ``.txt`` but which are not regular files are skipped.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        A list with the UTF-8 decoded contents of each ``.txt`` file.
    """
    documents = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(folder_path, filename)
        # A directory named "*.txt" would make open() fail.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
    return documents

# Path to the folder with the text files to index.
folder_path = './documents'
documents = load_documents_from_folder(folder_path)

# Fail early with a clear message: with no documents there is nothing to
# encode or index, and the steps below assume at least one embedding.
if not documents:
    raise ValueError(f"No .txt documents found in {folder_path!r}")

# Generate one embedding per document.
embeddings = model.encode(documents)

# Create the FAISS index.
dimension = embeddings.shape[1]  # Embedding dimensionality
index = faiss.IndexFlatL2(dimension)  # Uses the L2 (Euclidean distance) metric

# Add the embeddings to the index.
index.add(embeddings)

# Report how many vectors the index holds.
print(f"Number of vectors in the index: {index.ntotal}")

# Example query, encoded as a one-element batch.
query = "Information about AI and machine learning."
query_embedding = model.encode([query])

# Find the 3 most similar documents.
k = 3
distances, indices = index.search(query_embedding, k)

# Show the results; row 0 of `indices` / `distances` belongs to the single query.
print(f"Query: {query}")
for i, idx in enumerate(indices[0]):
    # When the index holds fewer than k vectors, FAISS fills the missing
    # slots with -1; documents[-1] would print the last document instead.
    if idx < 0:
        continue
    print(f"Result {i+1}: {documents[idx]} (Distance: {distances[0][i]})")

# Save the index to a file in the current working directory.
faiss.write_index(index, "vector_index.faiss")

# Load the index back from that file (rebinds `index`).
index = faiss.read_index("vector_index.faiss")

# Report how many vectors the loaded index holds.
print(f"Number of vectors in the loaded index: {index.ntotal}")


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

# Load the SBERT sentence-embedding model (bound to the global name `model`).
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def load_documents_from_folder(folder_path):
    """Return the contents of all ``.txt`` files directly inside ``folder_path``.

    The files are read in sorted name order. ``os.listdir`` gives no ordering
    guarantee, and a stable order keeps document positions reproducible
    between runs. Non-file entries ending in ``.txt`` are ignored.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of file contents decoded as UTF-8, one item per file.
    """
    txt_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".txt")
    )
    documents = []
    for name in txt_names:
        path = os.path.join(folder_path, name)
        # Skip directories that merely carry a .txt suffix.
        if os.path.isfile(path):
            with open(path, 'r', encoding='utf-8') as handle:
                documents.append(handle.read())
    return documents

def split_text_into_chunks(text, max_length=1000):
    """Cut ``text`` into consecutive chunks of at most ``max_length`` items.

    Chunk boundaries are fixed offsets (multiples of ``max_length``) and do
    not respect word boundaries; the last chunk may be shorter.
    """
    chunks = []
    for start in range(0, len(text), max_length):
        end = start + max_length
        chunks.append(text[start:end])
    return chunks

def generate_embeddings_for_chunks(documents, max_length=1000):
    """Embed every chunk of every document with the global ``model``.

    Each document is split with ``split_text_into_chunks(document,
    max_length)`` and its chunks are encoded in one ``model.encode`` call.
    The chunk vectors of all documents are collected, in document order,
    into a single NumPy array. Only the vectors are returned; the chunk
    texts are not kept.
    """
    collected = []
    for doc in documents:
        pieces = split_text_into_chunks(doc, max_length)
        collected.extend(model.encode(pieces))
    return np.array(collected)

# Path to the folder with the text files to index.
folder_path = './documents'
documents = load_documents_from_folder(folder_path)

# Generate embeddings for the document chunks.
embeddings = generate_embeddings_for_chunks(documents)

# Create the FAISS index.
dimension = embeddings.shape[1]  # Embedding dimensionality
index = faiss.IndexFlatL2(dimension)  # Uses the L2 (Euclidean distance) metric

# Add the embeddings to the index.
index.add(embeddings)

# Report how many vectors the index holds.
print(f"Number of vectors in the index: {index.ntotal}")

# Example query, encoded as a one-element batch.
query = "Information about AI and machine learning."
query_embedding = model.encode([query])

# Find the 3 most similar chunks.
k = 3
distances, indices = index.search(query_embedding, k)

# Show the results. Only the chunk's position in the index is printed.
# NOTE(review): the chunk texts are not kept anywhere, so mapping a hit back
# to its text would require storing the chunks alongside the embeddings.
print(f"Query: {query}")
for i, idx in enumerate(indices[0]):
    print(f"Result {i+1}: Fragment {idx} (Distance: {distances[0][i]})")

# Save the index to a file in the current working directory.
faiss.write_index(index, "vector_index.faiss")

# Load the index back from that file (rebinds `index`).
index = faiss.read_index("vector_index.faiss")

# Report how many vectors the loaded index holds.
print(f"Number of vectors in the loaded index: {index.ntotal}")


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

# Load the SBERT sentence-embedding model (bound to the global name `model`).
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def load_documents_from_folder(folder_path):
    """Load the text of each ``.txt`` file found directly in ``folder_path``.

    Names are sorted before reading because ``os.listdir`` returns them in
    arbitrary order; sorting makes the order of the returned list
    reproducible. Entries that end in ``.txt`` but are not regular files are
    skipped.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        A list with the UTF-8 decoded contents of the files, in name order.
    """
    documents = []
    for filename in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, filename)
        # Only regular *.txt files are read; open() would fail on a directory.
        if filename.endswith(".txt") and os.path.isfile(full_path):
            with open(full_path, 'r', encoding='utf-8') as file:
                documents.append(file.read())
    return documents

def split_text_into_chunks(text, max_length=1000):
    """Return ``text`` divided into back-to-back chunks of ``max_length`` items.

    Chunks are taken at fixed offsets without regard to word boundaries; the
    final chunk may be shorter than ``max_length``.
    """
    offsets = range(0, len(text), max_length)
    return [text[offset:offset + max_length] for offset in offsets]

def generate_embeddings_for_chunks(documents, max_length=1000):
    """Return the chunk embeddings of all ``documents`` as one NumPy array.

    Every document is chunked with ``split_text_into_chunks`` (using
    ``max_length``) and the chunks are encoded with the global ``model``.
    Vectors are appended in document order; chunk texts are not returned.
    """
    vectors = []
    for text in documents:
        vectors.extend(model.encode(split_text_into_chunks(text, max_length)))
    return np.array(vectors)

# Path to the folder with the text files to index.
# NOTE(review): 'path_to_your_folder/documents' looks like a placeholder — set
# it to a real directory before running.
folder_path = 'path_to_your_folder/documents'
documents = load_documents_from_folder(folder_path)

# Generate embeddings for the document chunks.
embeddings = generate_embeddings_for_chunks(documents)

# Create the FAISS index.
dimension = embeddings.shape[1]  # Embedding dimensionality
index = faiss.IndexFlatL2(dimension)  # Uses the L2 (Euclidean distance) metric

# Add the embeddings to the index.
index.add(embeddings)

# Report how many vectors the index holds.
print(f"Number of vectors in the index: {index.ntotal}")

# Example query, encoded as a one-element batch.
query = "Information about AI and machine learning."
query_embedding = model.encode([query])

# Find the 3 most similar chunks.
k = 3
distances, indices = index.search(query_embedding, k)

# Show the results. Only the chunk's position in the index is printed.
print(f"Query: {query}")
for i, idx in enumerate(indices[0]):
    print(f"Result {i+1}: Fragment {idx} (Distance: {distances[0][i]})")

# Save the index to a file in the current working directory.
faiss.write_index(index, "vector_index.faiss")

# Load the index back from that file (rebinds `index`).
index = faiss.read_index("vector_index.faiss")

# Report how many vectors the loaded index holds.
print(f"Number of vectors in the loaded index: {index.ntotal}")


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

# Load the SBERT sentence-embedding model (bound to the global name `model`).
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def load_documents_from_folder(folder_path):
    """Collect the contents of the ``.txt`` files directly under ``folder_path``.

    The directory listing is sorted first: ``os.listdir`` does not define an
    order, and the position of a document in the result should not change
    from run to run. Entries with a ``.txt`` suffix that are not regular
    files are left out.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of UTF-8 decoded file contents, ordered by file name.
    """
    candidates = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.endswith(".txt")
    ]
    documents = []
    for path in candidates:
        # A directory with a .txt suffix cannot be opened as a file.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
    return documents

def split_text_into_chunks(text, max_length=1000):
    """Split ``text`` into fixed-size windows of ``max_length`` items.

    Windows follow each other directly without overlap; only the last one can
    be shorter. Word boundaries are not taken into account.
    """
    def window(offset):
        return text[offset:offset + max_length]

    return [window(offset) for offset in range(0, len(text), max_length)]

def generate_embeddings_for_chunks(documents, max_length=1000):
    """Encode the chunks of each document and gather the vectors in one array.

    Documents are processed in order: each is split by
    ``split_text_into_chunks(document, max_length)`` and the resulting chunks
    are encoded with the global ``model``. The individual chunk vectors are
    appended to a list that is converted to a NumPy array at the end.
    """
    rows = []
    for document in documents:
        encoded = model.encode(split_text_into_chunks(document, max_length))
        for row in encoded:
            rows.append(row)
    return np.array(rows)

# Path to the folder with the text files to index.
# NOTE(review): 'path_to_your_folder/documents' looks like a placeholder — set
# it to a real directory before running.
folder_path = 'path_to_your_folder/documents'
documents = load_documents_from_folder(folder_path)

# Generate embeddings for the document chunks.
embeddings = generate_embeddings_for_chunks(documents)

# Create the FAISS index.
dimension = embeddings.shape[1]  # Embedding dimensionality
index = faiss.IndexFlatL2(dimension)  # Uses the L2 (Euclidean distance) metric

# Add the embeddings to the index.
index.add(embeddings)

# Report how many vectors the index holds.
print(f"Number of vectors in the index: {index.ntotal}")

# Example query, encoded as a one-element batch.
query = "Information about AI and machine learning."
query_embedding = model.encode([query])

# Find the 3 most similar chunks.
k = 3
distances, indices = index.search(query_embedding, k)

# Show the results. Only the chunk's position in the index is printed.
print(f"Query: {query}")
for i, idx in enumerate(indices[0]):
    print(f"Result {i+1}: Fragment {idx} (Distance: {distances[0][i]})")

# Save the index to a file in the current working directory.
faiss.write_index(index, "vector_index.faiss")

# Load the index back from that file (rebinds `index`).
index = faiss.read_index("vector_index.faiss")

# Report how many vectors the loaded index holds.
print(f"Number of vectors in the loaded index: {index.ntotal}")
