# Инициализация

In [1]:
import chromadb
import requests

### Поиск эмбеддингов

In [2]:
def query_to_chromadb(query, n_results=2, path="./chroma_storage", collection_name="magic_document"):
    """Run a text query against a persistent ChromaDB collection.

    Args:
        query: Text to search for; passed as the only entry of ``query_texts``.
        n_results: Number of results to request from the collection.
        path: Directory of the persistent ChromaDB storage.
        collection_name: Name of the collection to query (it is created when
            it does not exist yet).

    Returns:
        The value returned by ``collection.query`` for the single query text.
    """
    collection = chromadb.PersistentClient(path=path).get_or_create_collection(
        name=collection_name
    )
    return collection.query(query_texts=[query], n_results=n_results)

def get_elements_by_ids(ids, path="./chroma_storage", collection_name="magic_document"):
    """Fetch records from a persistent ChromaDB collection by their IDs.

    Args:
        ids: ID or IDs forwarded unchanged to ``collection.get(ids=...)``.
        path: Directory of the persistent ChromaDB storage.
        collection_name: Name of the collection to read (it is created when
            it does not exist yet).

    Returns:
        The value returned by ``collection.get``, or ``-1`` when that value is
        falsy or carries no ``"documents"``.
    """
    storage = chromadb.PersistentClient(path=path)
    found = storage.get_or_create_collection(name=collection_name).get(ids=ids)

    # -1 is the "nothing found" sentinel of this function.
    has_documents = bool(found) and bool(found.get("documents"))
    return found if has_documents else -1

### LLM

In [8]:
def get_server_health(base_url='http://localhost:8080', timeout=10):
    """Query the server's ``/health`` endpoint and return the decoded JSON body.

    Args:
        base_url: Root URL of the server; a trailing slash is tolerated.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled server would block forever.

    Returns:
        The decoded JSON payload, e.g. ``{'status': 'ok'}``.
    """
    response = requests.get(f"{base_url.rstrip('/')}/health", timeout=timeout)
    return response.json()


def query_to_llm(user_input, context, base_url='http://localhost:8080'):
    """Send a completion request to the LLM server and return its reply.

    The server health is checked first; the completion request is only sent
    when the health payload reports ``status == 'ok'``.

    Args:
        user_input: Text sent verbatim as the ``prompt``.
        context: Currently unused (the prompt is just ``user_input``); kept so
            existing calls keep working.
        base_url: Root URL of the server exposing ``/health`` and ``/completion``.

    Returns:
        The decoded JSON body of the completion response when the server
        answers with HTTP 200; otherwise a human-readable error string.
    """
    health_status = get_server_health(base_url=base_url)
    # .get() so that a health payload without a "status" key is reported as an
    # error instead of raising KeyError.
    if health_status.get('status') != 'ok':
        # f-string: include the actual health payload in the message.
        return f"Error processing your request. Please try again.\n{health_status}"

    prompt = user_input
    data = {
        'prompt': prompt,
        'temperature': 0.1,
        'top_k': 35,
        'top_p': 0.95,
        'n_predict': 100,
        'stop': ["</s>", "Assistant:", "User:"]
    }
    headers = {'Content-Type': 'application/json'}
    response = requests.post(f'{base_url}/completion', json=data, headers=headers)
    if response.status_code == 200:
        return response.json()
    return "Error processing your request. Please try again."

In [4]:
# query_to_llm("Who are created you?", {}, base_url='http://localhost:8080')['content']

# Тестирование

In [13]:
query_db = """For what tasks are decision trees most suitable?"""

query_result = query_to_chromadb(query_db, n_results=5)

# The ID of the best hit is parsed as "<doc> <page>)": the first token is the
# document name, the second is the page number followed by a closing ")".
top_id_parts = query_result['ids'][0][0].split()
doc = top_id_parts[0]
page = int(top_id_parts[1][:-1])

# Fetch the neighbouring pages so the LLM gets the surrounding text as well.
# (Dedicated names are used so the builtin `id` is not shadowed.)
prev_page_id = f"{doc} {page - 1})"
next_page_id = f"{doc} {page + 1})"
dop_doc_1 = get_elements_by_ids(prev_page_id)
dop_doc_2 = get_elements_by_ids(next_page_id)

# Context order: previous page (if any), best hit, next page (if any).
docs = []
if dop_doc_1 != -1:
    docs.append(dop_doc_1['documents'][0])
# The best hit is always part of the context, even when the previous page
# does not exist.
docs.append(query_result['documents'][0][0])
if dop_doc_2 != -1:
    docs.append(dop_doc_2['documents'][0])

context = ' '.join(docs)

query_llm = f"""{query_db}\nUse the information from the hint: {context}"""

result = query_to_llm(query_llm, {}, base_url='http://localhost:8080')

result

{'content': ' Fig. illustrates the attribute selection process. The decision tree algorithm starts at the root with the set of all target outcomes in the training data set. Each of the column attributes is evaluated to determine how it partitions the target outcomes. An impurity measure is used to determine which attribute best partitions the target outcomes. Ideally, for each attribute value, the corresponding outcome subset contains identical target values. B ? A ? a a b b C ? c c t, t B ? b b',
 'id_slot': 0,
 'stop': True,
 'model': 'C:\\Users\\Igorexy\\.ai-navigator\\models\\openchat\\openchat-3.5-0106\\OpenChat-3.5-0106_Q8_0.gguf',
 'tokens_predicted': 100,
 'tokens_evaluated': 1254,
 'generation_settings': {'n_ctx': 8192,
  'n_predict': -1,
  'model': 'C:\\Users\\Igorexy\\.ai-navigator\\models\\openchat\\openchat-3.5-0106\\OpenChat-3.5-0106_Q8_0.gguf',
  'seed': 4294967295,
  'seed_cur': 1535966156,
  'temperature': 0.10000000149011612,
  'dynatemp_range': 0.0,
  'dynatemp_expon

In [55]:
# page = int(query_result['ids'][0][0].split()[1][:-1])
# doc = query_result['ids'][0][0].split()[0]

# id = doc + f" {page-1})"
# get_elements_by_ids(id)

{'ids': ['1806.05886v2 7)'],
 'embeddings': None,
 'documents': ['. A Deeper Look into the Operation of the Framework In order to visualize how the reinforcement learning framework preprocesses distorted images, we run another experiment on MNIST with coarser distortions, in particular rotations are performed with large angles degrees and ipping operations as in the previous experiment. For each distorted image, we trace the operation of the framework and obtain the transformation chain that the framework automatically generates for the image. An illustration for a few images is shown in Figure . It is interesting that most images are either classied directly or transformed to their original version before being classied. The exact recovery is possible thanks to the symmetry property of transformation actions. Although the framework is able to recover distorted images, it is not guaranteed to nd the optimal chain of transformations in term of the shortest recovery path. In addition, th

In [15]:
# Show only the generated text, without the other fields of the response.
print(result['content'])

 Fig. illustrates the attribute selection process. The decision tree algorithm starts at the root with the set of all target outcomes in the training data set. Each of the column attributes is evaluated to determine how it partitions the target outcomes. An impurity measure is used to determine which attribute best partitions the target outcomes. Ideally, for each attribute value, the corresponding outcome subset contains identical target values. B ? A ? a a b b C ? c c t, t B ? b b


In [26]:
# Append the generated text to a scratch file. The encoding is explicit so
# that non-ASCII model output is written the same way on every platform
# instead of depending on the locale's default code page.
with open('temp.txt', 'a', encoding='utf-8') as f:
    f.write(result['content'] + '\n')

In [25]:
# Ask the LLM to translate the previously generated answer into Russian.
query_llm = "Translate the text from English into Russian.\nText:\n{}".format(
    result['content']
)

res_translate = query_to_llm(query_llm, {}, base_url='http://localhost:8080')

res_translate['content']

' the decision tree T with k leaf nodes. The split trees, T, k, s, ks, bks (T, k, l, h, s, Xk, Ys, ks, bks), are generated from T with the split information. The Grow algorithm can be represented as follows. Grow algorithm is a tree growing algorithm where decision trees are generated from other trees in an online manner. The input is a decision tree T and the output is the search set of decision trees, denoted by S. The Grow algorithm is defined in Algorithm . The heuristic function h is used for partitioning the training data in an optimal way, so that the most informative partitioning of the data is found. The function GetSplitLeavesT, l, h takes an input decision tree T and computes l leaf nodes of T with the largest heuristic hs from LT . The function GenerateSplitRuless takes an input decision tree T and a number of decision trees B and generates the split information for each decision tree. The split information is a set of ks, bks. The Xk s denotes the subset of the decision tr