In [2]:
# Remove any stale copy, then download the single-file minsearch module so the
# next cell can `import minsearch`.
# NOTE(review): the URL points at the `main` branch, so the downloaded code is
# not pinned to a specific version.
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-06-29 16:53:00--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3,7K) [text/plain]
Saving to: 'minsearch.py'

     0K ...                                                   100%  848K=0,004s

2024-06-29 16:53:00 (848 KB/s) - 'minsearch.py' saved [3832/3832]



In [3]:
import requests 
import minsearch

# Download the course FAQ: a JSON list with one entry per course, each carrying
# its name under 'course' and its FAQ records under 'documents'.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per FAQ entry, tagging each with the course it belongs
# to. Copies are built so `documents_raw` stays exactly as downloaded.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

# "question", "text" and "section" are registered as text fields; "course" is
# registered as a keyword field (it is what `filter_dict` is applied to later).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Kept as the last expression: fit() hands back the index, which the cell displays.
index.fit(documents)

<minsearch.Index at 0x17860b635b0>

In [4]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries matching ``query`` within a single course.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field must equal. Defaults to
            ``'data-engineering-zoomcamp'``, the value that used to be
            hard-coded here.
        num_results: Passed through as ``num_results`` to the index
            (default 5, the previously hard-coded value).

    Returns:
        Whatever ``index.search`` returns for these arguments.

    Relies on the module-level ``index`` built earlier in the notebook.
    """
    # Boost factors handed to the index: question x3.0, section x0.5.
    # "text" is not listed, so it is left at whatever the library defaults to.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [5]:
def build_prompt(query, search_results):
    """Assemble the prompt text: instructions, the question, then the FAQ context.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts providing ``section``, ``question``
            and ``text`` keys; each one becomes a block of the context.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    # Spelled out line by line so the trailing space after "CONTEXT:" is visible.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per hit, each followed by a blank line; the final strip()
    # removes the separator left after the last block.
    blocks = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    return template.format(question=query, context="".join(blocks)).strip()

def llm(prompt, model='phi3'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the ``content`` of one ``user`` message.
        model: Name of the model to request. Defaults to ``'phi3'``, the
            value that used to be hard-coded here.

    Returns:
        ``message.content`` of the first choice in the response, unmodified.

    Relies on the module-level ``client``, which must exist by the time this
    function is called.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [6]:
def rag(query):
    """Answer ``query`` by retrieving FAQ entries and prompting the model with them.

    Chains the three helpers in order: ``search`` finds the entries,
    ``build_prompt`` turns them plus the question into a prompt, and ``llm``
    produces the answer, which is returned as-is.
    """
    hits = search(query)
    return llm(build_prompt(query, hits))

In [7]:
from openai import OpenAI

# OpenAI-style client aimed at a server on this machine (port 11434, API root /v1/).
# NOTE(review): presumably a local Ollama instance, with 'ollama' acting as a
# placeholder key rather than a real secret — confirm.
client = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1/')

In [7]:
# Sanity check: send a trivial prompt straight to the model, bypassing rag().
llm(prompt='write that this is a test')

" This is a test message to illustrate the creation of a simple text. If there's anything specific you would like me to elaborate on or create, please let me know!"

In [13]:
# Ad-hoc experiment: ask the model directly (no FAQ retrieval) to summarise the
# intent behind several pasted variants of a SQL deduplication script.
# NOTE(review): the prompt text is sent as-is and contains a typo ("achiave")
# and a first SQL comment written with a single dash ("- Create a table ...").
llm(
"""
Explain the idea behind all below queries of SQL developer. Make a descriptive summary of what the SQL developer wanted to achiave:

- Create a table to store deduplicated keys (if not exists)
CREATE TABLE IF NOT EXISTS deduplication_keys (
    hash STRING
);

-- Iterate over each partition set (year, month, day, hour)
INSERT OVERWRITE TABLE destination_table PARTITION (year, month, day, hour)
SELECT 
    year,
    month,
    day,
    hour,
    account_number,
    external_id,
    amount,
    title,
    MD5(CONCAT(
        COALESCE(account_number, ''),
        COALESCE(external_id, ''),
        COALESCE(amount, ''),
        COALESCE(title, '')
    )) AS hash
FROM (
    SELECT 
        year,
        month,
        day,
        hour,
        account_number,
        external_id,
        amount,
        title
    FROM source_table
) temp
WHERE NOT EXISTS (
    SELECT 1
    FROM deduplication_keys dk
    WHERE dk.hash = MD5(CONCAT(
        COALESCE(temp.account_number, ''),
        COALESCE(temp.external_id, ''),
        COALESCE(temp.amount, ''),
        COALESCE(temp.title, '')
    ))
);

-- Insert deduplicated hash values into deduplication_keys table
INSERT INTO TABLE deduplication_keys
SELECT hash
FROM destination_table;



-- Create a table to store deduplicated keys (if not exists)
CREATE TABLE IF NOT EXISTS deduplication_keys (
    hash STRING
);

-- Iterate over each partition set (year, month, day, hour)
INSERT OVERWRITE TABLE destination_table PARTITION (year, month, day, hour)
SELECT 
    year,
    month,
    day,
    hour,
    account_number,
    external_id,
    amount,
    title,
    MD5(CONCAT(account_number, external_id, amount, title)) AS hash
FROM (
    SELECT 
        year,
        month,
        day,
        hour,
        account_number,
        external_id,
        amount,
        title
    FROM source_table
) temp
WHERE NOT EXISTS (
    SELECT 1
    FROM deduplication_keys dk
    WHERE dk.hash = MD5(CONCAT(temp.account_number, temp.external_id, temp.amount, temp.title))
);

-- Insert deduplicated hash values into deduplication_keys table
INSERT INTO TABLE deduplication_keys
SELECT dt.hash
FROM destination_table AS dt
LEFT JOIN deduplication_keys AS dk
ON dt.hash = dk.hash
WHERE dt.hash IS NULL;

;


########################
-- Create staging table
CREATE TABLE IF NOT EXISTS staging_destination_table (
    year INT,
    month INT,
    day INT,
    hour INT,
    account_number STRING,
    external_id STRING,
    amount DECIMAL,
    title STRING,
    hash STRING
);

-- Deduplicate and insert into staging table
INSERT OVERWRITE TABLE staging_destination_table
SELECT 
    year,
    month,
    day,
    hour,
    account_number,
    external_id,
    amount,
    title,
    MD5(CONCAT(
        COALESCE(account_number, ''),
        COALESCE(external_id, ''),
        COALESCE(amount, ''),
        COALESCE(title, '')
    )) AS hash
FROM (
    SELECT 
        year,
        month,
        day,
        hour,
        account_number,
        external_id,
        amount,
        title
    FROM source_table
) temp
WHERE NOT EXISTS (
    SELECT 1
    FROM deduplication_keys dk
    WHERE dk.hash = MD5(CONCAT(
        COALESCE(temp.account_number, ''),
        COALESCE(temp.external_id, ''),
        COALESCE(temp.amount, ''),
        COALESCE(temp.title, '')
    ))
);

-- Insert deduplicated data into final destination table
INSERT INTO TABLE destination_table
PARTITION (year, month, day, hour)
SELECT 
    year,
    month,
    day,
    hour,
    account_number,
    external_id,
    amount,
    title
FROM staging_destination_table;

-- Insert deduplicated hash values into deduplication_keys table
INSERT INTO TABLE deduplication_keys
SELECT DISTINCT hash
FROM staging_destination_table;

-- Insert deduplicated data into final destination table
INSERT INTO TABLE destination_table
PARTITION (year, month, day, hour)
SELECT DISTINCT
    year,
    month,
    day,
    hour,
    account_number,
    external_id,
    amount,
    title
FROM staging_destination_table;
"""
)

" The given SQL queries are designed to create a deduplication system that removes duplicate entries in a dataset based on certain key columns. Here's a descriptive summary of what the entire process achieves:\n\n1. **Create Deduplication Tables**: First, two tables (deduplication_keys and destination_table) are created to store deduplicated keys (`hash`) and data respectively. The `destination_table` will eventually hold cleaned dataset with no duplicates.\n\n2. **Iterate Over Partition Sets**: This system operates on a partitioned table (partitioning by year, month, day, hour). For each combination of these partitions, deduplication is performed and results are inserted into the destination_table using an `OVERWRITE` clause to replace existing data with de-duplicated values.\n\n3. Administer Partition Set Iteration: Each partition set's rows go through a selection process where they generate hashes by concatenating their non-null columns (account number, external ID, amount, title), 

In [9]:
# Open-ended question sent straight to the model; no FAQ retrieval is involved.
llm("\nWhat happened in Warsaw, Poland at 1945?\n")

" In 1945, Warsaw, Poland experienced a series of significant and tragic events. One of the most notable is the Warsaw Uprising (Akcja Wyzwolenia warszawskiego), which occurred from August 1 to October 2, 1944. This was a large act of resistance by the Polish Home Army against the occupying German forces during World War II. The aim was to liberate Warsaw before the arrival of the Soviet Red Army; however, it ended in failure as the uprising was crushed and much of the city was destroyed.\n\nAfter several months following this initial uprising, on January 17, 1945, Soviet forces entered Warsaw after fierce fighting with German units. Although the Soviets were allies of the Polish government in exile, their entrance into the city led to further tensions between them and the newly established pro-Soviet provisional government installed by the Red Army.\n\nLater that year on August 1, 1945, a new uprising known as the Ghetto Uprising (Gwarecki Powstanie) took place within Warsaw's Jewish 

In [18]:
# Ask the model to proofread a question, again calling it directly rather than via rag().
llm(
    'Check grammar of following text: '
    'What happened in Warsaw, Poland at 1945?'
)

' The given text is grammatically correct. However, for clarity and specificity, it could be slightly rephrased as:\n\n"What occurred in Warsaw, Poland, in the year 1945?"\n\nThis maintains the original meaning but adds a bit more detail on how events unfolded during that time period, assuming you\'re referring to World War II and its aftermath.'

In [13]:
# NOTE(review): `_` is IPython's "most recent output" variable, so what this
# prints depends on which cell happened to run last. It will not reproduce on a
# fresh Restart & Run All; prefer binding the llm(...) result to a name and
# printing that instead.
print(_)

 This statement serves as an example to verify the functionality of various systems, such as text processing software or programming functions. It's commonly used by developers during debugging sessions to ensure commands are working correctly without producing any unintended output.

Here's how you might include it in different contexts:

**1. Using it as a command line test in a script:**
If writing a shell script or using command-line tools, the statement can be inserted directly to demonstrate functionality. For instance, using `echo` on Unix-like systems:
```bash
#!/bin/bash
echo "This is a test"
echo "This is also a test for confirmation."
```

**2. Inserting it into a programming function as a placeholder or comment (in Python):**
As a comment in code to remind future developers that the block can be replaced with actual implementation:
```python
def process_text(input_string):
    # Test input: "This is a test"
    print("Testing...")
    # Replace this line with your processin