# Train setup preprocess part

## File -> Add input -> add `test.csv`

## Add-ons -> Secrets -> activate `HF_TOKEN`

# LLM prediction part

## Installation preparations

In [1]:
# %%capture here
# Install the `uv` package manager with its installer script; the `uv pip install`
# cells below rely on it.
# NOTE(review): the same installer command is run again in a later cell, so one of
# the two runs looks redundant — confirm and remove.
!curl -LsSf https://astral.sh/uv/install.sh | sh

downloading uv 0.5.8 x86_64-unknown-linux-gnu
no checksums to verify
installing to /root/.local/bin
  uv
  uvx
everything's installed!

To add $HOME/.local/bin to your PATH, either restart your shell or run:

    source $HOME/.local/bin/env (sh, bash, zsh)
    source $HOME/.local/bin/env.fish (fish)


In [2]:
import json
import os
import re

from kaggle_secrets import UserSecretsClient

# Read the Hugging Face token from the Kaggle secret store (Add-ons -> Secrets).
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")

# uv was installed to /root/.local/bin, so put that directory first on PATH.
os.environ["PATH"] = "/root/.local/bin:" + os.environ["PATH"]

# Remaining environment switches, set in one go:
#   HF_HUB_ENABLE_HF_TRANSFER - faster Hub downloads,
#       see https://huggingface.co/docs/hub/models-downloading
#   DSP_CACHEBOOL             - set to "False" for quick development iterations
os.environ.update(
    {
        "HF_HUB_ENABLE_HF_TRANSFER": "1",
        "DSP_CACHEBOOL": "False",
    }
)

In [3]:
%%capture here
# Install `uv` via its installer script; %%capture keeps the cell output in `here`.
# NOTE(review): this repeats the install command from the first code cell — likely redundant.
!curl -LsSf https://astral.sh/uv/install.sh | sh

In [4]:
%%capture here
# Install scikit-learn with uv; --system targets the system interpreter rather than a virtualenv.
# NOTE(review): scikit-learn is not imported in any visible cell — confirm it is needed.
!uv pip install scikit-learn --system

In [5]:
%%capture here
# Remove any existing torch/torchvision/torchaudio, then reinstall them from the
# PyTorch cu121 wheel index.
!uv pip uninstall torch torchvision torchaudio --system
!uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --system

In [6]:
%%capture here
# Pin proto-plus, upgrade `datasets`, and install vLLM.
# NOTE(review): `dataset` (singular) on the first line is a different PyPI package from
# Hugging Face `datasets`, which is upgraded on the next line — confirm it is intentional.
# NOTE(review): vllm is installed here but not imported in the visible cells
# (inference below goes through Ollama) — confirm it is still required.
!uv pip install proto-plus==1.24.0.dev1 dataset --system
!uv pip install --upgrade datasets --system
!uv pip install vllm --system

In [7]:
%%capture here
# huggingface_hub with the hf_transfer extra (HF_HUB_ENABLE_HF_TRANSFER is set in an
# earlier cell), then DSPy pinned to 2.5.34 together with its helper packages.
!uv pip install "huggingface_hub[hf_transfer]" --system
!uv pip install dspy==2.5.34 datasets bitsandbytes triton hf_transfer --system # vllm

In [8]:
# %%capture here
# Python client package for Ollama (imported as `ollama` in the next code cell).
!uv pip install ollama --system

[2mUsing Python 3.10.14 environment at: /opt/conda[0m
[2K[2mResolved [1m13 packages[0m [2min 407ms[0m[0m                                        [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m     0 B/12.85 KiB                     [1A
[2K[2mPrepared [1m1 package[0m [2min 19ms[0m[0m                                                   [1A
[2K[2mInstalled [1m1 package[0m [2min 2ms[0m[0m                                  [0m
 [32m+[39m [1mollama[0m[2m==0.4.4[0m


In [9]:
import subprocess

import dspy
import ollama
import pandas as pd
from kaggle_secrets import UserSecretsClient

# Fetch the Hugging Face token from Kaggle secrets and publish it through the
# process environment. `os` is expected to be imported by an earlier cell.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")
os.environ["HF_TOKEN"] = secret_value_0

In [10]:
# %%capture here
# Install the Ollama runtime with its install script; the server itself is started
# later with `ollama serve`.
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%                                                             11.3%###########                                      50.5%########################################                        70.5%#####################################                        70.8%########################################                     73.7%######################################################          88.9%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> NVIDIA GPU installed.
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


# Dataset preparation part

In [11]:
import pandas as pd
import json
import re

In [12]:
# Load the test set attached as a Kaggle input; index_col=False tells pandas not to
# use the first CSV column as the index.
df = pd.read_csv(
    "/kaggle/input/test-csv/test.csv",
    encoding='UTF-8',
    index_col=False,
)

In [13]:
# Ukrainian function words, grouped by part of speech.
# NOTE(review): STOP_WORDS is not referenced by any other visible cell — confirm it is still used.
STOP_WORDS = {
    # --- prepositions ---
    'під', 'над', 'зі', 'біля', 'для',
    'між', 'перед', 'після', 'без', 'крізь',
    # --- conjunctions ---
    'але', 'проте', 'однак', 'чи', 'якщо',
    'коли', 'щоб', 'бо', 'тому що', 'оскільки',
    # --- particles ---
    'же', 'би', 'б', 'хіба', 'невже',
    # --- additional entries ---
    'що', '...',
}
def decode_entities(json_str):
    """Parse the JSON document in ``json_str`` and return the resulting Python object.

    Raises whatever ``json.loads`` raises for malformed input.
    """
    parsed = json.loads(json_str)
    return parsed


def clean_text(text):
    """Normalise separators in ``text``.

    The substitutions run in order, each replacing its matches with one space:
    double newlines, single newlines, commas and em dashes, and finally any run
    of two or more spaces. Tabs and other whitespace are left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        r'\n\n',    # paragraph breaks
        r'\n',      # remaining line breaks
        r'[,—]',    # commas and em dashes
        r' {2,}',   # collapse the runs of spaces produced above
    )
    cleaned_text = text
    for pattern in substitutions:
        cleaned_text = re.sub(pattern, ' ', cleaned_text)
    return cleaned_text

In [14]:
# Normalise every document in place (newlines, commas, em dashes, repeated spaces).
df["text"] = df["text"].map(clean_text)

In [15]:
def split_text_into_sentences(text):
    """Split ``text`` after every ``.``, ``!`` or ``?``.

    The terminator stays attached to its sentence. Whitespace after it is
    optional (the pattern uses ``\\s*``), so "a.b" is split as well. Pieces are
    stripped and empty ones are discarded.

    Returns:
        A list of non-empty, stripped sentence strings.
    """
    pieces = re.split(r'(?<=[.!?])\s*', text)
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]

def chunk_sentences(sentences, chunk_size, max_length):
    """Greedily group consecutive sentences into chunks.

    A sentence joins the current chunk only while the chunk holds fewer than
    ``chunk_size`` sentences and the summed sentence lengths stay at or below
    ``max_length``; separators are not counted. Otherwise the current chunk is
    closed and the sentence opens a new one. A single sentence longer than
    ``max_length`` therefore ends up alone in its own chunk.

    Args:
        sentences: Iterable of sentence strings.
        chunk_size: Maximum number of sentences per chunk.
        max_length: Maximum total character count per chunk.

    Returns:
        A list of chunks, each a non-empty list of sentences.
    """
    grouped = []
    bucket, bucket_chars = [], 0

    for piece in sentences:
        piece_chars = len(piece)
        has_room = len(bucket) < chunk_size and bucket_chars + piece_chars <= max_length
        if not has_room:
            # Close the running bucket (if it holds anything) and start afresh.
            if bucket:
                grouped.append(bucket)
            bucket, bucket_chars = [], 0
        bucket.append(piece)
        bucket_chars += piece_chars

    # Flush whatever is left after the last sentence.
    if bucket:
        grouped.append(bucket)

    return grouped

def process_dataframe(df, chunk_size, max_length):
    """Add a ``split_text`` column holding the sentence chunks of each ``text`` value.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``text`` column of strings.
        chunk_size: Maximum number of sentences per chunk.
        max_length: Maximum summed sentence length per chunk.

    Returns:
        The same DataFrame object, now with the ``split_text`` column.
    """
    def _to_chunks(text):
        # sentence split first, then greedy grouping
        return chunk_sentences(split_text_into_sentences(text), chunk_size, max_length)

    df['split_text'] = df['text'].apply(_to_chunks)
    return df

# NOTE(review): df already comes from pd.read_csv, so re-wrapping it looks like a no-op — confirm.
df = pd.DataFrame(df)
chunk_size = 6  # Maximum number of sentences per chunk
max_length = 600  # Maximum summed sentence length (characters) per chunk; separators are not counted

# Add the 'split_text' column (a list of sentence chunks per document)
df = process_dataframe(df, chunk_size, max_length)

In [16]:
# Flatten the dataframe: each chunk of sentences becomes its own row and the other
# columns are repeated; ignore_index=True renumbers the resulting rows from 0.
expanded_df = df.explode('split_text', ignore_index=True)

In [17]:
# Row counts: chunk-level frame first, then the document-level frame.
print(len(expanded_df), len(df), sep="\n")

1385
169


In [18]:
# for i in expanded_df['split_text']:
#     print(i)

In [19]:
# df.columns

In [20]:
# Keep only rows whose chunk is a real, non-empty list of sentences.
# DataFrame.explode turns an empty chunk list into NaN; NaN would pass a plain
# `x != ['']` test and then fail when the chunk is iterated during tokenization.
# The trailing .copy() makes the result an independent frame, so adding columns
# to it later does not raise SettingWithCopyWarning.
expanded_df = expanded_df[
    expanded_df['split_text'].apply(
        lambda x: isinstance(x, list) and bool(x) and x != ['']
    )
].copy()

In [21]:
# Tokenization helper for the chunk column.
def tokenize_text(text_list):
    """Tokenize every string in ``text_list`` and return one flat token list.

    A token is either a maximal run of non-whitespace characters or a single
    newline character. Tokens keep the order of the input strings.
    """
    return [
        token
        for text in text_list
        for token in re.findall(r'\S+|\n', text)
    ]

# Tokenize each chunk (a list of sentences) into a flat token list.
expanded_df['tokenized_text'] = expanded_df['split_text'].map(tokenize_text)


In [22]:
# Keep only the document id and the token list for each chunk.
expanded_df = expanded_df.loc[:, ['id', 'tokenized_text']]

In [23]:
# expanded_df.to_csv('test_preprocessed.csv',encoding='UTF-8',index=False)

In [24]:
# From here on `df` is the chunk-level frame (columns: id, tokenized_text); the
# document-level frame is no longer reachable under this name.
df=expanded_df

## Model predict part

In [25]:
# %%capture here
# Start the Ollama server as a background child process (a separate process, not a
# thread). The command is passed as an argument list without shell=True, so
# `process` refers to the server itself rather than to an intermediate shell and
# can be polled or terminated directly.
# Author's earlier TODO, kept for reference: add "--tensor_parallel_size", "2".
process = subprocess.Popen(["ollama", "serve"])

In [26]:
%%capture here
# Download the gemma2:9b-instruct-fp16 model via the Ollama CLI; it talks to the
# server started in the previous cell.
!ollama pull gemma2:9b-instruct-fp16

Couldn't find '/root/.ollama/id_ed25519'. Generating new private key.
Your new public key is: 

ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPWnyXneAvTfuqit3zdU0Chw1TRWf5yk79VxlwWsnMpR



2024/12/13 22:28:41 routes.go:1195: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/root/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
time=2024-12-13T22:28:41.932Z level=INF

[GIN] 2024/12/13 - 22:28:43 | 200 |      47.832µs |       127.0.0.1 | HEAD     "/"


time=2024-12-13T22:28:43.601Z level=INFO source=download.go:175 msg="downloading 76c95736fd14 in 19 1 GB part(s)"
time=2024-12-13T22:30:02.819Z level=INFO source=download.go:175 msg="downloading 109037bec39c in 1 136 B part(s)"
time=2024-12-13T22:30:04.238Z level=INFO source=download.go:175 msg="downloading 097a36493f71 in 1 8.4 KB part(s)"
time=2024-12-13T22:30:05.425Z level=INFO source=download.go:175 msg="downloading 2490e7468436 in 1 65 B part(s)"
time=2024-12-13T22:30:06.623Z level=INFO source=download.go:175 msg="downloading 6c29221808fb in 1 486 B part(s)"


[GIN] 2024/12/13 - 22:31:24 | 200 |         2m41s |       127.0.0.1 | POST     "/api/pull"


In [27]:
import time
import urllib.request


def wait_for_ollama(url="http://localhost:11434", timeout_s=60 * 7, poll_s=2.0):
    """Block until the Ollama HTTP endpoint answers, or until ``timeout_s`` elapses.

    This replaces an unconditional seven-minute sleep: the wait ends as soon as
    the server responds and is still bounded by the same seven minutes.

    Args:
        url: Base URL of the Ollama server.
        timeout_s: Maximum total time to wait, in seconds.
        poll_s: Pause between two connection attempts, in seconds.

    Returns:
        True if the server answered with HTTP 200 in time, False otherwise.
    """
    deadline = time.monotonic() + timeout_s
    while True:
        try:
            with urllib.request.urlopen(url, timeout=5) as response:
                if response.status == 200:
                    return True
        except OSError:
            # URLError/HTTPError are OSError subclasses: the server is not
            # accepting connections yet, so retry after a short pause.
            pass
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_s)


# NOTE(review): the original cell slept a fixed 7 minutes; this assumes that wait
# was only for server readiness — confirm nothing else depended on the delay.
if not wait_for_ollama():
    raise TimeoutError("Ollama server did not become ready within 7 minutes")

In [28]:
# Route DSPy calls to the local Ollama server and make this LM the global default.
lm = dspy.LM(
    'ollama_chat/gemma2:9b-instruct-fp16',
    api_base='http://localhost:11434',
    api_key='',
)
dspy.configure(lm=lm)

In [29]:
def create_examples(dataset):
    """Convert DataFrame rows into ``dspy.Example`` objects.

    Each row's ``tokenized_text`` value is stored under the ``tokens`` field, and
    ``tokens`` is declared as the example's input so that it matches the
    ``tokens`` input field of the extraction signature.

    Args:
        dataset: DataFrame with a ``tokenized_text`` column.

    Returns:
        A list with one ``dspy.Example`` per row, in row order.
    """
    return [
        # with_inputs must name a field that exists on the Example ("tokens"),
        # not the source DataFrame column ("tokenized_text").
        dspy.Example(tokens=tokens).with_inputs("tokens")
        for tokens in dataset["tokenized_text"]
    ]

# Build dspy.Example objects from the chunk-level frame.
# NOTE(review): `train` is not referenced by any later visible cell — confirm it is still needed.
train = create_examples(df)
# test = create_examples(df)


In [30]:
# train

In [31]:
# DSPy signature for named-entity extraction over a list of tokens.
# NOTE: per the DSPy documentation the class docstring of a Signature serves as the
# task instructions for the model, so the docstring below (and the `desc` string)
# is runtime prompt text — edit it only as a deliberate prompt change.
# NOTE(review): the prompt has a stray ")" after the PERS example — confirm whether it is intended.
class Expert_Linguist(dspy.Signature):
    """EXTRACT ENTITIES:
    LEAVE THE WORD ENDINGS OR DECLENSIONS OF ANY ENTITY AS IS!!!.
    JOB: roles/positions
    PERS: names
    ORG: organizations
    LOC: locations
    ART: artifacts
    DATE: dates
    TIME: times
    PERIOD: durations
    MONEY: amounts
    PCT: percentages
    QUANT: quantities
    MISC: other named
    DOC: documents
    - LEAVE THE WORD ENDINGS OR DECLENSIONS OF ANY ENTITY AS IS!!!. 
      The original form of the entity (with correct case, gender, and declension) should remain intact.
    - Value for "JOB" e.g. can be "фахівець із IT"
    - Value for "LOC" e.g. can be "Чернівців"
    - Value for "ART" e.g. can be "Різдвяна історія з Тіною Кароль"
    - Value for "ART" e.g. can be "олешківської міськрайонки"
    - Value for "PERS" e.g. can be "Іван Підгорський").
    - RETURN empty list IF THERE IS NO ENTITIES"""
    # Input: the token list of one text chunk.
    tokens:list=dspy.InputField()
    # Output: the extracted entities; the expected layout is described in `desc`.
    entities =dspy.OutputField(desc="List[Dict[str, str]]. like so [{'JOB':'entity entity entity'}, {'ORG':'entity entity'}, {'MONEY':'entity'}] LEAVE THE WORD ENDINGS OR DECLENSIONS OF ANY ENTITY AS IS!!!.")


In [32]:
# len(train)

In [33]:
# Wrap the signature in a ChainOfThought module; the inference cell calls
# `just_extractor(tokens=...)` and reads the `.entities` field of the result.
just_extractor = dspy.ChainOfThought(Expert_Linguist)

In [34]:
%%capture here
start = 0       # Початковий індекс
batch_size = 3 # # Розмір слайсу 1385 

result=pd.DataFrame(
    [
        {"id":row["id"],
         "entities": just_extractor(tokens=dspy.Example(tokens=row["tokenized_text"])).entities
        } for _, row in df.iloc[start:start+batch_size].iterrows()
    ])

[92m22:38:25 - LiteLLM:INFO[0m: utils.py:2749 - 
LiteLLM completion() model= gemma2:9b-instruct-fp16; provider = ollama_chat
time=2024-12-13T22:38:25.822Z level=INFO source=sched.go:730 msg="new model will fit in available VRAM, loading" model=/root/.ollama/models/blobs/sha256-76c95736fd1483b32c8ad704594349e92fa3ec947c8fea45942caa5bd28df08d library=cuda parallel=4 required="24.0 GiB"
time=2024-12-13T22:38:26.055Z level=INFO source=server.go:105 msg="system memory" total="31.4 GiB" free="29.7 GiB" free_swap="0 B"
time=2024-12-13T22:38:26.056Z level=INFO source=memory.go:356 msg="offload to cuda" layers.requested=-1 layers.model=43 layers.offload=43 layers.split=22,21 memory.available="[14.6 GiB 14.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="24.0 GiB" memory.required.partial="24.0 GiB" memory.required.kv="2.6 GiB" memory.required.allocations="[12.8 GiB 11.1 GiB]" memory.weights.total="18.1 GiB" memory.weights.repeating="16.4 GiB" memory.weights.nonrepeating="1.7 GiB" memory

[GIN] 2024/12/13 - 22:39:50 | 200 |         1m25s |       127.0.0.1 | POST     "/api/chat"
[GIN] 2024/12/13 - 22:39:50 | 404 |    8.792349ms |       127.0.0.1 | POST     "/api/show"


[92m22:40:12 - LiteLLM:INFO[0m: utils.py:944 - Wrapper: Completed Call, calling success_handler
[92m22:40:12 - LiteLLM:INFO[0m: utils.py:2749 - 
LiteLLM completion() model= gemma2:9b-instruct-fp16; provider = ollama_chat


[GIN] 2024/12/13 - 22:40:12 | 200 | 21.588550618s |       127.0.0.1 | POST     "/api/chat"
[GIN] 2024/12/13 - 22:40:12 | 404 |     123.722µs |       127.0.0.1 | POST     "/api/show"


[92m22:40:22 - LiteLLM:INFO[0m: utils.py:944 - Wrapper: Completed Call, calling success_handler


[GIN] 2024/12/13 - 22:40:22 | 200 | 10.077518694s |       127.0.0.1 | POST     "/api/chat"
[GIN] 2024/12/13 - 22:40:22 | 404 |     154.641µs |       127.0.0.1 | POST     "/api/show"


In [35]:
# Show the extracted entities of every processed chunk, each preceded by a blank line.
for extracted in result['entities']:
    print(f"\n{extracted}")


[{'PERS': 'Тарас Шевченко'}, {'ORG': 'департамент культури і туризму Кіровоградської ОДА'}, {'LOC': 'Кіровоградщини'}, {'LOC': 'Черкащини'}, {'LOC': 'Моринці'}, {'LOC': 'Черкаської області'}]

[{'PERS': 'Тарас Шевченко'}, {'PERS': 'Олександр Кониський'}, {'PERS': 'Катерина Бойко'}]

[{'PERS': 'Тарас Шевченко'}, {'LOC': 'Кирилівка'}, {'LOC': 'Шевченкове'}, {'LOC': 'Звенигородського району'}, {'ORG': 'літературно-меморіальний музей'}]


In [36]:
# # Шлях до існуючого CSV
# file_path = 'combined.csv'

# # Додавання даних у файл
# result.to_csv(file_path, mode='a', header=False, index=False)

# print("Рядки успішно додано!")


## Postprocessing of test dataset

In [37]:
# Load the accumulated model predictions from an attached Kaggle dataset.
# The cells below read the columns 'id' and 'filtered_entities' from it.
# NOTE(review): no visible cell writes this file (the append cell above is commented
# out) — confirm how combined.csv is produced.
df = pd.read_csv(
    '../input/combined-csv/combined.csv',
    encoding='UTF-8',
    index_col=False,
)

In [38]:
# Group by 'id' and concatenate the per-chunk 'filtered_entities' strings.
# Missing values are skipped and everything is coerced to str so that one empty
# prediction cannot make ', '.join raise.
df = df.groupby('id', as_index=False).agg({
    'filtered_entities': lambda x: ', '.join(x.dropna().astype(str))
})

# Turn "[a], [b]" into "a, b": strip the outermost brackets, then merge the
# "], [" seams left by the join above. The join separator is ', ', so the seam
# is '], [' — a bare '][' never occurs in the joined string.
df['filtered_entities'] = df['filtered_entities'].apply(
    lambda x: re.sub(r'^\[|\]$', '', x).replace('], [', ', ')
)

In [39]:
def convert_to_label_text_format(entity_str):
    """Rewrite ``{'LABEL': 'text'}`` fragments as label/text dictionaries.

    Every fragment of the form ``{'KEY': 'value'}`` found in ``entity_str``
    becomes ``{'label': KEY, 'text': value}``. KEY may contain ASCII letters,
    digits and underscores. The value must be single-quoted and must not contain
    a single quote itself, so fragments that are double-quoted or contain an
    apostrophe are not matched and are left out of the result.

    Returns:
        ``str()`` of the resulting list of dictionaries, in order of appearance.
    """
    pattern = r"\{\'([A-Za-z0-9_]+)\':\s*\'([^\']+)\'\}"

    converted_entities = []
    for match in re.finditer(pattern, entity_str):
        label, text = match.groups()
        converted_entities.append({'label': label, 'text': text})

    return str(converted_entities)

# Convert every merged entity string to the label/text representation.
df['filtered_entities'] = df['filtered_entities'].map(convert_to_label_text_format)

In [40]:
def convert_to_json(entity_str):
    """Convert the string form of a list of dictionaries to an ASCII-only JSON string.

    The input is first parsed as a Python literal, which copes with values that
    contain double quotes or apostrophes. The previous approach replaced every
    single quote with a double quote, which corrupted such values and made
    ``json.loads`` fail. That approach is kept as a fallback for inputs that are
    not valid Python literals (for example JSON-style ``true``/``null``).

    Args:
        entity_str: Text such as ``"[{'label': 'ORG', 'text': '...'}]"``.

    Returns:
        A JSON string with non-ASCII characters escaped.

    Raises:
        json.JSONDecodeError: If neither parsing strategy accepts the input.
    """
    import ast  # local import: only this helper needs it

    try:
        entities = ast.literal_eval(entity_str)
    except (ValueError, SyntaxError):
        # Legacy path: treat the text as JSON written with single quotes.
        entities = json.loads(entity_str.replace("'", "\""))
    return json.dumps(entities, ensure_ascii=True)  # escape non-ASCII characters

# Serialise every entity list to an ASCII-escaped JSON string.
df['filtered_entities'] = df['filtered_entities'].map(convert_to_json)

In [41]:
# Rename the column to 'entities' and write the frame to json_out.csv without the index column.
df = df.rename(columns={'filtered_entities': 'entities'})
df.to_csv('json_out.csv', index=False)

In [42]:
# Read the CSV file written by the previous cell
df = pd.read_csv('json_out.csv')

# Write it in JSON Lines format (one record per line)
df.to_json('JSONL.jsonl', orient='records', lines=True)


# THE END