# Train part

## add-ons -> secrets ->activate HF_TOKEN

## Installation preparations

In [1]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")
import os
os.environ["PATH"] = "/root/.local/bin:" + os.environ["PATH"]
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" # magic downloads https://huggingface.co/docs/hub/models-downloading
os.environ["DSP_CACHEBOOL"] = "False" # for quick dev

In [2]:
%%capture here
!curl -LsSf https://astral.sh/uv/install.sh | sh

In [3]:
%%capture here
!uv pip install scikit-learn --system

In [4]:
%%capture here
!uv pip uninstall torch torchvision torchaudio --system
!uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --system

In [5]:
%%capture here
!uv pip install proto-plus==1.24.0.dev1 dataset --system
!uv pip install --upgrade datasets --system
!uv pip install vllm --system

In [6]:
%%capture here
!uv pip install "huggingface_hub[hf_transfer]" --system
!uv pip install dspy==2.5.34 datasets bitsandbytes triton hf_transfer --system # vllm

# Dataset prep

In [7]:
# !uv pip install pandas --system

In [8]:
import pandas as pd
import json
import re
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [92]:
df=pd.read_csv("/kaggle/input/train-csv/train.csv",encoding='UTF-8',index_col=False)

In [93]:
STOP_WORDS = {
    # Prepositions
     'під', 'над', 'зі', 'біля',
    'для', 'між', 'перед', 'після', 'без', 'крізь',
    # Conjunctions
    'але', 'проте', 'однак', 'чи', 'якщо', 'коли',
    'щоб', 'бо', 'тому що', 'оскільки',
    # Articles and particles
    'же', 'би', 'б', 'хіба', 'невже',
    #additional
    'що','...'
}
def decode_entities(json_str):
    return json.loads(json_str)


def clean_text(text):
    # Replace all newline characters with a single space
    cleaned_text = re.sub(r'\n\n', ' ', text)
    cleaned_text = re.sub(r'\n', ' ', cleaned_text)
    # Remove commas and em dashes, replacing them with a single space
    cleaned_text = re.sub(r'[,—]', ' ', cleaned_text)
    # Replace consecutive spaces with a single space, iteratively
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)
    return cleaned_text

In [94]:
# decoding of /u000 and saving as [{'MISC': 'КСВ'},...]
df["entities"] = (
    df["entities"]
    .apply(decode_entities)
    .apply(lambda entities: [{d["label"]: d["text"]} for d in entities])
)
df["text"] = df["text"].apply(clean_text)

In [95]:
def split_text_into_sentences(text):
    # Split text into sentences using regular expressions
    sentences = re.split(r'(?<=[.!?])\s*', text)  # Split by period, exclamation, or question mark
    return [s.strip() for s in sentences if s.strip()]  # Remove empty strings

def chunk_sentences(sentences, chunk_size, max_length):
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence)
        # Check if adding the current sentence exceeds the chunk size or max length
        if len(current_chunk) < chunk_size and (current_length + sentence_length) <= max_length:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            # Finalize the current chunk and start a new one
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = [sentence]
            current_length = sentence_length

    # Add the last chunk if it contains any sentences
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def process_dataframe(df, chunk_size, max_length):
    # Apply splitting and chunking to the specified column of the DataFrame
    df['split_text'] = df['text'].apply(
        lambda text: chunk_sentences(split_text_into_sentences(text), chunk_size, max_length)
    )
    return df

df = pd.DataFrame(df)
chunk_size = 6  # Number of sentences per chunk
max_length = 600  # Maximum character length per chunk

# Process the DataFrame
df = process_dataframe(df, chunk_size, max_length)

In [96]:
# Flatten the dataframe: each chunk of sentences becomes a new row, rest of columns are duplicated
expanded_df = df.explode('split_text', ignore_index=True)

In [97]:
expanded_df=expanded_df[['id','split_text','entities']]

In [15]:
# expanded_df.columns

In [16]:
# for i in expanded_df['split_text']:
#     print()
#     print(i)

In [17]:
# len(expanded_df)

In [100]:
# drop ['']
expanded_df = expanded_df[expanded_df['split_text'].apply(lambda x: x != [''])]

In [101]:
def filter_entities_based_on_text(split_text, entities):
    # Combine all split text pieces into one string and convert to lowercase
    combined_text = ' '.join(split_text).lower()
    
    # Initialize list for filtered entities
    filtered_entities = []
    
    # Create a dictionary to track occurrences of each entity
    entity_counts = {}
    
    # First pass: count occurrences of each unique entity in the text
    for entity in entities:
        for entity_type, value in entity.items():
            value_lower = value.lower()
            # Count actual occurrences in text
            count = combined_text.count(value_lower)
            if count > 0:
                entity_id = f"{entity_type}_{value_lower}"
                entity_counts[entity_id] = count
    
    # Second pass: add entities the correct number of times
    for entity in entities:
        for entity_type, value in entity.items():
            value_lower = value.lower()
            entity_id = f"{entity_type}_{value_lower}"
            
            # If we still have occurrences to account for
            if entity_id in entity_counts and entity_counts[entity_id] > 0:
                filtered_entities.append(entity)
                entity_counts[entity_id] -= 1
    
    return filtered_entities

# Apply the filter function to the DataFrame
expanded_df['filtered_entities'] = expanded_df.apply(
    lambda row: filter_entities_based_on_text(row['split_text'], row['entities']), 
    axis=1
)


In [102]:
# tokenize right here (just in case)
# Define a function to tokenize and flatten the list of sentences into a single list of tokens
def tokenize_text(text_list):
    # Regex to match words, handling special characters
    tokenized_list = [re.findall(r'\S+|\n', text) for text in text_list]
    # Flatten the list of lists into a single list
    return [token for sublist in tokenized_list for token in sublist]

# Apply this function to the 'split_text' column
expanded_df['tokenized_text'] = expanded_df['split_text'].apply(tokenize_text)


In [103]:
# Splitting the dataset
X_train, X_test = train_test_split(expanded_df, test_size=0.3, random_state=42)
X_train=X_train[['id','tokenized_text','filtered_entities']]
X_test=X_test[['id','tokenized_text','filtered_entities']]

# Model part

## Installation and  setup

In [22]:
os.environ["PATH"] = "/root/.local/bin:" + os.environ["PATH"]
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 

In [23]:
%%capture here
!uv pip install ollama --system

In [24]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")
import dspy
import pandas as pd
import ollama
os.environ["HF_TOKEN"] = secret_value_0
import subprocess
import time


In [26]:
%%capture here
!curl -fsSL https://ollama.com/install.sh | sh

In [42]:
%%capture here
!ollama pull gemma2:9b-instruct-fp16 # 8 хв чекаєм

[GIN] 2024/12/13 - 20:11:56 | 200 |      57.186µs |       127.0.0.1 | HEAD     "/"
[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest 
pulling 76c95736fd14...   0% ▕                ▏    0 B/ 18 GB                  [?25h

time=2024-12-13T20:11:57.119Z level=INFO source=download.go:175 msg="downloading 76c95736fd14 in 19 1 GB part(s)"


[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14...   0% ▕                ▏    0 B/ 18 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14...   0% ▕                ▏ 1.4 MB/ 18 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14...   0% ▕                ▏  48 MB/ 18 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14...   0% ▕                ▏  87 MB/ 18 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14...   1% ▕                ▏ 157 MB/ 18 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14...   1% ▕                ▏ 228 MB/ 18 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14...   1% ▕                ▏ 264 MB/ 18 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14...   2% ▕                ▏ 330 MB/ 18 GB

time=2024-12-13T20:13:00.384Z level=INFO source=download.go:175 msg="downloading 109037bec39c in 1 136 B part(s)"


[?25l[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         [?25h[?25l[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         [?25h[?25l[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         [?25h[?25l[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         [?25h[?25l[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                

time=2024-12-13T20:13:01.643Z level=INFO source=download.go:175 msg="downloading 097a36493f71 in 1 8.4 KB part(s)"


[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         
pulling 097a36493f71...   0% ▕                ▏    0 B/8.4 KB                  [?25h[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         
pulling 097a36493f71...   0% ▕                ▏    0 B/8.4 KB                  [?25h[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         
pulling 097a36493f71...   0% ▕                ▏    0 B/8.4 KB                  [?25h[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulli

time=2024-12-13T20:13:02.858Z level=INFO source=download.go:175 msg="downloading 2490e7468436 in 1 65 B part(s)"


[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         
pulling 097a36493f71... 100% ▕████████████████▏ 8.4 KB                         
pulling 2490e7468436...   0% ▕                ▏    0 B/  65 B                  [?25h[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         
pulling 097a36493f71... 100% ▕████████████████▏ 8.4 KB                         
pulling 2490e7468436... 100% ▕████████████████▏   65 B                         [?25h[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████

time=2024-12-13T20:13:04.050Z level=INFO source=download.go:175 msg="downloading 6c29221808fb in 1 486 B part(s)"


[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         
pulling 097a36493f71... 100% ▕████████████████▏ 8.4 KB                         
pulling 2490e7468436... 100% ▕████████████████▏   65 B                         
pulling 6c29221808fb...   0% ▕                ▏    0 B/ 486 B                  [?25h[?25l[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1G[A[2K[1Gpulling manifest 
pulling 76c95736fd14... 100% ▕████████████████▏  18 GB                         
pulling 109037bec39c... 100% ▕████████████████▏  136 B                         
pulling 097a36493f71... 100% ▕████████████████▏ 8.4 KB                         
pulling 2490e7468436... 100% ▕████████████████▏   65 B                         
pulling 6c29221808fb...   0% ▕                ▏    0 B/ 486 B                  [?25h[?25l[2K[1G[

In [None]:
time.sleep(60*8)

## Model predict part

In [43]:
# %%capture here
process = subprocess.Popen("ollama serve", shell=True) #runs on a different thread #add "--tensor_parallel_size", "2",


Error: listen tcp 127.0.0.1:11434: bind: address already in use


In [44]:
lm = dspy.LM('ollama_chat/gemma2:9b-instruct-fp16', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [105]:
# Convert DataFrame rows into dspy.Example objects
def create_examples(dataset):
    examples = [
        dspy.Example(
            # id = row["id"],
            tokens=row["tokenized_text"],
            expected_entities=row["filtered_entities"]
        ).with_inputs("tokenized_text")
        for _, row in dataset.iterrows()
    ]
    return examples

# Example usage with different datasets
train = create_examples(X_train[:100]) # 100 samples#############
test = create_examples(X_test[:33]) # 33 samples ################


In [116]:
class Expert_Linguist(dspy.Signature):
    """EXTRACT ENTITIES:
    LEAVE THE WORD ENDINGS OR DECLENSIONS OF ANY ENTITY AS IS!!!.
    JOB: roles/positions
    PERS: names
    ORG: organizations
    LOC: locations
    ART: artifacts
    DATE: dates
    TIME: times
    PERIOD: durations
    MONEY: amounts
    PCT: percentages
    QUANT: quantities
    MISC: other named
    DOC: documents
    - LEAVE THE WORD ENDINGS OR DECLENSIONS OF ANY ENTITY AS IS!!!. 
      The original form of the entity (with correct case, gender, and declension) should remain intact.
    - Value for "JOB" e.g. can be "фахівець із IT"
    - Value for "LOC" e.g. can be "Чернівців"
    - Value for "ART" e.g. can be "Різдвяна історія з Тіною Кароль"
    - Value for "ART" e.g. can be "олешківської міськрайонки"
    - Value for "PERS" e.g. can be "Іван Підгорський").
    - RETURN empty list IF THERE IS NO ENTITIES"""
    tokens=dspy.InputField()
    entities =dspy.OutputField(desc="List[Dict[str, str]]. like so [{'JOB':'entity entity entity'}, {'ORG':'entity entity'}, {'MONEY':'entity'}] LEAVE THE WORD ENDINGS OR DECLENSIONS OF ANY ENTITY AS IS!!!.")


In [117]:
just_extractor = dspy.ChainOfThought(Expert_Linguist)

In [53]:
df2=X_train[['id', 'tokenized_text', 'filtered_entities']]

In [120]:
%%capture
start = 1       # Початковий індекс
batch_size = 2# # Розмір слайсу 1385 

result=pd.DataFrame(
    [
        {"id":row["id"],
         "entities": just_extractor(tokens=dspy.Example(tokens=row["tokenized_text"])).entities
        } for _, row in df2.iloc[start:start+batch_size].iterrows()
    ])

[92m20:52:13 - LiteLLM:INFO[0m: utils.py:2749 - 
LiteLLM completion() model= gemma2:9b-instruct-fp16; provider = ollama_chat
time=2024-12-13T20:52:13.970Z level=INFO source=sched.go:730 msg="new model will fit in available VRAM, loading" model=/root/.ollama/models/blobs/sha256-76c95736fd1483b32c8ad704594349e92fa3ec947c8fea45942caa5bd28df08d library=cuda parallel=4 required="24.0 GiB"
time=2024-12-13T20:52:14.178Z level=INFO source=server.go:105 msg="system memory" total="31.4 GiB" free="29.4 GiB" free_swap="0 B"
time=2024-12-13T20:52:14.179Z level=INFO source=memory.go:356 msg="offload to cuda" layers.requested=-1 layers.model=43 layers.offload=43 layers.split=22,21 memory.available="[14.6 GiB 14.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="24.0 GiB" memory.required.partial="24.0 GiB" memory.required.kv="2.6 GiB" memory.required.allocations="[12.8 GiB 11.1 GiB]" memory.weights.total="18.1 GiB" memory.weights.repeating="16.4 GiB" memory.weights.nonrepeating="1.7 GiB" memory

[GIN] 2024/12/13 - 20:52:36 | 200 | 22.990841169s |       127.0.0.1 | POST     "/api/chat"
[GIN] 2024/12/13 - 20:52:36 | 404 |      111.42µs |       127.0.0.1 | POST     "/api/show"


[92m20:52:58 - LiteLLM:INFO[0m: utils.py:944 - Wrapper: Completed Call, calling success_handler


[GIN] 2024/12/13 - 20:52:58 | 200 | 21.829069719s |       127.0.0.1 | POST     "/api/chat"
[GIN] 2024/12/13 - 20:52:58 | 404 |     125.062µs |       127.0.0.1 | POST     "/api/show"


In [121]:
for i in result.entities:
    print()
    print(i)


[{'PERS': 'Ілія'}, {'ART': 'замок'}, {'ART': 'золото'}, {'ART': 'безсмертя'}]

[{'JOB': 'фахівець із IT'}, {'ORG': 'ДП «Шахтоуправління «Південнодонбаське №1»'}, {'ORG': 'ТОВ «Компанія Промторг»'}, {'ORG': '«Вісник державних закупівель»'}, {'ORG': "«Об'єднана компанія «Укрвуглереструктуризація»"}, {'ART': 'ШСС-1П'}, {'MONEY': '13 34 млн грн.'}, {'MONEY': '7408 грн.'}, {'MONEY': '6734 грн.'}, {'DATE': 'лютого'}, {'DATE': '20156 року'}, {'TIME': 'листопаді'}, {'TIME': 'минулого року'}]


# THE END

In [118]:
# def extraction_correctness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> bool:
#     """
#     Computes correctness of entity extraction predictions.
    
#     Args:
#         example (dspy.Example): The dataset example containing expected people entities.
#         prediction (dspy.Prediction): The prediction from the DSPy people extraction program.
#         trace: Optional trace object for debugging.
    
#     Returns:
#         bool: True if predictions match expectations, False otherwise.
#     """
#     return prediction.entities == example.expected_entities

# evaluate_correctness = dspy.Evaluate(
#     devset=test,
#     metric=extraction_correctness_metric,
#     num_threads= 48,# 24
#     display_progress=True,
#     display_table=True
# )

In [119]:
# evaluate_correctness(Expert_Linguist, devset=test)

2024/12/13 20:46:42 ERROR dspy.utils.parallelizer: Error processing item Example({'tokens': ['Цей', 'фактор', 'звичайно', 'важливий', 'проте', 'такий', 'вибір', 'не', 'завжди', 'може', 'бути', 'найвигіднішим', '.', 'Для', 'того', 'щоби', 'зробити', 'оптимальний', 'вибір', 'варто', 'звернути', 'увагу', 'на', 'такі', 'нюанси', '.', 'Щорічно', 'Пенсійний', 'Фонд', 'визначає', 'перелік', 'банків', 'які', 'можуть', 'надавати', 'послуги', 'з', 'виплати', 'пенсій', '.', 'Теоретично', 'Пенсійний', 'Фонд', 'вибирає', 'для', 'співпраці', 'надійні', 'банки', 'з', 'хорошими', 'фінансовими', 'показниками', '.'], 'expected_entities': [{'ORG': 'Пенсійний Фонд'}, {'ORG': 'Пенсійний Фонд'}]}) (input_keys={'tokenized_text'}): 2 validation errors for Expert_Linguist
tokens
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing
entities
  Field required [type=missing, input_value={}, input_type=dict]
    For further inf

  0%|          | 0/33 [00:00<?, ?it/s]

ValidationError: 2 validation errors for Expert_Linguist
tokens
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing
entities
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing

In [76]:
# mipro_optimizer = dspy.MIPROv2(
#     metric=extraction_correctness_metric,
    
# num_candidates= 2 #default
#     #auto="medium",
# )
# optimized_people_extractor = mipro_optimizer.compile(
#     Expert_Linguist,
#     trainset=train,
#     max_bootstrapped_demos=2,#4
#     num_trials=1,#default
#     requires_permission_to_run=False,
#     minibatch=True,
# )

ValueError: Minibatch size cannot exceed the size of the valset. Valset size: 8.

In [None]:
evaluate_correctness(Expert_Linguist, devset=X_test)

In [None]:
dspy.inspect_history(n=1)