In [95]:
import pandas as pd
import numpy as np
import re
from datasets import load_dataset, Dataset
from tqdm import tqdm
import json

In [96]:
ds = load_dataset("koen430/relevant_selected_stock_news")

## data chunking

In [97]:
train = ds["train"]
val= ds["val"]
test=ds["test"]

In [98]:
#Sentence Splitting & Chunking
def split_sentences(text: str):
    if not text:
        return []
    sents = re.split(r'(?<=[。！？.!?])\s*', text)
    return [s.strip() for s in sents if s.strip()]

def slice_into_chunks(text: str, n_sent: int = 4):
    sents = split_sentences(text)
    if not sents:
        return [text]
    chunks = []
    for i in range(0, len(sents), n_sent):
        chunks.append(" ".join(sents[i:i+n_sent]))
    return chunks

# Expand rows(with sliced text and original features)
def data_chunking(data: Dataset):
    new_rows = []
    col_order = [
    "ticker", "prompt", "text", "url", "result_1", "result_1_bin",
    "relevance", "token_count", "__index_level_0__", "slice_id"
    ]

    for row in tqdm(data):
        chunks = slice_into_chunks(row["text"], n_sent=4)
        for j, ch in enumerate(chunks):
            new_row = dict(row)        
            new_row["text"] = ch      
            new_row["slice_id"] = j    
            new_rows.append(new_row)

    # None check of features
    new_rows = [{c: r.get(c, None) for c in col_order} for r in new_rows]

    # constrcut new Dataset
    sliced_ds = Dataset.from_list(new_rows)
    return sliced_ds


In [99]:
train_1=data_chunking(train)
val_1=data_chunking(val)
test_1=data_chunking(test)

100%|██████████| 3600/3600 [00:00<00:00, 4998.33it/s]
100%|██████████| 200/200 [00:00<00:00, 3697.46it/s]
100%|██████████| 200/200 [00:00<00:00, 3980.17it/s]


## Transform Dataset to Json for STF

In [100]:
def coerce_text(x: str):
    """Convert any value to string safely."""
    if x is None:
        return ""
    if isinstance(x, str):
        return x.strip()
    try:
        if isinstance(x, float) and x.is_integer():
            return str(int(x))
        return str(x)
    except Exception:
        return ""

In [101]:
def build_json(row: dict):
    ticker = row.get("ticker", "UNKNOWN")
    text = row.get("text", "")
    result_1 = row.get("result_1", "uncertain")

    messages = [
        {"role": "system", "content": "You are an experienced investment manager."},
        {"role": "user", "content": f"Can you give me some advice on the stock {ticker}?"},
        {"role": "assistant", "content": "Sure. Could you please provide me with some specific information about this stock?"},
        {"role": "user", "content": text},
        {"role": "assistant", "content": f"This stock is expected to be {result_1} tomorrow."}
    ]

    return {"messages": messages}

In [102]:
def build_json(data: Dataset):
    n=len(data)
    with open(OUTPUT_JSONL, "w", encoding="utf-8") as fw:
        for row in tqdm(data, total=n, desc="Building ChatML JSONL", unit="row"):
            ticker = coerce_text(row.get("ticker", ""))
            text = coerce_text(row.get("text", ""))
            result_1 = coerce_text(row.get("result_1", ""))

            if not ticker:
                ticker = "unknown"
            if not result_1:
                result_1 = "uncertain"

            messages = build_messages(ticker, text, result_1)
            fw.write(json.dumps({"messages": messages}, ensure_ascii=False) + "\n")

In [103]:
OUTPUT_JSONL = "./stf/qwen_sft_train.jsonl" 
MAX_TEXT_CHARS = None 
build_json(train_1)

Building ChatML JSONL: 100%|██████████| 37166/37166 [00:05<00:00, 6469.36row/s]


In [104]:
OUTPUT_JSONL = "./stf/qwen_sft_val.jsonl" 
build_json(val_1)

Building ChatML JSONL: 100%|██████████| 2165/2165 [00:00<00:00, 6572.38row/s]


In [105]:
OUTPUT_JSONL = "./stf/qwen_sft_test.jsonl" 
build_json(test_1)

Building ChatML JSONL: 100%|██████████| 2004/2004 [00:00<00:00, 5849.92row/s]
