# Mini‑Capstone Solution Notebook — CSV → RAG & Instruction Datasets
**This notebook contains one possible reference solution.** It follows the capstone tasks step‑by‑step, producing the required artifacts.

In [1]:
import shutil
from pathlib import Path
import pandas as pd, numpy as np, regex as re, json, orjson, hashlib, itertools, math, random
from datetime import datetime, timezone
from tqdm import tqdm
from glob import glob

# Workspace layout: each artifact family gets its own folder under artifacts/.
BASE = Path('.')
ART = Path('artifacts')
for sub in ('jsonl', 'samples', 'stats', 'prompts', 'datasets', 'tokenizer'):
    (ART / sub).mkdir(parents=True, exist_ok=True)

# Stage the raw corpus inside data/ so every later cell reads from one place.
source = Path('ask_assistant_corpus.csv')
DATA = Path('data')
DATA.mkdir(parents=True, exist_ok=True)
shutil.copy2(source, DATA)

CSV_FILE = DATA / 'ask_assistant_corpus.csv'
CSV_FILE


PosixPath('data/ask_assistant_corpus.csv')

## Part A — Ingest & Hygiene

In [2]:

# Robust load: read every column as text so pandas does not coerce ids or dates.
TEXT_COLUMNS = ['id', 'category', 'topic', 'difficulty',
                'question', 'answer', 'source', 'created_at']
df = pd.read_csv(CSV_FILE, dtype=dict.fromkeys(TEXT_COLUMNS, str))
len(df), df.head(2)


(10000,
   id             category    topic difficulty                         source  \
 0  1  security_compliance  printer     medium           seed://developer/api   
 1  2  security_compliance      sso       easy  seed://product_faq/dashboards   
 
    created_at                                           question  \
 0  2025-04-03         FYI, How do I reset my AcmeCloud password?   
 1  2025-04-29  Sorry if this is basic—How do I cancel my subs...   
 
                                               answer  
 0  Yes. Visit Integrations and connect **Make**. ...  
 1  Enable 2FA in Settings → Security. Choose **em...  )

In [4]:
def normalize_text(s: str) -> str:
    """Normalize whitespace in a text cell while keeping punctuation intact.

    Carriage returns are removed, runs of tabs/spaces collapse to a single
    space, spaces around newlines are dropped, and the result is stripped.

    Missing values are mapped to the empty string. This covers ``None`` and
    the float NaN that pandas uses for empty CSV cells: NaN is truthy, so a
    plain ``s or ""`` would let it through and crash on ``.replace``.
    Other non-string values are converted with ``str()``.
    """
    if isinstance(s, float) and math.isnan(s):
        return ""
    s = s or ""
    if not isinstance(s, str):
        s = str(s)
    s = s.replace('\r','')
    s = re.sub(r"[\t ]+", " ", s)
    s = re.sub(r" *\n *", "\n", s)
    return s.strip()

# Normalized copies of the two free-text columns
df['question_norm'] = df['question'].map(normalize_text)
df['answer_norm'] = df['answer'].map(normalize_text)

# Governance gates: minimum lengths, measured in characters
MIN_Q = 5
MIN_A_CHARS = 20
question_ok = df['question_norm'].str.len() >= MIN_Q
answer_ok = df['answer_norm'].str.len() >= MIN_A_CHARS
mask = question_ok & answer_ok
df_clean = df.loc[mask].copy()

# Content signature: SHA-1 over the lower-cased normalized question + answer
df_clean['created_at_dt'] = pd.to_datetime(df_clean['created_at'], errors='coerce')
dedup_text = df_clean['question_norm'].str.lower() + ' || ' + df_clean['answer_norm'].str.lower()
sig = dedup_text.map(lambda s: hashlib.sha1(s.encode('utf-8')).hexdigest())
df_clean['content_key'] = sig

# Within each signature put the newest row first, then keep only that row
df_clean = (
    df_clean
    .sort_values(['content_key', 'created_at_dt'], ascending=[True, False])
    .drop_duplicates(subset=['content_key'], keep='first')
)
before, after = len(df), len(df_clean)
before, after


(10000, 8000)

In [5]:
# Reviewer sample: first five cleaned rows, one JSON object per line
samp_path = ART / 'samples' / 'capstone_csv_sample.jsonl'
with samp_path.open('w', encoding='utf-8') as f:
    f.writelines(
        orjson.dumps({
            'id': r['id'],
            'question': r['question_norm'],
            'answer': r['answer_norm'],
            'category': r['category'],
            'topic': r['topic'],
            'difficulty': r['difficulty'],
        }).decode() + '\n'
        for _, r in df_clean.head(5).iterrows()
    )
samp_path


PosixPath('artifacts/samples/capstone_csv_sample.jsonl')

## Part B — RAG Chunks with Provenance

In [6]:

def split_chunks(text: str, max_chars=1100, overlap=180):
    """Split text into overlapping fixed-width character windows.

    All whitespace runs are first collapsed to single spaces and the text is
    stripped. Windows are ``max_chars`` wide and each new window starts
    ``overlap`` characters before the previous one ended.

    Args:
        text: Input text; it is passed through ``str()`` before processing.
        max_chars: Maximum characters per chunk; must be positive.
        overlap: Characters shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_chars``.

    Returns:
        List of chunk strings; empty when the cleaned text is empty.

    Raises:
        ValueError: If ``max_chars``/``overlap`` would stop the window from
            advancing (which previously looped forever) or would skip text.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    if not 0 <= overlap < max_chars:
        raise ValueError("overlap must satisfy 0 <= overlap < max_chars")

    text = re.sub(r"\s+", " ", str(text)).strip()
    chunks = []
    i = 0
    while i < len(text):
        end = min(i + max_chars, len(text))
        chunks.append(text[i:end])
        if end == len(text):
            break
        # overlap < max_chars guarantees the window start strictly advances
        i = end - overlap
    return chunks

# Shard writer state: schema tag, byte budget per shard, current shard number,
# bytes written to the current shard, and the open handle for that shard.
schema = 'rag-chunk-v1'
max_shard_bytes = 50_000_000
idx, cur = 0, 0
first_shard = ART/'jsonl'/f"rag_chunks_from_csv_{idx:03d}.jsonl"
out = open(first_shard, 'w', encoding='utf-8')

def write_line(obj):
    """Append ``obj`` as one JSON line to the current shard, rolling over when full.

    Uses the module-level shard state: ``idx`` (shard number), ``cur`` (bytes
    written to the current shard) and ``out`` (the open shard handle). When
    the record would push the shard past ``max_shard_bytes``, the current
    file is closed and the next numbered shard is opened.

    A record is never rolled over out of an empty shard: a single record
    larger than the budget is written to the current shard instead of leaving
    an empty file behind.
    """
    global idx, cur, out
    payload = orjson.dumps(obj)      # orjson returns UTF-8 encoded bytes
    nbytes = len(payload) + 1        # +1 for the trailing newline
    if cur > 0 and cur + nbytes > max_shard_bytes:
        out.close()
        idx += 1
        cur = 0
        out = open(ART/'jsonl'/f"rag_chunks_from_csv_{idx:03d}.jsonl", 'w', encoding='utf-8')
    out.write(payload.decode() + '\n')
    cur += nbytes

# Emit one record per chunk of each cleaned answer. The try/finally closes
# the active shard handle even if a row fails part-way through, so the file
# is flushed and the handle does not leak.
try:
    for _, r in tqdm(df_clean.iterrows(), total=len(df_clean)):
        doc_id = r['id']
        text = r['answer_norm']
        chunks = split_chunks(text)
        for j, ch in enumerate(chunks):
            rec = {
                'doc_id': doc_id,
                # chunk ids are "<doc_id>-<zero-padded chunk index>"
                'chunk_id': f"{doc_id}-{j:04d}",
                'text': ch,
                # provenance carried alongside every chunk
                'metadata': {
                    'category': r['category'],
                    'topic': r['topic'],
                    'difficulty': r['difficulty'],
                    'source': r['source'],
                    'schema_version': schema
                }
            }
            write_line(rec)
finally:
    out.close()

files = sorted(glob(str(ART/'jsonl'/'rag_chunks_from_csv_*.jsonl')))
files[:3], len(files)


100%|██████████| 8000/8000 [00:00<00:00, 11457.93it/s]


(['artifacts/jsonl/rag_chunks_from_csv_000.jsonl'], 1)

In [7]:
# Validate & sample
import json, itertools
def validate_jsonl(path, min_chars=40):
    """Count total, bad and short-text records in a JSONL file.

    A line is *bad* when it is not valid JSON, is not a JSON object, has a
    non-string ``text``, lacks ``text`` or ``metadata``, or has text shorter
    than ``min_chars``. A JSON object whose text is missing or shorter than
    ``min_chars`` is also counted under ``short_text``.

    Each line adds at most one to ``bad``. Previously a non-object JSON value
    was counted twice: once by the shape check and again when the follow-up
    ``.get`` raised into the broad ``except``.

    Args:
        path: Path of the JSONL file to scan.
        min_chars: Minimum acceptable length of ``text`` (default 40).

    Returns:
        Dict with keys ``file``, ``total``, ``bad`` and ``short_text``.
    """
    total = bad = short = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            total += 1
            try:
                obj = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError subclass
                bad += 1
                continue
            if not isinstance(obj, dict):
                bad += 1
                continue
            text = obj.get('text', '')
            if not isinstance(text, str):
                bad += 1
                continue
            # A missing 'text' defaults to '' and is therefore short as well.
            is_short = len(text) < min_chars
            if is_short:
                short += 1
            if is_short or 'metadata' not in obj:
                bad += 1
    return {'file': path, 'total': total, 'bad': bad, 'short_text': short}

# One validation report per shard; preview the first two
val = list(map(validate_jsonl, files))
val[:2]


[{'file': 'artifacts/jsonl/rag_chunks_from_csv_000.jsonl',
  'total': 8000,
  'bad': 0,
  'short_text': 0}]

## Part C — Instruction‑Tuning JSONL (Trio & Prompt‑Completion)

In [8]:
# Output files for the two instruction-tuning formats
TRIO = ART/'jsonl'/'instruct_trio.jsonl'
PC   = ART/'jsonl'/'instruct_prompt_completion.jsonl'

# Prompt scaffold used by the prompt/completion format
TEMPLATE = "### Instruction:\n{q}\n\n### Response:\n"

kept = 0
with TRIO.open('w', encoding='utf-8') as ft, PC.open('w', encoding='utf-8') as fp:
    for _, r in df_clean.iterrows():
        instruction, output = r['question_norm'], r['answer_norm']
        # Only answers of at least 20 characters are written
        if len(output) >= 20:
            meta = {
                'id': r['id'], 'category': r['category'], 'topic': r['topic'],
                'difficulty': r['difficulty'], 'created_at': r['created_at'],
                'schema_version': 'trio-v1'
            }
            trio_rec = {'instruction': instruction, 'input': '', 'output': output, 'metadata': meta}
            pc_rec = {'prompt': TEMPLATE.format(q=instruction), 'completion': output, 'metadata': meta}
            ft.write(orjson.dumps(trio_rec).decode() + '\n')
            fp.write(orjson.dumps(pc_rec).decode() + '\n')
            kept += 1
kept


8000

In [9]:
# Length governance & stats
import json, numpy as np
def token_proxy(s):
    """Cheap token-count proxy: number of whitespace-separated words, at least 1."""
    n_words = len(s.split())
    return n_words if n_words > 1 else 1

# Collect per-record word counts from the prompt/completion file
stats = {'count_pc': 0, 'prompt_tokens': [], 'completion_tokens': []}
with open(PC, 'r', encoding='utf-8') as f:
    for o in map(json.loads, f):
        stats['count_pc'] += 1
        stats['prompt_tokens'].append(token_proxy(o['prompt']))
        stats['completion_tokens'].append(token_proxy(o['completion']))

import statistics as st

def _mean_words(counts):
    """Mean of a list of word counts as a float; 0.0 for an empty list."""
    return float(st.mean(counts)) if counts else 0.0

summary = {
    'pc_count': stats['count_pc'],
    'prompt_mean_words': _mean_words(stats['prompt_tokens']),
    'completion_mean_words': _mean_words(stats['completion_tokens']),
}
stats_path = ART/'stats'/'instruction_stats.json'
stats_path.write_text(json.dumps(summary, indent=2), encoding='utf-8')
summary


{'pc_count': 8000,
 'prompt_mean_words': 15.288125,
 'completion_mean_words': 16.8485}

## Part D — 🤗 Datasets + (Optional) Offline Tokenization

In [10]:
from datasets import load_dataset, DatasetDict

def _nonempty_pair(ex):
    """Keep only examples whose prompt and completion are both non-empty."""
    return len(ex['prompt'])>0 and len(ex['completion'])>0

seed = 42
ds = load_dataset('json', data_files=str(PC), split='train').filter(_nonempty_pair)

# 80/10/10: hold out 20%, then halve the held-out part into validation and test
train_test = ds.train_test_split(test_size=0.2, seed=seed)
val_test = train_test['test'].train_test_split(test_size=0.5, seed=seed)
dds = DatasetDict({
    name: part.shuffle(seed=seed)
    for name, part in (
        ('train', train_test['train']),
        ('validation', val_test['train']),
        ('test', val_test['test']),
    )
})
dds


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 8000 examples [00:00, 141154.04 examples/s]
Filter: 100%|██████████| 8000/8000 [00:00<00:00, 50060.77 examples/s]


DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'metadata'],
        num_rows: 6400
    })
    validation: Dataset({
        features: ['prompt', 'completion', 'metadata'],
        num_rows: 800
    })
    test: Dataset({
        features: ['prompt', 'completion', 'metadata'],
        num_rows: 800
    })
})

In [11]:
# Persist the three splits and echo where they were written
pc_splits_dir = str(ART/'datasets'/'pc_splits')
dds.save_to_disk(pc_splits_dir)
pc_splits_dir


Saving the dataset (1/1 shards): 100%|██████████| 6400/6400 [00:00<00:00, 89531.01 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 800/800 [00:00<00:00, 62564.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 800/800 [00:00<00:00, 54789.01 examples/s]


'artifacts/datasets/pc_splits'

In [12]:
# Optional tiny tokenizer (Byte‑Level BPE) and mapping
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import ByteLevel as ByteLevelProcessor
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

# Dump the training prompts to a text file for the tokenizer trainer
train_txt = ART/'tokenizer'/'train_prompts.txt'
with train_txt.open('w', encoding='utf-8') as f:
    for ex in dds['train']:
        f.write(ex['prompt'] + '\n')

tok = Tokenizer(BPE(unk_token='<unk>'))
tok.pre_tokenizer = ByteLevel()
# Seed the vocabulary with the full byte-level alphabet. The tokenizer is
# trained on prompts only but later encodes prompt + completion; without the
# initial alphabet, any byte symbol that never occurs in the prompts would be
# encoded as <unk>.
trainer = BpeTrainer(
    vocab_size=8000,
    min_frequency=2,
    special_tokens=['<unk>','<pad>','<bos>','<eos>'],
    initial_alphabet=ByteLevel.alphabet(),
)
tok.train([str(train_txt)], trainer)
tok.post_processor = ByteLevelProcessor()
tok.decoder = ByteLevelDecoder()
tok.save(str(ART/'tokenizer'/'bytebpe.json'))

# Map prompts+completions to ids (simple concat)
# Reload from disk so the encoding step uses exactly the saved artifact.
from tokenizers import Tokenizer as T2
tok2 = T2.from_file(str(ART/'tokenizer'/'bytebpe.json'))
pad_id = tok2.token_to_id('<pad>')
eos_id = tok2.token_to_id('<eos>') or 0  # falls back to id 0 if '<eos>' is not in the vocab
MAX_LEN = 512  # fixed sequence length produced by encode_batch

def encode_batch(batch):
    """Encode prompt+completion pairs into fixed-length ids and attention masks.

    Each pair is concatenated, tokenized with ``tok2``, truncated to
    ``MAX_LEN - 1`` ids, terminated with ``eos_id`` and right-padded with
    ``pad_id`` up to ``MAX_LEN``. The mask holds 1 for real tokens
    (including the end marker) and 0 for padding.

    Returns a dict with ``input_ids`` and ``attention_mask`` lists.
    """
    ids, attn = [], []
    for p, c in zip(batch['prompt'], batch['completion']):
        tokens = tok2.encode(p + c).ids
        x = tokens[:MAX_LEN-1] + [eos_id]
        n_missing = MAX_LEN - len(x)
        if n_missing > 0:
            x = x + [pad_id]*n_missing
        n_real = min(len(tokens)+1, MAX_LEN)
        m = [1]*n_real + [0]*max(0, MAX_LEN - n_real)
        ids.append(x)
        attn.append(m)
    return {'input_ids': ids, 'attention_mask': attn}

# Replace the text columns of every split with the encoded tensors
text_columns = dds['train'].column_names
batched = dds.map(
    encode_batch,
    batched=True,
    batch_size=256,
    remove_columns=text_columns,
)
batched







Map: 100%|██████████| 6400/6400 [00:01<00:00, 3705.79 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 3632.71 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 3570.27 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 6400
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 800
    })
})

## Part E — Few‑Shot Exemplar Bank & Prompt Packs

In [13]:
# Build exemplar bank from Trio: keep records whose output has at least 5 words
with open(ART/'jsonl'/'instruct_trio.jsonl','r',encoding='utf-8') as f:
    bank = [
        {'instruction': o['instruction'], 'input': o.get('input',''), 'output': o['output']}
        for o in map(json.loads, f)
        if len(o.get('output','').split()) >= 5
    ]
len(bank)


8000

In [14]:
# TF‑IDF selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import pandas as pd
# Index the exemplar bank: one TF-IDF row per "instruction input" string
bank_df = pd.DataFrame(bank)
indexed_text = bank_df['instruction'] + ' ' + bank_df['input']
corpus = indexed_text.tolist()
vec = TfidfVectorizer(min_df=2, max_df=0.9, ngram_range=(1,2))
X = vec.fit_transform(corpus)

def topk(query_instruction: str, k: int = 3):
    """Return the ``k`` bank exemplars most similar to ``query_instruction``.

    The query is projected with the fitted TF-IDF vectorizer ``vec`` and
    ranked by cosine similarity against the bank matrix ``X``. Each result
    is a dict with ``instruction``, ``input`` and ``output``.
    """
    query_vec = vec.transform([query_instruction])
    scores = cosine_similarity(query_vec, X)[0]
    ranked = np.argsort(-scores)  # negate so the highest similarity comes first
    best = ranked[:k]
    hits = bank_df.iloc[best]
    return hits[['instruction','input','output']].to_dict(orient='records')

HDR_TMPL = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
def build_fewshot(task_instruction: str, task_input: str = '', k: int = 3):
    """Assemble a few-shot prompt for a task from retrieved exemplars.

    The ``k`` exemplars returned by ``topk`` are rendered as solved examples
    (header + output + blank line), followed by the task's own header with
    the response left open.

    Returns a ``(prompt, shots)`` tuple, where ``shots`` is the exemplar list.
    """
    shots = topk(task_instruction, k=k)
    solved = [
        HDR_TMPL.format(instruction=ex['instruction'], input=ex.get('input','')) + ex['output'] + "\n\n"
        for ex in shots
    ]
    open_header = HDR_TMPL.format(instruction=task_instruction, input=task_input)
    return ''.join(solved) + open_header, shots

# Smoke test: build one prompt and preview its first 600 characters
demo_task = "Summarize SSO setup steps"
prompt, used = build_fewshot(demo_task, k=3)
prompt[:600]


'### Instruction:\nHow do I set up SSO with SSO (SAML)?\n\n### Input:\n\n\n### Response:\nYes. Visit Integrations and connect **Microsoft Teams**. You may need admin permissions in both systems.\n\n### Instruction:\nHow do I set up SSO with SSO (SAML)?\n\n### Input:\n\n\n### Response:\nOpen Reports → Export and choose **JSON**. Exports are emailed and available in your download history.\n\n### Instruction:\nHow do I set up SSO with SSO (SAML)?\n\n### Input:\n\n\n### Response:\nUse Import → Upload and select **JSON**. Map fields, run a preview, and start the import; invalid rows are reported.\n\n### Instruction:\nSummarize'

In [15]:
# Persist prompt pack: one few-shot prompt per task, with the exemplars used
pack = ART/'prompts'/'fewshot_prompt_pack.jsonl'
tasks = [
    {'instruction':'Summarize SSO setup steps','input':'', 'k':3},
    {'instruction':'Explain rate limits policy','input':'', 'k':4},
    {'instruction':'Create a troubleshooting checklist for SAML claims','input':'', 'k':3},
]
with pack.open('w', encoding='utf-8') as f:
    for t in tasks:
        p, used = build_fewshot(t['instruction'], t.get('input',''), k=t['k'])
        rec = {'prompt': p, 'metadata': {'k': t['k'], 'used': used}}
        f.write(orjson.dumps(rec).decode() + '\n')
str(pack)


'artifacts/prompts/fewshot_prompt_pack.jsonl'

## Wrap‑Up & Artifacts
- JSONL: `artifacts/jsonl/rag_chunks_from_csv_*.jsonl`, `artifacts/jsonl/instruct_trio.jsonl`, `artifacts/jsonl/instruct_prompt_completion.jsonl`
- Datasets: `artifacts/datasets/pc_splits/`
- Tokenizer: `artifacts/tokenizer/bytebpe.json`
- Prompts: `artifacts/prompts/fewshot_prompt_pack.jsonl`
- Samples & stats: `artifacts/samples/capstone_csv_sample.jsonl`, `artifacts/stats/instruction_stats.json`