In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a Spark session in local mode for this notebook.
spark = (
    SparkSession.builder
    .appName("VSCodeTest")
    .master("local[*]")
    .getOrCreate()
)

# Smoke check: confirm the session exists and report the Spark version it runs.
print("✅ Spark session running in VS Code!")
print("Spark version:", spark.version)

✅ Spark session running in VS Code!
Spark version: 4.0.0


In [2]:
import time
import json
from typing import Optional
from openai import OpenAI
from IPython.display import Markdown, display


# OpenAI SDK client aimed at the local Ollama server instead of the hosted API.
client = OpenAI(
    api_key="ollama",  # placeholder only; the SDK requires a key to be set
    base_url="http://localhost:11434/v1",
)

In [3]:
# --- Setup ---
# Module-level state read and updated by ask_local_gpt:
#   chat_memory      - the running list of chat messages sent to the model
#   conversation_log - one record per answered prompt (timestamp, prompt, thoughts, answer, raw)
chat_memory, conversation_log = [], []

def ask_local_gpt(
    prompt: str,
    model: str = "gpt-oss:20b",
    system_message: Optional[str] = None,
    render_markdown: bool = True,
    verbose: bool = False,
    return_raw: bool = False,
    reset_chat: bool = False,
    show_history: bool = False,
    stream_response: bool = True,
    reasoning_mode: bool = False,
    json_mode: bool = False,
    structured_return: bool = False
) -> Optional[str]:
    """
    Send a prompt to the chat model behind the module-level `client` and return/display the answer.

    The function keeps conversational state in the module-level `chat_memory` list and appends
    one record per answered prompt to `conversation_log`.

    Args:
        prompt (str): The user's input/question.
        model (str): Model name passed to the chat-completions call.
        system_message (str | None): System prompt used when `chat_memory` has no system message
            yet. Ignored in `json_mode`, which always installs its own system prompt.
        render_markdown (bool): Display the final answer as Markdown (checked after
            `structured_return` and `return_raw`).
        verbose (bool): Print the clean answer (when not rendering Markdown) and the response time.
        return_raw (bool): Return the unparsed model reply.
        reset_chat (bool): Clear `chat_memory` before building the prompt.
        show_history (bool): Print every message in `chat_memory` after answering.
        stream_response (bool): Stream tokens to stdout as they arrive instead of waiting for
            the full reply.
        reasoning_mode (bool): Wrap the prompt in a "Scratchpad / Final Answer" template and use a
            step-by-step system prompt. Has no effect when `json_mode` is True.
        json_mode (bool): Replace `chat_memory` with a single system prompt demanding a JSON object
            with 'thoughts' and 'answer' keys, then parse the reply as JSON. Previous history is
            discarded on every such call.
        structured_return (bool): Return a dict with 'thoughts', 'answer' and 'raw' keys. Takes
            precedence over every other output flag.

    Returns:
        * dict when `structured_return` is True (despite the `Optional[str]` annotation, which is
          kept for interface compatibility). On failure the dict has the same three keys with
          answer "Error occurred." plus an 'error' key holding the exception text, so callers can
          index the result without type checks.
        * the raw reply (str) when `return_raw` is True.
        * None when the answer was rendered as Markdown and `verbose` is False.
        * the clean final answer (str) otherwise.
        * the string "Error occurred." if anything fails and `structured_return` is False.
    """
    global chat_memory, conversation_log

    # The user turn that has been queued but not answered yet; lets the error path undo it.
    pending_user_msg = None

    try:
        if reset_chat:
            chat_memory = []

        # -- Prompt setup --
        if json_mode:
            sys_prompt = (
                "You are a helpful assistant. "
                "For every question, reply ONLY in this exact JSON format: "
                "{\"thoughts\": \"<step-by-step reasoning>\", \"answer\": \"<final answer only>\"} "
                "Do not include any extra commentary, code fences, or markdown—just the JSON."
            )
            # JSON mode is stateless: every call starts from this system prompt alone.
            chat_memory = [{"role": "system", "content": sys_prompt}]
            full_prompt = prompt
        else:
            default_reasoning_prompt = (
                "You are a helpful assistant that always reasons step by step before giving an answer. "
                "First, think through the problem, then provide a clear final answer."
            )
            # Only seed a system message once; an existing one (including one left behind by a
            # previous json_mode call) is kept as-is unless reset_chat is used.
            if not any(m.get("role") == "system" for m in chat_memory):
                chat_memory.insert(0, {
                    "role": "system",
                    "content": system_message or (default_reasoning_prompt if reasoning_mode else "")
                })
            if reasoning_mode:
                full_prompt = (
                    f"### Scratchpad:\n"
                    f"The user asked: \"{prompt.strip()}\"\n"
                    f"Think step-by-step and reason before answering.\n\n"
                    f"### Final Answer:\n"
                )
            else:
                full_prompt = prompt

        pending_user_msg = {"role": "user", "content": full_prompt}
        chat_memory.append(pending_user_msg)
        start = time.time()

        # --- Get response (streamed or not) ---
        if stream_response:
            print("🤔 Thinking...\n")
            stream = client.chat.completions.create(
                model=model,
                messages=chat_memory,
                stream=True
            )
            tokens = []
            for chunk in stream:
                # Some chunks carry no choices at all; there is nothing to print for them.
                if not chunk.choices:
                    continue
                delta = chunk.choices[0].delta.content or ""
                print(delta, end="", flush=True)
                tokens.append(delta)
            print()
            assistant_reply = "".join(tokens)
        else:
            response = client.chat.completions.create(
                model=model,
                messages=chat_memory
            )
            # content may be None; treat it like an empty reply, as the streaming path does.
            assistant_reply = response.choices[0].message.content or ""

        chat_memory.append({"role": "assistant", "content": assistant_reply})
        pending_user_msg = None  # the turn is complete, nothing to roll back any more

        # --- Parse output ---
        thoughts, final_answer = "", assistant_reply.strip()
        if json_mode:
            try:
                thoughts, final_answer = _parse_json_reply(assistant_reply)
            except Exception as e:
                # Best effort: fall back to the whole reply as the answer.
                print("❌ Failed to parse model JSON output:", e)
                thoughts = ""
                final_answer = assistant_reply.strip()
        else:
            thoughts, final_answer = _parse_scratchpad_reply(assistant_reply)

        # --- Log results ---
        conversation_log.append({
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "prompt": prompt,
            "thoughts": thoughts,
            "answer": final_answer,
            "raw": assistant_reply
        })

        # --- Output control ---
        if structured_return:
            return {
                "thoughts": thoughts,
                "answer": final_answer,
                "raw": assistant_reply
            }
        elif return_raw:
            return assistant_reply
        elif render_markdown:
            display(Markdown(final_answer))
        elif verbose:
            print("\n🧼 Clean answer:\n", final_answer)
        else:
            return final_answer

        if verbose:
            print(f"\n✅ Response time: {round(time.time() - start, 2)}s")

        if show_history:
            print("\n📜 Message History:")
            for msg in chat_memory:
                print(f"{msg['role'].upper()}: {msg['content']}\n")

        return final_answer if verbose else None

    except Exception as e:
        print("❌ Error in ask_local_gpt:", str(e))
        # Undo the unanswered user turn so the next call does not send two user messages in a row.
        if pending_user_msg is not None and chat_memory and chat_memory[-1] is pending_user_msg:
            chat_memory.pop()
        if structured_return:
            # Keep the promised dict shape so callers can index the result without type checks.
            return {"thoughts": "", "answer": "Error occurred.", "raw": "", "error": str(e)}
        return "Error occurred."


def _as_text(value) -> str:
    """Turn a decoded JSON field into a stripped string ("" for a missing or null field)."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value.strip()
    # Numbers, lists, nested objects: keep the information instead of failing the whole parse.
    return json.dumps(value, ensure_ascii=False)


def _parse_json_reply(reply: str):
    """Extract (thoughts, answer) from the outermost {...} span of a JSON-mode reply.

    Raises ValueError when the reply contains no brace-delimited span and
    json.JSONDecodeError when that span is not valid JSON.
    """
    start_idx = reply.find("{")
    end_idx = reply.rfind("}")
    # Compare the raw find/rfind results: adding 1 first would hide the -1 "not found" sentinel.
    if start_idx == -1 or end_idx < start_idx:
        raise ValueError("No JSON object detected in model reply.")
    data = json.loads(reply[start_idx:end_idx + 1])
    return _as_text(data.get("thoughts")), _as_text(data.get("answer"))


def _parse_scratchpad_reply(reply: str):
    """Extract (thoughts, answer) from a '### Scratchpad: / ### Final Answer:' styled reply.

    Sections that are absent fall back to "" for thoughts and the stripped reply for the answer.
    """
    import re

    thoughts, final_answer = "", reply.strip()
    scratchpad_match = re.search(
        r"### Scratchpad:\s*(.*?)### Final Answer:",
        reply,
        re.DOTALL | re.IGNORECASE
    )
    # The answer capture stops at the first blank line or the next '#' heading.
    final_answer_match = re.search(
        r"### Final Answer:\s*([\s\S]*?)(?:$|\n#|\n\n)",
        reply,
        re.DOTALL | re.IGNORECASE
    )
    if scratchpad_match:
        thoughts = scratchpad_match.group(1).strip()
    if final_answer_match:
        final_answer = final_answer_match.group(1).strip()
    return thoughts, final_answer


In [4]:
# Single-question demo: JSON mode with streaming; the parsed answer is rendered as Markdown.
ask_local_gpt(
    prompt="what are the top 10 dogs for new couples. Number them from 1 to 10 and explain why each is a good choice.",
    json_mode=True,
    reasoning_mode=True,
    stream_response=True,
)

🤔 Thinking...

{"thoughts":"User requested a list of top 10 dog breeds suitable for new couples, numbered 1 through 10, with brief explanations. Provide answer in plain text list within JSON \"answer\" field, no markdown or extraneous text.","answer":"1. Labrador Retriever – Friendly, trainable, and great with newcomers. 2. Cavalier King Charles Spaniel – Small, affectionate and low energy. 3. Golden Retriever – Gentle, loyal, and excellent for active couples. 4. Australian Shepherd – Intelligent, energetic, and pair well with couples who enjoy hiking. 5. French Bulldog – Low maintenance, short‑lived, and great indoor companions. 6. Bichon Frise – Hypoallergenic, playful, and easy to groom. 7. Shih Tzu – Compact, affectionate, and suited to apartment life. 8. Border Collie – Highly intelligent, ideal for couples who can give plenty of exercise. 9. Collie – Calm, friendly, and adaptable to various living environments. 10. Boxer – Playful, protective, and great for families starting out.

1. Labrador Retriever – Friendly, trainable, and great with newcomers. 2. Cavalier King Charles Spaniel – Small, affectionate and low energy. 3. Golden Retriever – Gentle, loyal, and excellent for active couples. 4. Australian Shepherd – Intelligent, energetic, and pair well with couples who enjoy hiking. 5. French Bulldog – Low maintenance, short‑lived, and great indoor companions. 6. Bichon Frise – Hypoallergenic, playful, and easy to groom. 7. Shih Tzu – Compact, affectionate, and suited to apartment life. 8. Border Collie – Highly intelligent, ideal for couples who can give plenty of exercise. 9. Collie – Calm, friendly, and adaptable to various living environments. 10. Boxer – Playful, protective, and great for families starting out.

In [5]:
import pandas as pd

## Batch Questions in structured format

In [6]:
# Ask every question in JSON mode and collect one record per question.
results = []
questions = [
    "what are the top 10 dogs for new couples",
    "what are the top 5 cats for families",
    "what are the best 3 fish for beginners and what are their care requirements",
    "what are the top 5 birds for small apartments",
    # ... more prompts
]
for q in questions:
    res = ask_local_gpt(q, stream_response=True, reasoning_mode=True, json_mode=True, structured_return=True)
    if not isinstance(res, dict):
        # ask_local_gpt can hand back a plain error string; keep the row shape stable
        # instead of crashing on item assignment and losing the results gathered so far.
        res = {"thoughts": "", "answer": "" if res is None else str(res), "raw": ""}
    # Build a new record rather than mutating the dict returned by the helper.
    results.append({**res, "prompt": q})

df = pd.DataFrame(results)

🤔 Thinking...

{"thoughts":"Identify qualities important for couples (friendly, low-maintenance, good bond, adaptable). Select breeds known for those traits. Prioritize medium to small with easy grooming, low aggression, good with partners. List top 10 accordingly.","answer":"1. Labrador Retriever\n2. Golden Retriever\n3. Cavalier King Charles Spaniel\n4. French Bulldog\n5. Poodle (Miniature)\n6. Bichon Frise\n7. Boston Terrier\n8. Shih Tzu\n9. Beagle\n10. Cocker Spaniel"}
🤔 Thinking...

{"thoughts":"Identify breeds known for family-friendly traits: sociability, low aggression, good with kids, adaptable. Common recommendations are Ragdoll, Maine Coon, British Shorthair, Burmese, and Siamese. Format the answer as a numbered list for clarity.", "answer":"1. Ragdoll – calm, affectionate, loves cuddles.\n2. Maine Coon – gentle, friendly, playful.\n3. British Shorthair – easygoing, tolerant, low maintenance.\n4. Burmese – social, energetic, loves family.\n5. Siamese – vocal, interactive, go

In [7]:
# Show the batch results: one row per question with thoughts, answer, raw and prompt columns.
df

Unnamed: 0,thoughts,answer,raw,prompt
0,Identify qualities important for couples (frie...,1. Labrador Retriever\n2. Golden Retriever\n3....,"{""thoughts"":""Identify qualities important for ...",what are the top 10 dogs for new couples
1,Identify breeds known for family-friendly trai...,"1. Ragdoll – calm, affectionate, loves cuddles...","{""thoughts"":""Identify breeds known for family-...",what are the top 5 cats for families
2,Selecting beginner fish requires considering t...,"1. Betta fish – 5‑glb tank, 28–30°C (78–84°F),...","{""thoughts"":""Selecting beginner fish requires ...",what are the best 3 fish for beginners and wha...
3,I need to output a list of 5 birds suitable fo...,"1. Budgie (Budgerigar) – Small, friendly, and ...","{""thoughts"":""I need to output a list of 5 bird...",what are the top 5 birds for small apartments


## Showing the difference between the chat log and the structured format

In [8]:

# Turn every record accumulated in conversation_log into a DataFrame and export it as CSV.
df_log = pd.DataFrame(data=conversation_log)
df_log.to_csv(path_or_buf="chat_log.csv", index=False)

In [9]:
# Show the full chat log, which also includes the timestamp of each logged call.
df_log

Unnamed: 0,timestamp,prompt,thoughts,answer,raw
0,2025-08-07 00:16:13,what are the top 10 dogs for new couples. Numb...,I need to list the top 10 dog breeds that are ...,"1. Labrador Retriever – Friendly, easy to trai...","{""thoughts"":""I need to list the top 10 dog bre..."
1,2025-08-07 00:17:09,what are the top 10 dogs for new couples,Identify qualities important for couples (frie...,1. Labrador Retriever\n2. Golden Retriever\n3....,"{""thoughts"":""Identify qualities important for ..."
2,2025-08-07 00:18:05,what are the top 5 cats for families,Identify breeds known for family-friendly trai...,"1. Ragdoll – calm, affectionate, loves cuddles...","{""thoughts"":""Identify breeds known for family-..."
3,2025-08-07 00:20:04,what are the best 3 fish for beginners and wha...,Selecting beginner fish requires considering t...,"1. Betta fish – 5‑glb tank, 28–30°C (78–84°F),...","{""thoughts"":""Selecting beginner fish requires ..."
4,2025-08-07 00:20:52,what are the top 5 birds for small apartments,I need to output a list of 5 birds suitable fo...,"1. Budgie (Budgerigar) – Small, friendly, and ...","{""thoughts"":""I need to output a list of 5 bird..."


## Sample DataFrame

In [10]:
import pandas as pd

# Small product catalogue used to demonstrate LLM-based column annotation.
# NOTE: this rebinds `df`, replacing the batch-question results built earlier.
df = pd.DataFrame({
    "product_id": list(range(101, 108)),  # ids 101..107, one per description below
    "description": [
        "Black lace-up sneakers with a memory foam sole.",
        "Elegant silver necklace with a heart pendant.",
        "Bluetooth noise-cancelling over-ear headphones.",
        "Organic cotton t-shirt in various colors.",
        "Stainless steel water bottle with a flip-top lid.",
        "Samsung Galaxy S21 smartphone with 128GB storage.",
        "Wireless ergonomic mouse with customizable buttons.",
    ],
})


## Leveraging the LLM to add a column based on the description in our dataframe

In [11]:
import pandas as pd
from tqdm.auto import tqdm

def annotate_column_with_llm(
    df: pd.DataFrame,
    prompt_template: str,
    target_column: str,
    context_column: str,
    gpt_kwargs: Optional[dict] = None,
    json_mode: bool = True,
    reasoning_column: Optional[str] = None
) -> pd.DataFrame:
    """
    Fill a DataFrame column with LLM answers generated from another column, one call per row.

    Args:
        df: Frame to annotate. It is modified in place and also returned.
        prompt_template: `str.format` template; `{context}` is replaced by the row's value
            from `context_column`.
        target_column: Name of the column that receives the model's 'answer'.
        context_column: Name of the column whose values are inserted into the prompt.
        gpt_kwargs: Extra keyword arguments forwarded to `ask_local_gpt` (e.g. `model`).
            `json_mode`, `render_markdown` and `structured_return` are controlled by this
            function and override same-named entries here.
        json_mode: Forwarded to `ask_local_gpt`.
        reasoning_column: If given, name of the column that receives the model's 'thoughts'.

    Returns:
        The same `df` object with the new column(s) added. Rows whose call failed (a non-dict
        return such as an error string, or a dict carrying a truthy 'error' entry) get None.
    """
    # Explicit settings go last so a duplicate key in gpt_kwargs cannot raise
    # "got multiple values for keyword argument" or switch off the dict return we rely on.
    call_kwargs = {
        **(gpt_kwargs or {}),
        "json_mode": json_mode,
        "render_markdown": False,
        "structured_return": True,
    }

    outputs = []
    reasonings = []
    # Iterate the column itself: iterrows() builds a mixed-dtype Series per row and can
    # upcast values (e.g. int -> float) before they reach the prompt.
    for context in tqdm(df[context_column], total=len(df)):
        prompt = prompt_template.format(context=context)
        res = ask_local_gpt(prompt, **call_kwargs)
        if isinstance(res, dict) and not res.get("error"):
            value, thoughts = res.get("answer"), res.get("thoughts")
        else:
            value, thoughts = None, None
        print(value)
        outputs.append(value)
        reasonings.append(thoughts)
    df[target_column] = outputs
    if reasoning_column:
        df[reasoning_column] = reasonings
    return df


In [12]:
# Classify every product description; the model's reasoning goes into its own column.
annotate_column_with_llm(
    df=df,
    prompt_template="Assign a broad product category (shoes, jewelry, electronics, etc) for: {context}\nAlso explain your reasoning.",
    context_column="description",
    target_column="category",
    reasoning_column="category_reasoning",
)

  0%|          | 0/7 [00:00<?, ?it/s]

🤔 Thinking...

{"thoughts":"The product described is a pair of sneakers, which are a type of footwear. Sneakers are commonly classified under the broad category of shoes, as they are a style of shoes designed for casual wear or athletic use. The mention of a memory foam sole and black lace-up design reinforces that it is a shoe, not a separate category like apparel or accessories. Therefore the appropriate broad product category is Shoes.","answer":"Shoes"}
Shoes
🤔 Thinking...

{"thoughts":"1. Identify what the item is: a necklace made of silver with a heart pendant. 2. Determine the broader category that includes necklaces. 3. Jewelry is the standard category for necklaces, rings, bracelets, earrings, etc. 4. So the appropriate broad product category is Jewelry.","answer":"jewelry"}
jewelry
🤔 Thinking...

{"thoughts":"First, identify the main function of the item: the headphones produce audio and have active noise cancellation, both technical features that involve electronic component

Unnamed: 0,product_id,description,category,category_reasoning
0,101,Black lace-up sneakers with a memory foam sole.,Shoes,"The product described is a pair of sneakers, w..."
1,102,Elegant silver necklace with a heart pendant.,jewelry,1. Identify what the item is: a necklace made ...
2,103,Bluetooth noise-cancelling over-ear headphones.,electronics,"First, identify the main function of the item:..."
3,104,Organic cotton t-shirt in various colors.,Clothing,The item is an organic cotton t‑shirt availabl...
4,105,Stainless steel water bottle with a flip-top lid.,Kitchen & Dining,1. Identify the primary function of the item: ...
5,106,Samsung Galaxy S21 smartphone with 128GB storage.,Electronics,1. Identify the item: Samsung Galaxy S21 smart...
6,107,Wireless ergonomic mouse with customizable but...,Electronics,The product is a computer peripheral that uses...


In [14]:
# Stop the Spark session that was created in the first cell of the notebook.
spark.stop()