In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("VSCodeTest") \
    .master("local[*]") \
    .getOrCreate()

print("✅ Spark session running in VS Code!")
print("Spark version:", spark.version)

✅ Spark session running in VS Code!
Spark version: 4.0.0


In [2]:
import time
import json
from typing import Optional
from openai import OpenAI
from IPython.display import Markdown, display


# Connect to your local Ollama server
client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama"  # Dummy key required by SDK
)

In [3]:
# --- Setup ---
conversation_log = []
chat_memory = []

def ask_local_gpt(
    prompt: str,
    model: str = "gpt-oss:20b",
    system_message: Optional[str] = None,
    render_markdown: bool = True,
    verbose: bool = False,
    return_raw: bool = False,
    reset_chat: bool = False,
    show_history: bool = False,
    stream_response: bool = True,
    reasoning_mode: bool = False,
    json_mode: bool = False,  # <-- new flag!
    structured_return: bool = False  # <-- NEW!
) -> Optional[str]:
    """
    Query a local GPT model via Ollama, with support for step-by-step reasoning (Scratchpad).
    If json_mode is True, the model is instructed to respond in JSON, and results are parsed accordingly.

    Args:
        prompt (str): The user’s input/question.
        ...
        json_mode (bool): If True, instructs the LLM to return a JSON object with 'thoughts' and 'answer'.
    Returns:
        str or None: Clean final answer or full output, depending on flags.
    """
    global chat_memory, conversation_log

    try:
        if reset_chat:
            chat_memory = []

        # -- Prompt setup --
        if json_mode:
            sys_prompt = (
                "You are a helpful assistant. "
                "For every question, reply ONLY in this exact JSON format: "
                "{\"thoughts\": \"<step-by-step reasoning>\", \"answer\": \"<final answer only>\"} "
                "Do not include any extra commentary, code fences, or markdown—just the JSON."
            )
            chat_memory = [{"role": "system", "content": sys_prompt}]
            full_prompt = prompt
        else:
            # Normal or reasoning mode (old logic)
            default_reasoning_prompt = (
                "You are a helpful assistant that always reasons step by step before giving an answer. "
                "First, think through the problem, then provide a clear final answer."
            )
            if not any(m.get("role") == "system" for m in chat_memory):
                chat_memory.insert(0, {
                    "role": "system",
                    "content": system_message or (default_reasoning_prompt if reasoning_mode else "")
                })
            if reasoning_mode:
                full_prompt = (
                    f"### Scratchpad:\n"
                    f"The user asked: \"{prompt.strip()}\"\n"
                    f"Think step-by-step and reason before answering.\n\n"
                    f"### Final Answer:\n"
                )
            else:
                full_prompt = prompt

        chat_memory.append({"role": "user", "content": full_prompt})
        start = time.time()

        # --- Get response (streamed or not) ---
        if stream_response:
            print("🤔 Thinking...\n")
            stream = client.chat.completions.create(
                model=model,
                messages=chat_memory,
                stream=True
            )
            tokens = []
            for chunk in stream:
                delta = chunk.choices[0].delta.content or ""
                print(delta, end="", flush=True)
                tokens.append(delta)
            print()
            assistant_reply = "".join(tokens)
        else:
            response = client.chat.completions.create(
                model=model,
                messages=chat_memory
            )
            assistant_reply = response.choices[0].message.content

        chat_memory.append({"role": "assistant", "content": assistant_reply})

        # --- Parse output ---
        thoughts, final_answer = "", assistant_reply.strip()
        if json_mode:
            # Try to parse first valid JSON block in reply
            try:
                start_idx = assistant_reply.find('{')
                end_idx = assistant_reply.rfind('}') + 1
                if start_idx == -1 or end_idx == -1:
                    raise ValueError("No JSON object detected in model reply.")
                data = json.loads(assistant_reply[start_idx:end_idx])
                thoughts = data.get("thoughts", "").strip()
                final_answer = data.get("answer", "").strip()
            except Exception as e:
                print("❌ Failed to parse model JSON output:", e)
                thoughts = ""
                final_answer = assistant_reply.strip()
        else:
            # fallback: regex/extraction (from your original, see above)
            import re
            scratchpad_match = re.search(
                r"### Scratchpad:\s*(.*?)### Final Answer:",
                assistant_reply,
                re.DOTALL | re.IGNORECASE
            )
            final_answer_match = re.search(
                r"### Final Answer:\s*([\s\S]*?)(?:$|\n#|\n\n)",
                assistant_reply,
                re.DOTALL | re.IGNORECASE
            )
            if scratchpad_match:
                thoughts = scratchpad_match.group(1).strip()
            if final_answer_match:
                final_answer = final_answer_match.group(1).strip()

        # --- Log results ---
        conversation_log.append({
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "prompt": prompt,
            "thoughts": thoughts,
            "answer": final_answer,
            "raw": assistant_reply
        })

        # --- Output control ---
        if structured_return:
            return {
                "thoughts": thoughts,
                "answer": final_answer,
                "raw": assistant_reply
            }
        elif return_raw:
            return assistant_reply
        elif render_markdown:
            display(Markdown(final_answer))
        elif verbose:
            print("\n🧼 Clean answer:\n", final_answer)
        else:
            return final_answer

        if verbose:
            print(f"\n✅ Response time: {round(time.time() - start, 2)}s")

        if show_history:
            print("\n📜 Message History:")
            for msg in chat_memory:
                print(f"{msg['role'].upper()}: {msg['content']}\n")

        return final_answer if verbose else None

    except Exception as e:
        print("❌ Error in ask_local_gpt:", str(e))
        return "Error occurred."


In [19]:
ask_local_gpt(
    "what are the top 10 dogs for new couples. Number them from 1 to 10 and explain why each is a good choice.",
    stream_response=True,
    reasoning_mode=True,
    json_mode=True
)

🤔 Thinking...

{"thoughts":"Identify 10 dog breeds suitable for new couples: low maintenance, friendly, moderate size, good with cohabitation, non-aggressive, easy training, adaptable to lifestyle. Provide clear reasons for each choice. Format answer as numbered list in a single string within JSON.", "answer":"1. Labrador Retriever – friendly, trainable, gentle temperament.\n2. Cavalier King Charles Spaniel – affectionate, adapts to small spaces.\n3. French Bulldog – low exercise, easy care, social.\n4. Golden Retriever – loyalty, good with partners, easy to train.\n5. Pug – low grooming, playful, good companionship.\n6. Bichon Frise – hypoallergenic, happy, manageable size.\n7. Shih Tzu – low activity need, love lap sitting.\n8. Boston Terrier – adaptable, friendly, minimal shedding.\n9. Beagle – happy, good for couples who like outdoor walks.\n10. Greyhound (retired) – relaxed indoors, calm, low maintenance."}


1. Labrador Retriever – friendly, trainable, gentle temperament.
2. Cavalier King Charles Spaniel – affectionate, adapts to small spaces.
3. French Bulldog – low exercise, easy care, social.
4. Golden Retriever – loyalty, good with partners, easy to train.
5. Pug – low grooming, playful, good companionship.
6. Bichon Frise – hypoallergenic, happy, manageable size.
7. Shih Tzu – low activity need, love lap sitting.
8. Boston Terrier – adaptable, friendly, minimal shedding.
9. Beagle – happy, good for couples who like outdoor walks.
10. Greyhound (retired) – relaxed indoors, calm, low maintenance.

In [16]:
import pandas as pd

In [None]:
results = []
questions = [
    "what are the top 10 dogs for new couples",
    "what are the top 5 cats for families",
    # ... more prompts
]
for q in questions:
    res = ask_local_gpt(q, stream_response=True, reasoning_mode=True, json_mode=True, structured_return=True)
    res["prompt"] = q  # Optionally add prompt to output
    results.append(res)

df = pd.DataFrame(results)

🤔 Thinking...

{"thoughts":"The user wants a list of the top 10 dog breeds that are suitable for new couples. I’ll pick breeds that are generally friendly, low to moderate energy, not overly large, and easy to train. I’ll number them and list them in a single string. I’ll use commas to separate items so the answer stays clear. The JSON must contain exactly two keys: \"thoughts\" and \"answer\".", "answer":"1. Labrador Retriever, 2. Golden Retriever, 3. Cavalier King Charles Spaniel, 4. French Bulldog, 5. Beagle, 6. Boston Terrier, 7. Shih Tzu, 8. Cocker Spaniel, 9. Pomeranian, 10. Bichon Fr\u00e8s"}
🤔 Thinking...

{"thoughts":"Identified common breeds known for their friendly, docile nature and compatibility with children and other pets. Chose five that are well-documented as family-friendly: Maine Coon, Ragdoll, Birman, British Shorthair, and Persian. Prepared a concise list for the final answer.","answer":"1. Maine Coon – large, gentle, great with kids.\n2. Ragdoll – affectionate, la

In [25]:
df

Unnamed: 0,thoughts,answer,raw,prompt
0,The user wants a list of the top 10 dog breeds...,"1. Labrador Retriever, 2. Golden Retriever, 3....","{""thoughts"":""The user wants a list of the top ...",what are the top 10 dogs for new couples
1,Identified common breeds known for their frien...,"1. Maine Coon – large, gentle, great with kids...","{""thoughts"":""Identified common breeds known fo...",what are the top 5 cats for families


In [None]:

df_log = pd.DataFrame(conversation_log)
df_log.to_csv("chat_log.csv", index=False)

In [23]:
df_log

Unnamed: 0,timestamp,prompt,thoughts,answer,raw
0,2025-08-06 23:02:20,what are the top 10 dogs for new couples,,,
1,2025-08-06 23:02:29,"Assign a broad product category (shoes, jewelr...",The item described is a pair of Black lace‑up ...,Shoes,"{""thoughts"":""The item described is a pair of B..."
2,2025-08-06 23:02:45,"Assign a broad product category (shoes, jewelr...","The item described is a necklace, which is a p...",jewelry,"{""thoughts"":""The item described is a necklace,..."
3,2025-08-06 23:03:07,"Assign a broad product category (shoes, jewelr...",1. Identify the product type: Bluetooth noise‑...,Electronics,"{""thoughts"":""1. Identify the product type: Blu..."
4,2025-08-06 23:03:28,"Assign a broad product category (shoes, jewelr...",A t-shirt is a garment worn on the upper body....,Clothing,"{""thoughts"":""A t-shirt is a garment worn on th..."
5,2025-08-06 23:03:54,"Assign a broad product category (shoes, jewelr...",A stainless steel water bottle with a flip-top...,Sports & Outdoors,"{""thoughts"":""A stainless steel water bottle wi..."
6,2025-08-06 23:05:43,what are the top 10 dogs for new couples,Determine dog breeds that are low to moderate ...,"1. Labrador Retriever (friendly, family‑orient...","{""thoughts"":""Determine dog breeds that are low..."
7,2025-08-06 23:09:28,what are the top 10 dogs for new couples,I considered traits important for new couples—...,"French Bulldog, Cavalier King Charles Spaniel,...","{""thoughts"":""I considered traits important for..."
8,2025-08-06 23:12:07,what are the top 10 dogs for new couples. Numb...,Identify 10 dog breeds suitable for new couple...,"1. Labrador Retriever – friendly, trainable, g...","{""thoughts"":""Identify 10 dog breeds suitable f..."


In [7]:
import pandas as pd

df = pd.DataFrame({
    "product_id": [101, 102, 103, 104, 105],
    "description": [
        "Black lace-up sneakers with a memory foam sole.",
        "Elegant silver necklace with a heart pendant.",
        "Bluetooth noise-cancelling over-ear headphones.",
        "Organic cotton t-shirt in various colors.",
        "Stainless steel water bottle with a flip-top lid."
    ]
})


In [8]:
import pandas as pd
from tqdm.auto import tqdm

def annotate_column_with_llm(
    df: pd.DataFrame,
    prompt_template: str,
    target_column: str,
    context_column: str,
    gpt_kwargs: dict = None,
    json_mode: bool = True,
    reasoning_column: str = None
):
    """
    Annotate a DataFrame using the answer/reasoning fields returned by ask_local_gpt(structured_return=True).
    """
    gpt_kwargs = gpt_kwargs or {}

    outputs = []
    reasonings = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        prompt = prompt_template.format(context=row[context_column])
        res = ask_local_gpt(
            prompt,
            json_mode=json_mode,
            render_markdown=False,
            structured_return=True,  # <--- The magic!
            **gpt_kwargs
        )
        value = res.get("answer") if res else None
        thoughts = res.get("thoughts") if res else None
        print(value)
        outputs.append(value)
        reasonings.append(thoughts)
    df[target_column] = outputs
    if reasoning_column:
        df[reasoning_column] = reasonings
    return df


In [9]:
annotate_column_with_llm(
    df,
    prompt_template="Assign a broad product category (shoes, jewelry, electronics, etc) for: {context}\nAlso explain your reasoning.",
    target_column="category",
    context_column="description",
    reasoning_column="category_reasoning"
)
print(df)

  0%|          | 0/5 [00:00<?, ?it/s]

🤔 Thinking...

{"thoughts":"The item described is a pair of Black lace‑up sneakers with a memory foam sole. It is a type of footwear, so the appropriate product category is shoes.","answer":"Shoes"}
Shoes
🤔 Thinking...

{"thoughts":"The item described is a necklace, which is a piece of ornamental jewelry. A necklace is a classic example of jewelry, distinguished from other categories such as shoes or electronics. The presence of a decorative pendant further confirms it as a jewelry item.", "answer":"jewelry"}
jewelry
🤔 Thinking...

{"thoughts":"1. Identify the product type: Bluetooth noise‑cancelling over‑ear headphones.\n2. Determine the primary function: audio playback and wireless connectivity.\n3. Classify by industry sector: belongs to consumer electronics.\n4. Narrow down to broad category: electronics (as opposed to fashion, jewelry, etc.).\n5. Provide the final category name as \"Electronics\".","answer":"Electronics"}
Electronics
🤔 Thinking...

{"thoughts":"A t-shirt is a garm

In [10]:
spark.stop()