In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName("VSCodeTest")
    .master("local[*]")
    .getOrCreate()
)

# Quick smoke check that the session is alive.
print("✅ Spark session running in VS Code!")
print("Spark version:", spark.version)

✅ Spark session running in VS Code!
Spark version: 4.0.0


In [2]:
import time
import json
from typing import Optional
from openai import OpenAI
from IPython.display import Markdown, display


# OpenAI SDK client pointed at the local Ollama server.
# The SDK requires an api_key argument, so a dummy value is supplied.
client = OpenAI(api_key="ollama", base_url="http://localhost:11434/v1")

In [3]:
# --- Shared state used by ask_local_gpt ---
# conversation_log: one record per completed call (timestamp, prompt, thoughts, answer, raw).
# chat_memory: the running list of role/content messages sent to the model.
conversation_log, chat_memory = [], []

def _as_text(value) -> str:
    """Normalize a parsed JSON field to a stripped string (None -> "", non-strings -> JSON text)."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value.strip()
    return json.dumps(value, ensure_ascii=False)


def _parse_json_reply(reply: str):
    """Return (thoughts, answer) from the outermost {...} span of a JSON-mode reply; raise ValueError if absent or invalid."""
    start_idx = reply.find("{")
    end_idx = reply.rfind("}")
    if start_idx == -1 or end_idx < start_idx:
        raise ValueError("No JSON object detected in model reply.")
    data = json.loads(reply[start_idx:end_idx + 1])
    return _as_text(data.get("thoughts", "")), _as_text(data.get("answer", ""))


def _parse_scratchpad_reply(reply: str):
    """Return (thoughts, answer) from '### Scratchpad:' / '### Final Answer:' sections, defaulting to ("", reply.strip())."""
    import re

    thoughts, final_answer = "", reply.strip()
    scratchpad_match = re.search(
        r"### Scratchpad:\s*(.*?)### Final Answer:",
        reply,
        re.DOTALL | re.IGNORECASE
    )
    final_answer_match = re.search(
        r"### Final Answer:\s*([\s\S]*?)(?:$|\n#|\n\n)",
        reply,
        re.DOTALL | re.IGNORECASE
    )
    if scratchpad_match:
        thoughts = scratchpad_match.group(1).strip()
    if final_answer_match:
        final_answer = final_answer_match.group(1).strip()
    return thoughts, final_answer


def ask_local_gpt(
    prompt: str,
    model: str = "gpt-oss:20b",
    system_message: Optional[str] = None,
    render_markdown: bool = True,
    verbose: bool = False,
    return_raw: bool = False,
    reset_chat: bool = False,
    show_history: bool = False,
    stream_response: bool = True,
    reasoning_mode: bool = False,
    json_mode: bool = False,
    structured_return: bool = False
) -> Optional[str]:
    """
    Query a local GPT model via Ollama, with optional step-by-step reasoning or JSON output.

    Every successful call appends the user prompt and the assistant reply to the
    module-level ``chat_memory`` and a summary record to ``conversation_log``.

    Args:
        prompt (str): The user's input/question.
        model (str): Model name passed to the chat-completions endpoint.
        system_message (str, optional): System prompt inserted when ``chat_memory``
            has no system message yet. Not used when ``json_mode`` is True.
        render_markdown (bool): Display the final answer as Markdown.
        verbose (bool): Print the response time (and the clean answer when
            ``render_markdown`` is False).
        return_raw (bool): Return the unparsed model reply.
        reset_chat (bool): Clear ``chat_memory`` before sending the prompt.
        show_history (bool): Print every message currently in ``chat_memory``.
        stream_response (bool): Stream tokens to stdout as they arrive.
        reasoning_mode (bool): Wrap the prompt in a Scratchpad / Final Answer
            template. Not used when ``json_mode`` is True.
        json_mode (bool): Replace ``chat_memory`` with a single system prompt that
            instructs the model to reply as a JSON object with 'thoughts' and
            'answer', then parse the reply accordingly. Earlier history is dropped.
        structured_return (bool): Return a dict instead of a string.

    Returns:
        * ``structured_return=True``: dict with keys 'thoughts', 'answer', 'raw'.
          On failure the dict is ``{'thoughts': '', 'answer': None, 'raw': '',
          'error': <message>}`` so callers can keep treating the result as a dict.
        * ``return_raw=True``: the raw reply string.
        * ``render_markdown=True``: the final answer if ``verbose`` else None.
        * otherwise: the final answer string.
        * on failure without ``structured_return``: the string "Error occurred.".

        Note: the ``Optional[str]`` annotation is kept for interface stability;
        the structured form is a dict.
    """
    global chat_memory, conversation_log

    # The user message that has been queued but not yet answered; used to roll
    # chat_memory back if the request fails.
    pending_user_message = None

    try:
        if reset_chat:
            chat_memory = []

        # -- Prompt setup --
        if json_mode:
            sys_prompt = (
                "You are a helpful assistant. "
                "For every question, reply ONLY in this exact JSON format: "
                "{\"thoughts\": \"<step-by-step reasoning>\", \"answer\": \"<final answer only>\"} "
                "Do not include any extra commentary, code fences, or markdown—just the JSON."
            )
            chat_memory = [{"role": "system", "content": sys_prompt}]
            full_prompt = prompt
        else:
            default_reasoning_prompt = (
                "You are a helpful assistant that always reasons step by step before giving an answer. "
                "First, think through the problem, then provide a clear final answer."
            )
            if not any(m.get("role") == "system" for m in chat_memory):
                chat_memory.insert(0, {
                    "role": "system",
                    "content": system_message or (default_reasoning_prompt if reasoning_mode else "")
                })
            if reasoning_mode:
                full_prompt = (
                    f"### Scratchpad:\n"
                    f"The user asked: \"{prompt.strip()}\"\n"
                    f"Think step-by-step and reason before answering.\n\n"
                    f"### Final Answer:\n"
                )
            else:
                full_prompt = prompt

        pending_user_message = {"role": "user", "content": full_prompt}
        chat_memory.append(pending_user_message)
        start = time.time()

        # --- Get response (streamed or not) ---
        if stream_response:
            print("🤔 Thinking...\n")
            stream = client.chat.completions.create(
                model=model,
                messages=chat_memory,
                stream=True
            )
            tokens = []
            for chunk in stream:
                # Some chunks carry no choices; skip them instead of raising IndexError.
                if not chunk.choices:
                    continue
                delta = chunk.choices[0].delta.content or ""
                print(delta, end="", flush=True)
                tokens.append(delta)
            print()
            assistant_reply = "".join(tokens)
        else:
            response = client.chat.completions.create(
                model=model,
                messages=chat_memory
            )
            # content may be None; treat that as an empty reply.
            assistant_reply = response.choices[0].message.content or ""

        chat_memory.append({"role": "assistant", "content": assistant_reply})
        pending_user_message = None  # answered: nothing to roll back any more

        # --- Parse output ---
        thoughts, final_answer = "", assistant_reply.strip()
        if json_mode:
            try:
                thoughts, final_answer = _parse_json_reply(assistant_reply)
            except ValueError as e:
                # Fall back to the raw reply when the model ignored the JSON contract.
                print("❌ Failed to parse model JSON output:", e)
                thoughts = ""
                final_answer = assistant_reply.strip()
        else:
            thoughts, final_answer = _parse_scratchpad_reply(assistant_reply)

        # --- Log results ---
        conversation_log.append({
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "prompt": prompt,
            "thoughts": thoughts,
            "answer": final_answer,
            "raw": assistant_reply
        })

        # --- Output control ---
        # Presentation first, so verbose/show_history are honored on every
        # return path; the returned value per flag combination is unchanged.
        if not (structured_return or return_raw):
            if render_markdown:
                display(Markdown(final_answer))
            elif verbose:
                print("\n🧼 Clean answer:\n", final_answer)

        if verbose:
            print(f"\n✅ Response time: {round(time.time() - start, 2)}s")

        if show_history:
            print("\n📜 Message History:")
            for msg in chat_memory:
                print(f"{msg['role'].upper()}: {msg['content']}\n")

        if structured_return:
            return {
                "thoughts": thoughts,
                "answer": final_answer,
                "raw": assistant_reply
            }
        if return_raw:
            return assistant_reply
        if render_markdown and not verbose:
            return None
        return final_answer

    except Exception as e:
        # Drop the unanswered user message so the next call does not send two
        # consecutive user turns.
        if (
            pending_user_message is not None
            and chat_memory
            and chat_memory[-1] is pending_user_message
        ):
            chat_memory.pop()
        print("❌ Error in ask_local_gpt:", str(e))
        if structured_return:
            return {"thoughts": "", "answer": None, "raw": "", "error": str(e)}
        return "Error occurred."


In [4]:
# Demo call: stream the reply and request the JSON thoughts/answer format.
# reasoning_mode is passed as well, but the json_mode branch does not read it.
ask_local_gpt(
    prompt="what are the top 10 dogs for new couples. Number them from 1 to 10 and explain why each is a good choice.",
    json_mode=True,
    reasoning_mode=True,
    stream_response=True,
)

🤔 Thinking...

{"thoughts":"I will list ten dog breeds that are typically recommended for new couples, highlighting their friendly nature, moderate exercise needs, and low to moderate grooming requirements. Each item will be numbered and briefly explained.", "answer":"1. Labrador Retriever – Friendly, moderate energy, great for beginners.\n2. Golden Retriever – Similar temperament, loves people.\n3. Cavalier King Charles Spaniel – Small, affectionate, low exercise.\n4. French Bulldog – Low-maintenance, good for apartments.\n5. Boston Terrier – Friendly, easy to train, small size.\n6. Beagle – Curious, good with couples, moderate exercise.\n7. Miniature Poodle – Low shedding, intelligent, easy to groom.\n8. Miniature Schnauzer – Protective, adaptable, low shedding.\n9. Shetland Sheepdog – Intelligent, gentle, small to medium size.\n10. Pembroke Welsh Corgi – Energetic, loyal, good family dog."}


1. Labrador Retriever – Friendly, moderate energy, great for beginners.
2. Golden Retriever – Similar temperament, loves people.
3. Cavalier King Charles Spaniel – Small, affectionate, low exercise.
4. French Bulldog – Low-maintenance, good for apartments.
5. Boston Terrier – Friendly, easy to train, small size.
6. Beagle – Curious, good with couples, moderate exercise.
7. Miniature Poodle – Low shedding, intelligent, easy to groom.
8. Miniature Schnauzer – Protective, adaptable, low shedding.
9. Shetland Sheepdog – Intelligent, gentle, small to medium size.
10. Pembroke Welsh Corgi – Energetic, loyal, good family dog.

In [5]:
import pandas as pd

## Batch questions in structured format

In [6]:
# Ask each question in JSON mode and collect one structured record per prompt.
results = []
questions = [
    "what are the top 10 dogs for new couples",
    "what are the top 5 cats for families",
    # ... more prompts
]
for q in questions:
    res = ask_local_gpt(q, stream_response=True, reasoning_mode=True, json_mode=True, structured_return=True)
    if not isinstance(res, dict):
        # ask_local_gpt returns a plain string when the request fails; keep the
        # row (with empty fields) instead of crashing on item assignment.
        res = {"thoughts": None, "answer": None, "raw": res}
    # Build a new record rather than mutating the returned dict.
    results.append({**res, "prompt": q})

df = pd.DataFrame(results)

🤔 Thinking...

{"thoughts":"1. Identify key factors for new couples: companionship, friendliness, trainability, low to moderate exercise and grooming requirements, and suitability for living situations such as apartments or houses.\n2. Create a list of breeds that consistently score well on these dimensions. Prioritize medium-sized dogs that are easy to train, affectionate, and have a strong social nature.\n3. Verify that each breed has a reputation for being calm, adaptable, and not overly demanding.\n4. Compile the final list of ten breeds, ordering from most universally recommended for couples to slightly more niche options.\n5. Format the answer with a simple numbered list, delivering only the essential information as requested." , "answer":"1. Labrador Retriever\n2. Golden Retriever\n3. Cavalier King Charles Spaniel\n4. French Bulldog\n5. Boston Terrier\n6. Beagle\n7. Cocker Spaniel\n8. Havanese\n9. Bichon Frise\n10. Pug"}
🤔 Thinking...

{"thoughts":"The user asked for the top 5 c

In [7]:
# Show the batch results: thoughts, answer and raw reply per question, plus the prompt.
df

Unnamed: 0,thoughts,answer,raw,prompt
0,1. Identify key factors for new couples: compa...,1. Labrador Retriever\n2. Golden Retriever\n3....,"{""thoughts"":""1. Identify key factors for new c...",what are the top 10 dogs for new couples
1,The user asked for the top 5 cat breeds that a...,"1. Maine Coon – large, gentle, good with kids....","{""thoughts"":""The user asked for the top 5 cat ...",what are the top 5 cats for families


## Showing the difference between the chat log and the structured format

In [8]:

# Turn the accumulated conversation_log records into a DataFrame and save a CSV copy.
df_log = pd.DataFrame(data=conversation_log)
df_log.to_csv(path_or_buf="chat_log.csv", index=False)

In [9]:
# Show the conversation log: one row per logged ask_local_gpt call, with its timestamp.
df_log

Unnamed: 0,timestamp,prompt,thoughts,answer,raw
0,2025-08-06 23:18:30,what are the top 10 dogs for new couples. Numb...,I will list ten dog breeds that are typically ...,"1. Labrador Retriever – Friendly, moderate ene...","{""thoughts"":""I will list ten dog breeds that a..."
1,2025-08-06 23:19:26,what are the top 10 dogs for new couples,1. Identify key factors for new couples: compa...,1. Labrador Retriever\n2. Golden Retriever\n3....,"{""thoughts"":""1. Identify key factors for new c..."
2,2025-08-06 23:19:54,what are the top 5 cats for families,The user asked for the top 5 cat breeds that a...,"1. Maine Coon – large, gentle, good with kids....","{""thoughts"":""The user asked for the top 5 cat ..."


## Sample DataFrame

In [10]:
import pandas as pd

# Small product catalogue used as input for the LLM annotation demo:
# one (product_id, description) pair per row.
df = pd.DataFrame(
    [
        (101, "Black lace-up sneakers with a memory foam sole."),
        (102, "Elegant silver necklace with a heart pendant."),
        (103, "Bluetooth noise-cancelling over-ear headphones."),
        (104, "Organic cotton t-shirt in various colors."),
        (105, "Stainless steel water bottle with a flip-top lid."),
    ],
    columns=["product_id", "description"],
)


## Leveraging the LLM to add a column based on the description in our dataframe

In [11]:
import pandas as pd
from tqdm.auto import tqdm

def annotate_column_with_llm(
    df: pd.DataFrame,
    prompt_template: str,
    target_column: str,
    context_column: str,
    gpt_kwargs: Optional[dict] = None,
    json_mode: bool = True,
    reasoning_column: Optional[str] = None
) -> pd.DataFrame:
    """
    Annotate a DataFrame using the answer/reasoning fields returned by
    ask_local_gpt(structured_return=True).

    One prompt is built per row by formatting ``prompt_template`` with
    ``context=<value of context_column>`` and sent to ``ask_local_gpt``.

    Args:
        df: Frame to annotate. It is modified in place and also returned.
        prompt_template: ``str.format`` template containing ``{context}``.
        target_column: Column that receives each row's 'answer'.
        context_column: Column whose value is substituted into the template.
        gpt_kwargs: Extra keyword arguments forwarded to ``ask_local_gpt``.
            ``json_mode``, ``render_markdown`` and ``structured_return`` are
            always set by this function and override same-named keys here.
        json_mode: Forwarded to ``ask_local_gpt``.
        reasoning_column: If given, column that receives each row's 'thoughts'.

    Returns:
        The same ``df`` object with the new column(s) added. Rows for which
        ``ask_local_gpt`` did not return a dict (e.g. its error string) get None.
    """
    # Explicit settings go last so a duplicate key in gpt_kwargs cannot raise
    # "got multiple values for keyword argument".
    call_kwargs = {
        **(gpt_kwargs or {}),
        "json_mode": json_mode,
        "render_markdown": False,
        "structured_return": True,
    }

    outputs = []
    reasonings = []
    for context in tqdm(df[context_column], total=len(df)):
        prompt = prompt_template.format(context=context)
        res = ask_local_gpt(prompt, **call_kwargs)
        # ask_local_gpt can return a non-dict (its error string); a truthiness
        # check alone would then crash on .get().
        if isinstance(res, dict):
            value = res.get("answer")
            thoughts = res.get("thoughts")
        else:
            value = None
            thoughts = None
        print(value)
        outputs.append(value)
        reasonings.append(thoughts)

    df[target_column] = outputs
    if reasoning_column:
        df[reasoning_column] = reasonings
    return df


In [None]:
# Add a `category` column (plus the model's reasoning) derived from each description.
annotate_column_with_llm(
    df=df,
    context_column="description",
    target_column="category",
    reasoning_column="category_reasoning",
    prompt_template="Assign a broad product category (shoes, jewelry, electronics, etc) for: {context}\nAlso explain your reasoning.",
)

  0%|          | 0/5 [00:00<?, ?it/s]

🤔 Thinking...

{"thoughts":"1. The item is described as 'Black lace-up sneakers with a memory foam sole'.\n2. Sneakers are a type of footwear.\n3. The question requests a broad product category like 'shoes', 'jewelry', 'electronics', etc.\n4. Therefore the correct broad category is shoes (footwear).", "answer":"Shoes"}
Shoes
🤔 Thinking...

{"thoughts":"The item is described as an elegant silver necklace with a heart pendant. Necklaces are a common jewelry item. Silver is a metal used in jewelry, and the heart pendant indicates decorative design typical of jewelry. Therefore the broad category is ‘Jewelry’.","answer":"Jewelry"}
Jewelry
🤔 Thinking...

{"thoughts":"1. Identify the main characteristics of the item: it is a wireless (Bluetooth) headphone, which performs sound processing (noise-cancelling) and is an over-ear design. 2. Determine the broad category used in retail or e-commerce contexts that best captures such devices; common high-level categories are: clothing, household, ele

In [14]:
# Inspect the annotated frame; `category` and `category_reasoning` were added in place.
df

Unnamed: 0,product_id,description,category,category_reasoning
0,101,Black lace-up sneakers with a memory foam sole.,Shoes,1. The item is described as 'Black lace-up sne...
1,102,Elegant silver necklace with a heart pendant.,Jewelry,The item is described as an elegant silver nec...
2,103,Bluetooth noise-cancelling over-ear headphones.,electronics,1. Identify the main characteristics of the it...
3,104,Organic cotton t-shirt in various colors.,Clothing,"First, identify the main type of item: an orga..."
4,105,Stainless steel water bottle with a flip-top lid.,Sporting Goods,The product is a stainless steel water bottle ...


In [13]:
# Shut down the Spark session created in the first cell.
spark.stop()