In [69]:

from langsmith import evaluate

character_driven_experiment_id = "20fa3c52-fd37-4437-b2d3-7b83e0411705"
plot_driven_experiment_id = "2c241beb-3b6e-4145-8400-8d2cdc405068"

print("Running comparative evaluation...")
evaluate(
    [character_driven_experiment_id, plot_driven_experiment_id],
    evaluators=[ranked_preference]
)
print("Comparative evaluation complete!")

Running comparative evaluation...
View the pairwise evaluation results at:
https://smith.langchain.com/o/7078c9e6-ede7-4d80-b45b-119f20137777/datasets/80f6d3d0-3e0a-404f-8845-b073c7c16f44/compare?selectedSessions=20fa3c52-fd37-4437-b2d3-7b83e0411705%2C2c241beb-3b6e-4145-8400-8d2cdc405068&comparativeExperiment=0fb26c92-c037-4ef4-a2aa-1aa0eacdcc29




  0%|          | 0/5 [00:00<?, ?it/s]

Evaluating stories for prompt: A lone astronaut discovers a strange, glowing plan...
Evaluating stories for prompt: A talking cat helps a shy librarian uncover a magi...
Evaluating stories for prompt: In a future where dreams can be recorded, a detect...
Evaluating stories for prompt: A baker finds a mysterious ingredient that makes h...
Evaluating stories for prompt: Two rival knights from different kingdoms meet by ...
Comparative evaluation complete!


In [68]:
import json
from groq import Groq
import uuid 


def ranked_preference(runs, example):
    """
    Evaluates two runs (stories) against an example prompt using a Groq model as a judge.
    Returns a dictionary in the format expected by LangSmith for comparative evaluation.
    """
    if len(runs) != 2:
        raise ValueError("This evaluator expects exactly two runs for comparison.")

    prompt_text = example.inputs.get("prompt", "No prompt provided in example.")

    
    story_a = runs[0].outputs.get("output", "N/A") if runs[0].outputs else "N/A"
    story_b = runs[1].outputs.get("output", "N/A") if runs[1].outputs else "N/A"

   
    run_id_a = str(runs[0].id)
    run_id_b = str(runs[1].id)

    print(f"Evaluating stories for prompt: {prompt_text[:50]}...")

    try:
        completion = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {   
                    "role": "system",
                    "content": JUDGE_SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    "content": JUDGE_HUMAN_PROMPT.format(
                        prompt=prompt_text, 
                        story_a=story_a,
                        story_b=story_b
                    )}
            ],
            response_format={"type": "json_object"},
            temperature=0.0,
            max_tokens=50,
        )
        
        parsed_content = json.loads(completion.choices[0].message.content)
        preference_score = parsed_content.get("preference")

        if preference_score not in ["A", "B", "Tie"]:
            print(f"Warning: Unexpected preference score received: {preference_score}. Full response: {completion.choices[0].message.content}")
            return {
                "key": "ranked_preference", 
                "score": 0, 
                "comment": f"Invalid preference from judge: {preference_score}",
                "scores": {run_id_a: 0, run_id_b: 0} 
            }

        scores_dict = {run_id_a: 0, run_id_b: 0} 

        if preference_score == "A":
            scores_dict[run_id_a] = 1 
            scores_dict[run_id_b] = 0
        elif preference_score == "B":
            scores_dict[run_id_a] = 0
            scores_dict[run_id_b] = 1 
        else: 
            scores_dict[run_id_a] = 0.5 
            scores_dict[run_id_b] = 0.5
            
        return {
            "key": "ranked_preference", 
            "score": preference_score, 
            "comment": f"Judge preferred {preference_score}",
            "scores": scores_dict 
        }

    except Exception as e:
        print(f"Error during Groq API call or JSON parsing: {e}")
        return {
            "key": "ranked_preference",
            "score": "Error",
            "comment": f"Evaluation failed: {e}",
            "scores": {run_id_a: 0, run_id_b: 0} 
        }

In [63]:
JUDGE_SYSTEM_PROMPT = """
Please act as an impartial literary judge and evaluate the creativity and originality of two short stories generated from the same prompt.
Your evaluation should consider factors such as narrative originality, imaginative details, character uniqueness, and the overall 'spark' of creativity. 
Begin your evaluation by comparing the two stories and provide a short explanation of your preference. 
Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. 
Do not favor certain names of the assistants. 
Be as objective as possible. """

JUDGE_HUMAN_PROMPT = """
[The Story Prompt] {prompt}

[The Start of Assistant A's Story] {story_a} [The End of Assistant A's Story]

[The Start of Assistant B's Story] {story_b} [The End of Assistant B's Story]"""

In [62]:
def character_driven_story_generator(inputs: dict):
    response = groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[
            {
                "role": "user",
                "content": f"Craft a unique, character-driven short story (around 100 words) inspired by this prompt: '{inputs['prompt']}'. Emphasize character development and introduce an unexpected element."
            }
        ],
    )
    return response.choices[0].message.content

client.evaluate(
    character_driven_story_generator,
    data=dataset,
    evaluators=[creativity_score_evaluator],
    experiment_prefix="Character-Driven Story"
)

View the evaluation results for experiment: 'Character-Driven Story-d847abf9' at:
https://smith.langchain.com/o/7078c9e6-ede7-4d80-b45b-119f20137777/datasets/80f6d3d0-3e0a-404f-8845-b073c7c16f44/compare?selectedSessions=20fa3c52-fd37-4437-b2d3-7b83e0411705




0it [00:00, ?it/s]

Error running evaluator <DynamicRunEvaluator creativity_score_evaluator> on run d1a814fa-6a2b-4fd6-95bc-8819e3e3c010: BadRequestError('Error code: 400 - {\'error\': {\'message\': "\'messages\' must contain the word \'json\' in some form, to use \'response_format\' of type \'json_object\'.", \'type\': \'invalid_request_error\'}}')
Traceback (most recent call last):
  File "c:\Users\DELL\anaconda3\envs\MAT496\Lib\site-packages\langsmith\evaluation\_runner.py", line 1620, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
        run=run,
        example=example,
        evaluator_run_id=evaluator_run_id,
    )
  File "c:\Users\DELL\anaconda3\envs\MAT496\Lib\site-packages\langsmith\evaluation\evaluator.py", line 351, in evaluate_run
    result = self.func(
        run,
        example,
        langsmith_extra={"run_id": evaluator_run_id, "metadata": metadata},
    )
  File "c:\Users\DELL\anaconda3\envs\MAT496\Lib\site-packages\langsmith\run_helpe

Unnamed: 0,inputs.prompt,outputs.output,error,feedback.wrapper,execution_time,example_id,id
0,Two rival knights from different kingdoms meet...,"Sir Kaelen's visor gaped at the roiling, green...",,,0.610456,58544518-82cc-452e-bd5a-55cb9a938c2d,d1a814fa-6a2b-4fd6-95bc-8819e3e3c010
1,A baker finds a mysterious ingredient that mak...,"Elara, always practical, scoffed at the irides...",,,0.560345,b520ab2a-5837-4637-98eb-eb526a9ea313,f59c6f6b-4da7-4ae7-8913-1524b9ec70c2
2,"In a future where dreams can be recorded, a de...",Rain lashed against Inspector Crowe's office w...,,,0.548364,36da89f4-36e0-4339-ab0c-b5d13d66ae83,2b72d428-a813-4bec-a931-74cc634f65f0
3,A talking cat helps a shy librarian uncover a ...,"Eleanor, the librarian, trembled as she held t...",,,0.651115,63c4dad9-5528-460f-bfe5-43c1ac42e94d,702cc4d7-b014-4546-8dd4-01c85fbe0111
4,"A lone astronaut discovers a strange, glowing ...",Dr. Elara Vance's helmet visor reflected the d...,,,0.525727,cfde001b-7710-47c8-b1b6-b27e4e9d66ed,86d73d13-a43d-4540-a592-5df2a1779319


In [61]:

def plot_driven_story_generator(inputs: dict):
    response = groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[
            {
                "role": "user",
                "content": f"Write a short, plot-driven story (around 100 words) based on this prompt: '{inputs['prompt']}'. Focus on clear events and a resolution."
            }
        ],
    )
    return response.choices[0].message.content

client.evaluate(
    plot_driven_story_generator,
    data=dataset,
    evaluators=[creativity_score_evaluator],
    experiment_prefix="Plot-Driven Story"
)



View the evaluation results for experiment: 'Plot-Driven Story-e726a52b' at:
https://smith.langchain.com/o/7078c9e6-ede7-4d80-b45b-119f20137777/datasets/80f6d3d0-3e0a-404f-8845-b073c7c16f44/compare?selectedSessions=2c241beb-3b6e-4145-8400-8d2cdc405068




0it [00:00, ?it/s]

Error running evaluator <DynamicRunEvaluator creativity_score_evaluator> on run dbc0f8de-6a2f-46bb-9511-fdedc1b4b7a5: BadRequestError('Error code: 400 - {\'error\': {\'message\': "\'messages\' must contain the word \'json\' in some form, to use \'response_format\' of type \'json_object\'.", \'type\': \'invalid_request_error\'}}')
Traceback (most recent call last):
  File "c:\Users\DELL\anaconda3\envs\MAT496\Lib\site-packages\langsmith\evaluation\_runner.py", line 1620, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
        run=run,
        example=example,
        evaluator_run_id=evaluator_run_id,
    )
  File "c:\Users\DELL\anaconda3\envs\MAT496\Lib\site-packages\langsmith\evaluation\evaluator.py", line 351, in evaluate_run
    result = self.func(
        run,
        example,
        langsmith_extra={"run_id": evaluator_run_id, "metadata": metadata},
    )
  File "c:\Users\DELL\anaconda3\envs\MAT496\Lib\site-packages\langsmith\run_helpe

Unnamed: 0,inputs.prompt,outputs.output,error,feedback.wrapper,execution_time,example_id,id
0,Two rival knights from different kingdoms meet...,The crimson halo bathed the battlefield in an ...,,,0.712201,58544518-82cc-452e-bd5a-55cb9a938c2d,dbc0f8de-6a2f-46bb-9511-fdedc1b4b7a5
1,A baker finds a mysterious ingredient that mak...,Elara discovered a vial tucked in her grandmot...,,,0.566061,b520ab2a-5837-4637-98eb-eb526a9ea313,8571ed30-341b-4f50-a8c3-0eed3ed4e6b1
2,"In a future where dreams can be recorded, a de...","The holo-projection flickered, displaying a lu...",,,0.562653,36da89f4-36e0-4339-ab0c-b5d13d66ae83,db10fcbf-be5a-4e57-b7d9-65d17583744a
3,A talking cat helps a shy librarian uncover a ...,"Agnes, the shy librarian, stumbled upon an anc...",,,0.579295,63c4dad9-5528-460f-bfe5-43c1ac42e94d,ec9de22a-f810-4fe5-865d-4f4c5f044ac1
4,"A lone astronaut discovers a strange, glowing ...",The rusty drill screeched as it pierced the si...,,,0.605316,cfde001b-7710-47c8-b1b6-b27e4e9d66ed,af733901-045e-436d-b035-4339ecd5d397


In [60]:
from pydantic import BaseModel, Field
from groq import Groq
import json 

groq_client = Groq(api_key="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")

GROQ_MODEL = "gemma2-9b-it"

CREATIVITY_SYSTEM_PROMPT = """You are a highly experienced literary critic, tasked with evaluating the creativity and originality of short stories based on a given prompt."""

CREATIVITY_HUMAN_PROMPT = """
[The Story Prompt] {prompt}
[The Generated Story] {story}
"""

class CreativityScore(BaseModel):
    score: int = Field(description="""A score from 1-5 ranking the creativity and originality of the generated story in response to the prompt, with 1 being unoriginal/generic, and 5 being highly creative and unique.""")
    
def creativity_score_evaluator(inputs: dict, outputs: dict) -> list:
    completion = groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[
            {   
                "role": "system",
                "content": CREATIVITY_SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": CREATIVITY_HUMAN_PROMPT.format(
                    prompt=inputs["prompt"], 
                    story=outputs.get("output", "N/A"),
                )}
        ],
        response_format={"type": "json_object"},
    )

    parsed_content = json.loads(completion.choices[0].message.content)
    creativity_score = parsed_content.get("score")
    
    return {"key": "creativity_score", "score": creativity_score}

In [58]:

first_example = next(client.list_examples(dataset_id=dataset.id))
print(first_example.inputs)

{'prompt': 'Two rival knights from different kingdoms meet by chance during a rare cosmic event, forcing them to cooperate.'}


In [57]:

from langsmith import Client
from langsmith.utils import LangSmithNotFoundError 
import uuid 

client = Client()


story_prompts_data_raw = [
    {"prompt": "A lone astronaut discovers a strange, glowing plant on a desolate alien moon."},
    {"prompt": "A talking cat helps a shy librarian uncover a magical secret hidden in an old book."},
    {"prompt": "In a future where dreams can be recorded, a detective investigates a dream-heist."},
    {"prompt": "A baker finds a mysterious ingredient that makes her pastries grant wishes, but with unexpected side effects."},
    {"prompt": "Two rival knights from different kingdoms meet by chance during a rare cosmic event, forcing them to cooperate."}
]


dataset_name_to_use = "Story Prompts Dataset for Creativity"

try:
    
    my_story_dataset = client.read_dataset(dataset_name=dataset_name_to_use)
    print(f"Using existing '{dataset_name_to_use}'.")
  

except LangSmithNotFoundError:
   
    print(f"Creating new '{dataset_name_to_use}'.")
    my_story_dataset = client.create_dataset(
        dataset_name=dataset_name_to_use,
        description="Dataset for evaluating story generation creativity",
    )
    
   
    for prompt_dict in story_prompts_data_raw:
        client.create_example(
            dataset_id=my_story_dataset.id,
            inputs=prompt_dict, 
        )
    print(f"Added {len(story_prompts_data_raw)} examples to '{dataset_name_to_use}'.")
except Exception as e:
    print(f"An unexpected error occurred during dataset creation/retrieval: {e}")
    raise


dataset = my_story_dataset


print(f"Dataset ID in use: {dataset.id}")

Using existing 'Story Prompts Dataset for Creativity'.
Dataset ID in use: 80f6d3d0-3e0a-404f-8845-b073c7c16f44


In [56]:
from dotenv import load_dotenv
load_dotenv(dotenv_path="../../.env", override=True)


False