In [1]:
from langsmith import Client
from langsmith.schemas import Run
from typing import List, Dict, Any
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

def analyze_langsmith_experiments(dataset_name: str, target_experiment_prefix: str):
    """
    Analyzes LangSmith experiments for a given dataset and target experiment prefix.

    Args:
        dataset_name (str): The name of the LangSmith dataset.
        target_experiment_prefix (str): The prefix of the experiment to analyze in detail.
    """
    client = Client()

    # Listing Experiments 
    print("--- Listing Recent Experiments ---")
    try:
        experiments_project_name = client.read_project(project_name=dataset_name).name
    except Exception as e:
        print(f"Error reading project '{dataset_name}': {e}")
        print("Please ensure the dataset name is correct and you have access.")
        return

    experiments = client.list_runs(
        project_name=experiments_project_name,
        run_type="experiment"
    )

    experiments_list = list(experiments)
    print(f"Found {len(experiments_list)} experiments in project '{experiments_project_name}':")
    for i, exp in enumerate(experiments_list[:5]):
        print(f"  {i+1}. Name: {exp.name}, ID: {exp.id}, Created: {exp.created_at}")
        if exp.metadata:
            print(f"     Metadata: {exp.metadata}")
    print("-" * 30)

    # Fetching Runs for a Specific Experiment 
    target_experiment_run = None
    for exp in experiments_list:
        if exp.name.startswith(target_experiment_prefix):
            target_experiment_run = exp
            break

    if target_experiment_run:
        print(f"\n--- Analyzing Experiment: '{target_experiment_run.name}' (ID: {target_experiment_run.id}) ---")

        experiment_runs = client.list_runs(
            project_name=experiments_project_name,
            parent_run_id=target_experiment_run.id,
        )

        all_run_data = []
        print(f"Fetching evaluation results for {target_experiment_run.name}...")
        for run in experiment_runs:
            scores = run.feedback
            inputs = run.inputs
            outputs = run.outputs

            run_info = {
                "run_id": run.id,
                "example_id": run.reference_example_id,
                "question": inputs.get("question") if inputs else "N/A",
                "model_output": outputs.get("output") if outputs else "N/A",
                "status": run.status,
                "start_time": run.start_time,
                "end_time": run.end_time,
                "duration_ms": (run.end_time - run.start_time).total_seconds() * 1000 if run.start_time and run.end_time else None
            }

            for score in scores:
                run_info[f"score_{score.key}"] = score.score
                if score.comment:
                    run_info[f"comment_{score.key}"] = score.comment
            all_run_data.append(run_info)

        if not all_run_data:
            print(f"No individual runs found for experiment '{target_experiment_run.name}'. "
                  "Please ensure the experiment ran successfully and generated evaluation runs.")
        else:
            df = pd.DataFrame(all_run_data)
            print("\n--- Individual Run Review (First 5 Rows) ---")
            print(df.head())

            print("\n--- Summary Metrics ---")
            conciseness_scores_col = f"score_is_concise"
            if conciseness_scores_col in df.columns:
                conciseness_scores = df[conciseness_scores_col].mean()
                print(f"Average Conciseness Score: {conciseness_scores:.2f}")
            else:
                print(f"'{conciseness_scores_col}' not found in scores. Skipping conciseness analysis.")

            avg_duration = df["duration_ms"].mean()
            print(f"Average Run Duration: {avg_duration:.2f} ms")

            if conciseness_scores_col in df.columns:
                print("\n--- Visualizing Conciseness Scores ---")
                plt.figure(figsize=(8, 5))
                sns.countplot(x=conciseness_scores_col, data=df)
                plt.title(f"Conciseness Scores for Experiment: {target_experiment_run.name}")
                plt.xlabel("Is Concise (0=No, 1=Yes)")
                plt.ylabel("Number of Runs")
                plt.xticks([0, 1])
                plt.show() 
            
            print("\n--- Visualizing Run Duration Distribution ---")
            plt.figure(figsize=(10, 6))
            sns.histplot(df["duration_ms"].dropna(), kde=True)
            plt.title(f"Distribution of Run Durations for Experiment: {target_experiment_run.name}")
            plt.xlabel("Duration (ms)")
            plt.ylabel("Frequency")
            plt.show() 

            print("\n--- Debugging and Error Analysis (Example) ---")
            if conciseness_scores_col in df.columns:
                not_concise_runs = df[df[conciseness_scores_col] == 0]
                if not not_concise_runs.empty:
                    print(f"\nExamples that were NOT concise (first 2):")
                    for i, row in not_concise_runs.head(2).iterrows():
                        print(f"  Run ID: {row['run_id']}")
                        print(f"  Example ID: {row['example_id']}")
                        print(f"  Question: {row['question']}")
                        print(f"  Model Output: {row['model_output']}")
                        comment_col = f"comment_is_concise"
                        if comment_col in row and pd.notna(row[comment_col]):
                            print(f"  Conciseness Comment: {row[comment_col]}")
                        print("-" * 20)
                else:
                    print("\nAll runs were concise, no non-concise examples found for debugging.")
            else:
                print(f"Cannot perform conciseness debugging as '{conciseness_scores_col}' was not found.")

    else:
        print(f"Experiment with prefix '{target_experiment_prefix}' not found. "
              "Please check your experiment prefixes or ensure experiments have run.")

def compare_langsmith_experiments(dataset_name: str, experiment_prefixes_to_compare: List[str]):
    """
    Compares multiple LangSmith experiments based on conciseness and duration.

    Args:
        dataset_name (str): The name of the LangSmith dataset.
        experiment_prefixes_to_compare (List[str]): A list of experiment prefixes to compare.
    """
    client = Client()
    print("\n--- Comparing Multiple Experiments ---")

    try:
        experiments_project_name = client.read_project(project_name=dataset_name).name
    except Exception as e:
        print(f"Error reading project '{dataset_name}': {e}")
        print("Please ensure the dataset name is correct and you have access.")
        return

    experiments = client.list_runs(
        project_name=experiments_project_name,
        run_type="experiment"
    )
    experiments_list = list(experiments)

    comparison_data = []

    for prefix in experiment_prefixes_to_compare:
        exp_run = None
        for exp in experiments_list:
            if exp.name.startswith(prefix):
                exp_run = exp
                break

        if exp_run:
            print(f"Collecting data for experiment: {exp_run.name}")
            child_runs = client.list_runs(
                project_name=experiments_project_name,
                parent_run_id=exp_run.id
            )
            current_exp_scores = []
            current_exp_durations = []
            for run in child_runs:
                for score in run.feedback:
                    if score.key == "is_concise":
                        current_exp_scores.append(score.score)
                if run.start_time and run.end_time:
                    current_exp_durations.append((run.end_time - run.start_time).total_seconds() * 1000)

            if current_exp_scores:
                comparison_data.append({
                    "Experiment": exp_run.name,
                    "Average Conciseness": sum(current_exp_scores) / len(current_exp_scores),
                    "Average Duration (ms)": sum(current_exp_durations) / len(current_exp_durations) if current_exp_durations else None,
                    "Number of Runs": len(current_exp_scores)
                })
        else:
            print(f"Experiment with prefix '{prefix}' not found.")

    if comparison_data:
        comparison_df = pd.DataFrame(comparison_data)
        print("\n--- Experiment Comparison Summary ---")
        print(comparison_df.set_index("Experiment"))

        if len(comparison_data) > 1:
            plt.figure(figsize=(12, 6))

            plt.subplot(1, 2, 1)
            sns.barplot(x="Experiment", y="Average Conciseness", data=comparison_df)
            plt.title("Average Conciseness Score by Experiment")
            plt.ylabel("Score (0-1)")
            plt.ylim(0, 1)

            plt.subplot(1, 2, 2)
            sns.barplot(x="Experiment", y="Average Duration (ms)", data=comparison_df)
            plt.title("Average Run Duration by Experiment")
            plt.ylabel("Duration (ms)")

            plt.tight_layout()
            plt.show() 
    else:
        print("No data collected for comparison.")

if __name__ == "__main__":

    dataset_name = "RAG Application Golden Dataset - Groq Test" 

    target_experiment_prefix = "groq-deepseek" 
    analyze_langsmith_experiments(dataset_name, target_experiment_prefix)

   
    experiment_prefixes_to_compare = ["groq-deepseek", "groq-deepseek-alternate"] 
    compare_langsmith_experiments(dataset_name, experiment_prefixes_to_compare)

--- Listing Recent Experiments ---
Error reading project 'RAG Application Golden Dataset - Groq Test': Project RAG Application Golden Dataset - Groq Test not found
Please ensure the dataset name is correct and you have access.

--- Comparing Multiple Experiments ---
Error reading project 'RAG Application Golden Dataset - Groq Test': Project RAG Application Golden Dataset - Groq Test not found
Please ensure the dataset name is correct and you have access.
