In [1]:
from langsmith import evaluate, Client
from langsmith.schemas import Example, Run

# Step 1: connect to LangSmith and clone the shared public dataset that the
# evaluate() calls in this notebook pass as `data=dataset.name`.
PUBLIC_DATASET_URL = "https://smith.langchain.com/public/728e92ee-b050-4284-93b8-45682ad008f2/d"

client = Client()
dataset = client.clone_public_dataset(PUBLIC_DATASET_URL)

In [2]:
def is_concise_enough(root_run: Run, example: Example) -> dict:
    """Score 1 when the run's output is shorter than three times the reference.

    Args:
        root_run: The traced run; its text is read from ``outputs["output"]``.
        example: The dataset example; the reference is read from
            ``outputs["output"]``.

    Returns:
        ``{"key": "is_concise", "score": 0 or 1}``. The score is 0 whenever
        either text is missing or empty.
    """
    # `outputs` can be None (e.g. a run that failed before producing output),
    # so fall back to an empty dict instead of raising AttributeError.
    root_output = (root_run.outputs or {}).get("output", "")
    example_answer = (example.outputs or {}).get("output", "")

    both_present = bool(root_output and example_answer)
    score = both_present and len(root_output) < 3 * len(example_answer)
    return {"key": "is_concise", "score": int(score)}

In [3]:
def generate_output(x: dict) -> str:
    """Baseline target: echo the last message's content plus a fixed disclaimer.

    Args:
        x: Example inputs; ``x["messages"]`` must be a non-empty sequence whose
            last element is a dict. Its ``"content"`` value is used, or
            ``"No query provided"`` when that key is absent.

    Returns:
        The content followed by " is a good question. I don't know the answer."
    """
    return x["messages"][-1].get("content", "No query provided") + " is a good question. I don't know the answer."


# Run the baseline over the dataset, scored by `is_concise_enough`. The call is
# kept as the cell's last expression so the notebook displays its result.
evaluate(
    generate_output,
    data=dataset.name,
    evaluators=[is_concise_enough],
    experiment_prefix="my first experiment"
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'my first experiment-2c9077a4' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=7ab8f2a1-9e49-4adf-bb0a-f10989b08c24




7it [00:01,  4.89it/s]


Unnamed: 0,inputs.messages,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,"[{'role': 'system', 'content': 'You are a help...",Recommendations for enterprises to accelerate ...,,There are five recommendations for enterprises...,1,0.016797,422c974f-4dc3-47be-8c33-118ab9ee00b5,a6584037-638e-48d5-bb86-cd9840003c4a
1,"[{'role': 'system', 'content': 'You are a help...",what are the Top Emerging Gen AI Roles in CY 2...,,Here are the Top Emerging Gen AI Roles in CY 2...,1,0.016797,527abb15-f47d-4cb0-98bf-ac40b2aabe42,73f29538-2f43-4123-9432-239b3b852331
2,"[{'role': 'system', 'content': 'You are a help...",How are companies preparing thnd technology fo...,,"Unfortunately, the provided information does n...",1,0.016797,d9c5e7ba-bd3c-4a6a-9896-2f6036d141a8,d2031cf6-c0de-4572-b310-8a351cf43030
3,"[{'role': 'system', 'content': 'You are a help...",The survey provides data on the budget allocat...,,"Yes, the survey provides data on the budget al...",1,0.016797,f3ab39ba-5e76-44df-8243-8ae24db06263,69dce796-ec5e-4e57-aaf1-d56be43380bc
4,"[{'role': 'system', 'content': 'You are a help...",what are the Top Emerging Gen AI Roles in CY 2...,,"According to the data, the Top Emerging Gen AI...",1,0.016797,4eb0805f-c95d-4bcc-9112-b2475b0aeda4,184fe364-3d90-4908-a586-fb75a8f4c4f8
5,"[{'role': 'system', 'content': 'You are a help...",Recommendations for enterprises to accelerate ...,,To accelerate digital adoption in the AI and s...,1,0.016797,ed63e96c-a5c6-455f-ad78-81d737ccd398,5cfbe2eb-a062-41b5-b644-dce0731ceb7a
6,"[{'role': 'system', 'content': 'You are a help...",what are the Recommendations for enterprises t...,,According to the Avasant and nasscom Digital E...,1,0.016797,60a85602-072a-4a7b-88f8-a3ca2b2a9bc7,d709e558-985d-415b-bce8-5ce0210512c9


## Ollama(llama3.2)

In [32]:
import ollama

def call_ollama(messages, model: str):
    """Send ``messages`` to an Ollama chat model and return the full reply text.

    The chat is requested with ``stream=True``; the ``["message"]["content"]``
    piece of every streamed chunk is concatenated into a single string.

    Args:
        messages: Chat messages, forwarded unchanged to ``ollama.chat``.
        model: Name of the Ollama model to query.

    Returns:
        The concatenated reply, or ``""`` if the stream yields no chunks.
    """
    chunks = ollama.chat(messages=messages, model=model, stream=True)
    return "".join(chunk["message"]["content"] for chunk in chunks)

In [33]:
def my_app(inputs: dict) -> dict:
    """Answer ``inputs["messages"]`` with the llama3.2 model via ``call_ollama``.

    Args:
        inputs: Must contain ``"messages"``: either a single question string
            or a list of chat-message dicts. A fixed system message is placed
            in front of the conversation in both cases.

    Returns:
        ``{"answer": <model reply>}``.

    Raises:
        ValueError: If ``inputs["messages"]`` is neither a ``str`` nor a ``list``.
    """
    payload = inputs["messages"]
    system_turn = {"role": "system", "content": "Answer user questions about this context: \n\n\n"}

    if isinstance(payload, str):
        # A bare question becomes a single user turn.
        conversation = [system_turn, {"role": "user", "content": payload}]
    elif isinstance(payload, list):
        # An existing conversation is passed through after the system turn.
        conversation = [system_turn, *payload]
    else:
        raise ValueError("Unexpected format for 'messages'")

    return {"answer": call_ollama(conversation, model="llama3.2")}

In [34]:
# Smoke-test my_app with a plain-string question under the "messages" key
sample_input = {
    "messages": "What are the main differences in training efficiency between graphic card and ram"
}
result = my_app(sample_input)
print(result)

{'answer': "The main differences in training efficiency between a graphics card (GPU) and Random Access Memory (RAM) depend on the type of task, specifically deep learning model training. Here's a summary:\n\n**Graphics Card (GPU):**\n\n1. **Parallel processing**: GPUs are designed for parallel processing, making them well-suited for matrix multiplications, convolutional operations, and other compute-intensive tasks commonly used in deep learning.\n2. **Massive parallelism**: With hundreds or thousands of cores, GPUs can process vast amounts of data simultaneously, reducing the training time for large models.\n3. **High throughput**: GPUs can handle high-throughput data transfer rates, which is essential for efficient model training.\n\nHowever, there are some limitations:\n\n1. **Memory bandwidth**: GPUs have limited memory bandwidth, which can bottleneck memory-intensive tasks like batch normalization and data loading.\n2. **Cache hierarchy**: GPUs often rely on slower cache hierarch

In [35]:
from langsmith.schemas import Run, Example
def is_answered(run: Run, example: Example) -> dict:
    """Score 1 when the run produced a non-empty ``"answer"`` output, else 0.

    Args:
        run: The traced run; its text is read from ``outputs["answer"]``.
        example: The dataset example (not used by this check).

    Returns:
        ``{"key": "is_answered", "score": 0 or 1}``.
    """
    # `outputs` can be None (e.g. a run that failed before producing output);
    # treat that as "not answered" instead of raising AttributeError.
    outputs = run.outputs or {}
    student_answer = outputs.get("answer")
    return {"key": "is_answered", "score": 1 if student_answer else 0}
# Evaluate my_app on the dataset, scoring every run with is_answered.
qa_evalulator = [is_answered]
experiment_results = evaluate(
    my_app,
    experiment_prefix="llama3.2-Evaluations",
    evaluators=qa_evalulator,
    data=dataset.name,
)
print(experiment_results)

View the evaluation results for experiment: 'llama3.2-Evaluations-d8c0816f' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=96df9d1f-96be-41a4-8410-cbfbce87449d




2it [01:51, 55.59s/it]

<ExperimentResults llama3.2-Evaluations-d8c0816f>





## Evaluation using Groq (llama3-8b-8192)

In [13]:
from groq import Groq

import os

# Initialize the Groq client. The API key is read from the GROQ_API_KEY
# environment variable so that no credential is stored in the notebook; a
# missing variable fails fast with a KeyError that names it.
# NOTE(security): a literal key was previously embedded on this line. Treat
# that key as compromised and revoke it.
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Few-shot prompt template: an instruction paragraph, ten worked
# question/answer examples, and a final slot for the question to be answered.
# `{question}` is the only brace placeholder, so the template can be filled
# with few_shot_prompt.format(question=...).
# NOTE(review): no visible cell references this template (the Groq `my_app`
# sends its own system message); confirm whether it was meant to be wired in.
few_shot_prompt = """
You are an AI assistant that answers questions about digital transformation, industry trends, and strategic technology roles. Answer the following questions concisely in 2-3 sentences based on the provided context. Provide direct answers based on the examples without adding unnecessary prefaces.

### Example 1
**Question**: How are companies adapting their digital strategies to sustain growth amid fluctuating tech budgets?
**Answer**: Organizations are prioritizing automation and data insights, reallocating resources to maintain growth despite budget cuts.

### Example 2
**Question**: Which industries have shown significant advancement in adopting automation technologies recently?
**Answer**: Manufacturing and energy sectors lead in automation, boosting efficiency and cutting costs.

### Example 3
**Question**: How is the growing role of AI specialists transforming business decision-making?
**Answer**: AI specialists enable real-time, data-driven decisions, helping companies stay responsive to market needs.

### Example 4
**Question**: What role does vendor consolidation play in modernizing digital infrastructure?
**Answer**: Vendor consolidation simplifies tech management, allowing companies to adopt innovations faster and save costs.

### Example 5
**Question**: How is Generative AI influencing personalized customer experiences?
**Answer**: Generative AI personalizes experiences by analyzing customer data, widely adopted in retail to increase engagement.

### Example 6
**Question**: In what ways are digital transformation initiatives impacting the financial services industry?
**Answer**: Financial services focus on digital channels and automation, leading to improved customer satisfaction and regulatory response.

### Example 7
**Question**: Why are Chief Technology Officers focusing more on digital maturity in organizations?
**Answer**: CTOs emphasize digital maturity to use data, AI, and cloud effectively, aligning digital strategy with business goals.

### Example 8
**Question**: How are educational institutions supporting the need for digital talent in tech-focused sectors?
**Answer**: Universities partner with tech firms to develop programs that address skill gaps in AI, data science, and cybersecurity.

### Example 9
**Question**: What is the importance of AI ethics in the deployment of enterprise AI solutions?
**Answer**: AI ethics ensures fair, transparent solutions aligned with social values, especially critical in healthcare and finance.

### Example 10
**Question**: How is digital spending expected to evolve in response to sustainability goals?
**Answer**: Organizations are investing in sustainable tech like IoT and AI for energy management, reducing costs and supporting environmental goals.

### Question
**Question**: {question}
**Answer**:
"""

In [14]:
from groq import Groq

import os

# Initialize the Groq client used by `my_app` below. The API key comes from
# the GROQ_API_KEY environment variable rather than a literal in the notebook;
# a missing variable raises a KeyError that names it.
# NOTE(security): a literal key was previously embedded on this line. Treat
# that key as compromised and revoke it.
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def my_app(inputs: dict) -> dict:
    """Answer ``inputs["messages"]`` with the ``llama3-8b-8192`` model via Groq.

    Args:
        inputs: Must contain ``"messages"``: either a single question string
            or a list of chat-message dicts. A fixed system message is placed
            in front of the conversation in both cases.

    Returns:
        ``{"answer": <model reply>}`` on success. Any exception raised while
        calling the API or reading its response is caught and reported as
        ``{"error": <description>}`` instead of propagating.

    Raises:
        ValueError: If ``inputs["messages"]`` is neither a ``str`` nor a ``list``
            (raised before the API call, so it is not converted to an error dict).
    """
    payload = inputs["messages"]
    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant for document-based queries. \n\n\n",
    }

    if isinstance(payload, str):
        conversation = [system_turn, {"role": "user", "content": payload}]
    elif isinstance(payload, list):
        conversation = [system_turn, *payload]
    else:
        raise ValueError("Unexpected format for 'messages'")

    try:
        completion = groq_client.chat.completions.create(
            messages=conversation,
            model="llama3-8b-8192",
            max_tokens=1024,
            temperature=0.7,
        )
        reply = completion.choices[0].message.content
    except Exception as exc:
        return {"error": f"An error occurred while generating the response: {exc!s}"}
    return {"answer": reply}

In [15]:
# Example usage: ask my_app a single free-text question
sample_question = "What sectors showed the most improvement in digital maturity in 2023?"
answer = my_app({"messages": sample_question})
print(answer)

{'answer': 'According to a recent report by the Digital McKinsey Global Survey, the sectors that showed the most improvement in digital maturity in 2023 are:\n\n1. Healthcare: The healthcare sector has made significant strides in digital maturity, with 64% of respondents reporting improvement in digital capabilities, particularly in areas such as patient engagement, data analytics, and telemedicine.\n2. Financial Services: The financial services sector has also seen significant improvement in digital maturity, with 58% of respondents reporting improvement in areas such as customer onboarding, payment processing, and digital lending.\n3. Retail and Consumer Goods: The retail and consumer goods sector has made notable progress in digital maturity, with 55% of respondents reporting improvement in areas such as e-commerce, supply chain management, and customer experience.\n4. Manufacturing: The manufacturing sector has also shown improvement in digital maturity, with 52% of respondents rep

In [17]:
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

# Correctness metric
def correctness_metric(run: Run, example: Example) -> dict:
    """Score 1 when the app's answer exactly equals the reference answer, else 0.

    The ``my_app`` functions in this notebook return their text under
    ``"answer"``, so that key is read first; ``"output"`` is kept as a fallback
    for targets whose text is stored under that key. The reference is read from
    the example's ``"output"``.

    Args:
        run: The traced run of the app under evaluation.
        example: The dataset example holding the reference answer.

    Returns:
        ``{"key": "correctness", "score": 0 or 1}``. A run with no answer
        scores 0 even when the reference is also missing.
    """
    # `outputs` can be None on either object; use empty dicts instead of raising.
    run_outputs = run.outputs or {}
    reference_outputs = example.outputs or {}

    student_answer = run_outputs.get("answer")
    if student_answer is None:
        student_answer = run_outputs.get("output")
    correct_answer = reference_outputs.get("output")

    # Require an actual answer so that "None == None" is never counted as correct.
    is_match = student_answer is not None and student_answer == correct_answer
    return {"key": "correctness", "score": int(is_match)}


# Run my_app over the dataset, scoring each run with correctness_metric
qa_evaluators = [correctness_metric]
experiment_results = evaluate(
    my_app,
    data=dataset.name,
    evaluators=qa_evaluators,
    metadata={"variant": "stuff website context"},
    experiment_prefix="Groq-evaluations",
)
print("Evaluation Results:", experiment_results)

View the evaluation results for experiment: 'Groq-evaluations-088f70b2' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=db729d62-efce-49be-8a6b-35ff097429c7




7it [00:01,  4.50it/s]

Evaluation Results: <ExperimentResults Groq-evaluations-088f70b2>





In [16]:
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

# Faithfulness metric
def faithfulness(root_run: Run, example: Example) -> dict:
    """Crude lexical check: does every word of the answer occur in the prompt text?

    The prompt text is the run's ``"context"`` input when present; otherwise it
    is built from ``inputs["messages"]`` (the string itself, or the joined
    ``"content"`` of each message dict). The answer is read from
    ``outputs["answer"]``, falling back to ``outputs["output"]``.

    Matching is case-insensitive and substring-based (a word counts as present
    if it appears anywhere in the prompt text), so this is a rough heuristic.

    Args:
        root_run: The traced run of the app under evaluation.
        example: The dataset example (not used by this check).

    Returns:
        ``{"key": "faithfulness", "score": 0 or 1}``. An empty or missing
        answer scores 0.
    """
    run_inputs = root_run.inputs or {}
    run_outputs = root_run.outputs or {}

    context = run_inputs.get("context")
    if not context:
        # No explicit context was supplied: use the text the app was prompted with.
        messages = run_inputs.get("messages") or ""
        if isinstance(messages, str):
            context = messages
        elif isinstance(messages, (list, tuple)):
            context = " ".join(
                str(message.get("content", ""))
                for message in messages
                if isinstance(message, dict)
            )
        else:
            context = ""

    answer = run_outputs.get("answer") or run_outputs.get("output") or ""

    root_context = str(context).strip().lower()
    answer_words = str(answer).strip().lower().split()

    # all() over an empty sequence is True, so an empty answer must be rejected
    # explicitly or every failed run would be scored as perfectly faithful.
    is_faithful = bool(answer_words) and all(word in root_context for word in answer_words)
    return {"key": "faithfulness", "score": int(is_faithful)}

def is_answered(run: Run, example: Example) -> dict:
    """Report whether the run returned any text under ``outputs["answer"]``.

    Args:
        run: The traced run of the app under evaluation.
        example: The dataset example (not used by this check).

    Returns:
        ``{"key": "is_answered", "score": 1}`` for a non-empty answer,
        otherwise ``{"key": "is_answered", "score": 0}``.
    """
    # A run with no outputs at all (`outputs is None`) counts as unanswered
    # rather than crashing the evaluator with AttributeError.
    student_answer = (run.outputs or {}).get("answer")
    if not student_answer:
        return {"key": "is_answered", "score": 0}
    return {"key": "is_answered", "score": 1}

# Evaluators applied to every run in this experiment
qa_evaluators = [faithfulness, is_answered]

# Run my_app over the dataset and score each run with the evaluators above
experiment_results = evaluate(
    my_app,
    experiment_prefix="Groq-evaluations",
    evaluators=qa_evaluators,
    data=dataset.name,
)

print("Evaluation Results:", experiment_results)


View the evaluation results for experiment: 'Groq-evaluations-d2863b81' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=f82385cf-b518-47a5-ad57-c1174427fd8f




7it [00:01,  4.10it/s]

Evaluation Results: <ExperimentResults Groq-evaluations-d2863b81>



