## Install Dependencies

In [6]:
!pip install langchain_community arxiv pymupdf openai langsmith langchain_core langchain_openai langchainhub wikipedia

Collecting wikipedia
  Using cached wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=1f19d52b5f714c9380355412a5bbcc3efd7f9810d9d6b75fd9d8410da33f0d86
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [7]:
import os
from langchain_community.document_loaders import ArxivLoader
from langsmith import Client
import openai
from langsmith.wrappers import wrap_openai
from langsmith import traceable
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langsmith.schemas import Example, Run
from langchain_core.pydantic_v1 import BaseModel, Field
from langsmith.evaluation import LangChainStringEvaluator, evaluate
from langchain.document_loaders import WikipediaLoader

**Set your OpenAI API key (read from the `OPENAI_API_KEY` environment variable) and your LangSmith API key here before execution.**

## Wikipedia Loader - Load movie plots

In [23]:
from langchain.document_loaders import WikipediaLoader

# Wikipedia search queries, one per title.
movies_list = [
    'Harry Potter and the philosophers stone',
    'Harry Potter and the chamber of secrets',
]

# .load() returns a list of documents (capped at one per query by
# load_max_docs=1), so flatten the per-query lists into a single list.
movies = [
    doc
    for title in movies_list
    for doc in WikipediaLoader(query=title, load_max_docs=1).load()
]

movies

[Document(page_content='Harry Potter and the Philosopher\'s Stone is a fantasy novel written by British author J. K. Rowling. The first novel in the Harry Potter series and Rowling\'s debut novel, it follows Harry Potter, a young wizard who discovers his magical heritage on his eleventh birthday, when he receives a letter of acceptance to Hogwarts School of Witchcraft and Wizardry. Harry makes close friends and a few enemies during his first year at the school and with the help of his friends, Ron Weasley and Hermione Granger, he faces an attempted comeback by the dark wizard Lord Voldemort, who killed Harry\'s parents, but failed to kill Harry when he was just 15 months old.\nThe book was first published in the United Kingdom on 26 June 1997 by Bloomsbury. It was published in the United States the following year by Scholastic Corporation under the title Harry Potter and the Sorcerer\'s Stone. It won most of the British book awards that were judged by children and other awards in the U

## Create Dataset

In [26]:
from langsmith import Client

# LangSmith client reused by the dataset cells below.
client = Client()

dataset_name = "Movies_summary_generator"

# Create the dataset that the loaded page texts are uploaded into.
dataset = client.create_dataset(
    description='Movies to summarize',
    dataset_name=dataset_name,
)

In [28]:
# Keep only the page text of each loaded document.
inputs = [doc.page_content for doc in movies]

inputs

['Harry Potter and the Philosopher\'s Stone is a fantasy novel written by British author J. K. Rowling. The first novel in the Harry Potter series and Rowling\'s debut novel, it follows Harry Potter, a young wizard who discovers his magical heritage on his eleventh birthday, when he receives a letter of acceptance to Hogwarts School of Witchcraft and Wizardry. Harry makes close friends and a few enemies during his first year at the school and with the help of his friends, Ron Weasley and Hermione Granger, he faces an attempted comeback by the dark wizard Lord Voldemort, who killed Harry\'s parents, but failed to kill Harry when he was just 15 months old.\nThe book was first published in the United Kingdom on 26 June 1997 by Bloomsbury. It was published in the United States the following year by Scholastic Corporation under the title Harry Potter and the Sorcerer\'s Stone. It won most of the British book awards that were judged by children and other awards in the US. The book reached th

In [29]:
# One example per page text, stored under the "text" key that the
# prediction functions read.
client.create_examples(
    inputs=[{"text": plot_text} for plot_text in inputs],
    dataset_id=dataset.id,
)

## Generate tweets

In [30]:
# Generate tweets from movie plots using gpt-3.5-turbo.
# System message: the formatting rules every generated tweet must follow.
system_tweet_instructions = """You are an assistant that generates Tweets to summarise movie plots.
    Ensure the summary: 1. has an engaging title. 2. Provides a bullet point list of main characters from the movie.
    3. Utilises emojis 4. includes plot twist 5. highlights in one sentence the key point of the movie.
    """

# Human message: the {movie} placeholder is filled in at invoke time.
human = "Generate tweets for the following movie: {movie}"

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_tweet_instructions),
        ("human", human),
    ]
)

def predict_tweet_gpt_3(example: dict):
  """Generate a tweet for one dataset example with gpt-3.5-turbo.

  Args:
    example: Example inputs; ``example["text"]`` is passed to the prompt as
      the ``movie`` variable.

  Returns:
    ``{"answer": <generated tweet text>}``.
  """
  llm = ChatOpenAI(model='gpt-3.5-turbo', openai_api_key=os.getenv('OPENAI_API_KEY'))
  # Uses whichever module-level `prompt` is bound when the call happens.
  chain = prompt | llm | StrOutputParser()
  tweet = chain.invoke({"movie": example["text"]})
  return {"answer": tweet}

## Evaluate Response and assign scores using GPT-4o

In [56]:
def answer_evaluator(run: Run, example: Example) -> dict:
  """Grade one generated tweet with GPT-4o and report it as a feedback score.

  Args:
    run: The prediction run; the tweet is read from ``run.outputs["answer"]``.
    example: The dataset example behind the run. Part of the evaluator
      signature; not read here.

  Returns:
    ``{"key": "summary_engagement_score", "score": <int>}``. The score is 0
    when the run carries no answer; otherwise it is the grader's integer.
  """
  # run.outputs may be None (evaluate_pairwise in this notebook guards for the
  # same case). Score a missing answer as 0 instead of failing on
  # None["answer"], and skip the grader call.
  prediction = (run.outputs or {}).get("answer")
  if prediction is None:
    return {"key": "summary_engagement_score", "score": 0}

  # Schema the grader must fill in. Kept free of a docstring so the schema
  # handed to the model is unchanged.
  class GradeSummary(BaseModel):
    score: int = Field(description = "Answer meets criteria, score from 0 to 5")

  llm = ChatOpenAI(model = 'gpt-4o', temperature=0)
  structured_llm_grader = llm.with_structured_output(GradeSummary)

  system = """
  You are grading tweets for movie plot summary. Ensure that the Assistant's answer is engaging and meets the criteria,
  Ensure the summary: 1. has an engaging title. 2. Provides a bullet point list of main characters from the movie.
  3. Utilises emojis 4. includes plot twist 5. highlights the feedback 6. Includes hashtag
  """

  grade_prompt = ChatPromptTemplate.from_messages(
      [
          ("system", system),
          ("human", "Assistant's answer for the movie summary: {prediction}")
      ]
  )

  answer_grader = grade_prompt | structured_llm_grader
  score = answer_grader.invoke({"prediction": prediction})
  return {"key": "summary_engagement_score", "score":int(score.score)}

## Track the results in LangSmith

In [None]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate
dataset_name = "Movies_summary_generator"

# Evaluate predict_tweet_gpt_3 on the dataset, scoring each output with
# answer_evaluator.
experiment_results = evaluate(
    predict_tweet_gpt_3,
    data=dataset_name,
    evaluators=[answer_evaluator],
    metadata={"variant": "movie summary tweet, gpt-3-turbo"},
    experiment_prefix="summary-gpt3-turbo",
)

## Pairwise evaluation

In [52]:
# Generate tweets with GPT-3.5 and GPT-4.
# System message: the formatting rules shared by both models in the comparison.
system_tweet_instructions = """You are an assistant that generates Tweets to summarise movie plots.
    Ensure the summary: 1. has an engaging title. 2. Provides a bullet point list of main characters and the role of each character in the movie.
    3. Narrates the entire story in 3 lines 4. Utilises emojis 5. includes plot twist 6. Highlights one or two feedback about the movie
    """

# Human message: the {movie} placeholder is filled in at invoke time.
human = "Generate tweets for the following movie: {movie}"

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_tweet_instructions),
        ("human", human),
    ]
)

def _predict_tweet(example: dict, model: str) -> dict:
  """Run the shared tweet prompt against ``model`` for one dataset example.

  Args:
    example: Example inputs; ``example["text"]`` fills the prompt's ``movie``
      variable.
    model: OpenAI chat model name passed to ``ChatOpenAI``.

  Returns:
    ``{"answer": <generated tweet text>}``.
  """
  chat = ChatOpenAI(openai_api_key = os.getenv('OPENAI_API_KEY'), model = model)
  # Uses whichever module-level `prompt` is bound when the call happens.
  tweet_generator = prompt | chat | StrOutputParser()
  response = tweet_generator.invoke({"movie":example["text"]})
  return {"answer": response}


def predict_tweet_gpt_3(example: dict):
  """Generate a tweet for ``example["text"]`` with gpt-3.5-turbo."""
  return _predict_tweet(example, 'gpt-3.5-turbo')


def predict_tweet_gpt_4(example: dict):
  """Generate a tweet for ``example["text"]`` with gpt-4-turbo."""
  return _predict_tweet(example, 'gpt-4-turbo')

In [None]:
# Add experiment for both the models
from langsmith.evaluation import LangChainStringEvaluator, evaluate
dataset_name = "Movies_summary_generator"

# Same dataset and grader for both models; only the target function and the
# experiment labels differ. After the loop, experiment_results holds the
# result of the last (GPT-4) experiment.
for target, prefix, variant in (
    (predict_tweet_gpt_3, "summary-gpt3-turbo", "movie summary tweet, gpt-3-turbo"),
    (predict_tweet_gpt_4, "summary-gpt4-turbo", "movie summary tweet, gpt-4-turbo"),
):
    experiment_results = evaluate(
        target,
        data=dataset_name,
        evaluators=[answer_evaluator],
        experiment_prefix=prefix,
        metadata={"variant": variant},
    )

In [None]:
# Pairwise evaluation between GPT-3.5 and GPT-4 using GPT-4o
from langsmith.evaluation import evaluate_comparative
from langchain import hub
from langchain_openai import ChatOpenAI
from langsmith.schemas import Run, Example
import json


def _parse_judge_scores(raw: str) -> dict:
    """Extract the JSON object from the judge's reply without executing it.

    Parses only the span from the first "{" to the last "}", so text around
    the object (for example a Markdown code fence) is tolerated.

    Raises:
        ValueError: If the reply contains no JSON object or the object is
            malformed (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"Judge reply contains no JSON object: {raw!r}")
    return json.loads(raw[start:end + 1])


def evaluate_pairwise(runs: list[Run], example: Example):
    """Score two runs on the same example against each other using GPT-4o.

    Args:
        runs: The two runs to compare; ``runs[0]`` is shown to the judge as
            Assistant 1 and ``runs[1]`` as Assistant 2. A run without outputs
            is presented as "N/A".
        example: The dataset example; ``example.inputs["text"]`` is shown to
            the judge as the user question.

    Returns:
        ``{"key": "ranked_preference", "scores": {run_id: score}}`` with one
        entry per run.

    Raises:
        ValueError: If the judge's reply does not contain a valid JSON object.
        KeyError: If the JSON object lacks one of the expected score keys.
    """
    scores = {}

    llm = ChatOpenAI(model = 'gpt-4o', temperature=0)

    system = """
    Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below.
    You should score both the assistants that
    1. has an engaging title.
    2. Provides a bullet point list of main characters from the movie.
    3. Utilises emojis
    4. includes plot twist
    5. highlights the feedback
    Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision.
    Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
    Your output should strictly be on a scale of 1 to 5 for both the assistants with the keys "Score for Assistant 1" for Assistant 1
    and "Score for Assistant 2" for Assistant 2 and your output should be in JSON format. Your output should not include any explanations.
    """

    # The answers are labelled 1 and 2 so they match the JSON keys requested
    # in the system message.
    grade_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system),
            ("human", "[User Question] {question} \n[The Start of Assistant 1's Answer] {answer_a} [The End of Assistant 1's Answer]\n[The Start of Assistant 2's Answer] {answer_b} [The End of Assistant 2's Answer]")
            ])

    runnable = grade_prompt | llm

    response = runnable.invoke({
        "question": example.inputs["text"],
        "answer_a": runs[0].outputs["answer"] if runs[0].outputs is not None else "N/A",
        "answer_b": runs[1].outputs["answer"] if runs[1].outputs is not None else "N/A",
    })

    # Parse the reply as data; never eval() model output.
    json_response = _parse_judge_scores(str(response.content))
    scores[runs[0].id] = json_response["Score for Assistant 1"]
    scores[runs[1].id] = json_response["Score for Assistant 2"]

    return {"key": "ranked_preference", "scores": scores}


# Replace these with the names or IDs of your own experiments.
experiments_to_compare = [
    "summary-gpt3-turbo-b0e727f2",
    "summary-gpt4-turbo-a19504ed",
]

evaluate_comparative(experiments_to_compare, evaluators=[evaluate_pairwise])