In [None]:
import getpass
import os
import pickle

import numpy as np
import pandas as pd
import replicate
from rouge import Rouge
from tqdm import tqdm
from wikipedia import wikipedia

In [None]:
# Use the Replicate token already present in the environment; prompt for it only
# when it is missing, so the secret is never written into the notebook itself.
if not os.environ.get("REPLICATE_API_TOKEN"):
    os.environ["REPLICATE_API_TOKEN"] = getpass.getpass("Replicate API token: ")

In [None]:
def retrieve_wikipedia_articles(keyword, num_articles=1):
    """Fetch the text of the top Wikipedia search hits for ``keyword``.

    Args:
        keyword: Query string passed to ``wikipedia.search``.
        num_articles: Maximum number of search results to request.

    Returns:
        A list holding the ``content`` of every hit whose page could be loaded,
        in search-result order. Hits that fail to load are reported on stdout
        and skipped.
    """
    articles = []
    for result in wikipedia.search(keyword, num_articles):
        try:
            articles.append(wikipedia.page(result).content)
        except Exception as exc:
            # Best-effort retrieval: skip this hit but say why it failed.
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so a long run can still be stopped.
            print(f"No Wikipedia page found for '{result}' ({type(exc).__name__}: {exc}).")
    return articles

In [None]:
def generate_rag_prompt(llm_keywords):
    """Build retrieval context by joining the Wikipedia articles found for the keywords.

    Args:
        llm_keywords: Search query forwarded to ``retrieve_wikipedia_articles``.

    Returns:
        The retrieved article texts joined by single spaces (an empty string
        when nothing was retrieved).
    """
    return " ".join(retrieve_wikipedia_articles(llm_keywords))

In [None]:
def calculate_rouge_scores(hypotheses, references):
    """Score ``hypotheses`` against ``references`` with a fresh ``Rouge`` scorer.

    Both arguments are passed straight to ``Rouge.get_scores`` with
    ``avg=True``, and its result is returned unchanged.
    """
    return Rouge().get_scores(hypotheses, references, avg=True)

In [None]:
# Prerequisite: download the USB benchmark so that its test.jsonl file exists at
# the relative path below. The file is read as JSON Lines (one record per line).
test = pd.read_json(
    'usb/task_datasets/all/abstractive_summarization/test.jsonl',
    lines=True,
)

In [None]:
# Character budget for each input passage, applied after its lines are joined.
MAX_INPUT_CHARS = 4096

# Iterate over the column values directly instead of looking rows up by index
# label, so this cell does not depend on `test` having a default 0..n-1 index.
# xs: space-joined input lines, truncated; ys: space-joined reference lines.
xs = [" ".join(lines)[:MAX_INPUT_CHARS] for lines in test['input_lines']]
ys = [" ".join(lines) for lines in test['output_lines']]

In [None]:
# Topic of each passage = the text before its first '.' (the whole passage if
# it contains no period).
topics = [passage.partition('.')[0] for passage in xs]

### Summarization GPT-3.5 Turbo

In [None]:
from openai import OpenAI

In [None]:
# Take the OpenAI key from the environment when available and prompt for it
# otherwise, so the secret is never stored in the notebook.
gptkey = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
client = OpenAI(api_key=gptkey)

In [None]:
# One GPT-3.5 Turbo summary per passage, in the same order as `xs`.
gpt_outputs = []
# zip keeps every passage aligned with its topic. `total` gives tqdm a known
# length so it can show percentage and ETA (an enumerate object has no len()).
for topic, x in tqdm(zip(topics, xs), total=len(xs), desc="gpt-3.5-turbo"):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": f"Summarize the following passage about {topic} in a few sentences: {x}"}],
    )
    # Keep only the text of the first returned choice.
    output = response.choices[0].message.content
    gpt_outputs.append(output)

In [None]:
# Persist the GPT summaries so later cells can be re-run without new API calls.
with open('gpt_outputs.pkl', mode='wb') as pkl_file:
    pickle.dump(gpt_outputs, pkl_file)

### Summarization Llama-2-70b-Chat

In [None]:
# One Llama-2-70b-Chat summary per passage, in the same order as `xs`.
llama_outputs = []
# zip keeps every passage aligned with its topic. `total` gives tqdm a known
# length so it can show percentage and ETA (an enumerate object has no len()).
for topic, x in tqdm(zip(topics, xs), total=len(xs), desc="llama-2-70b-chat"):
    output = replicate.run(
        "meta/llama-2-70b-chat",
        input={
            "top_k": 0,
            "top_p": 1,
            "prompt": x,
            "temperature": 0.75,
            "system_prompt": f"Summarize the following passage about {topic} in a few sentences",
            "length_penalty": 1,
            "max_new_tokens": 800,
            # The template places the system prompt inside <<SYS>> and the
            # passage after it.
            "prompt_template": "<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]",
            "presence_penalty": 0,
        },
    )
    # The result is an iterable of text pieces; join them into one string.
    output = "".join(output)
    llama_outputs.append(output)

In [None]:
# Persist the Llama summaries so later cells can be re-run without new API calls.
with open('llama_outputs.pkl', mode='wb') as pkl_file:
    pickle.dump(llama_outputs, pkl_file)

### Summarization Mixtral

In [None]:
# One Mixtral-8x7B-Instruct summary per passage, in the same order as `xs`.
mixtral_outputs = []
# zip keeps every passage aligned with its topic. `total` gives tqdm a known
# length so it can show percentage and ETA (an enumerate object has no len()).
for topic, x in tqdm(zip(topics, xs), total=len(xs), desc="mixtral-8x7b-instruct"):
    instruction = f"Summarize the following passage about {topic} in a few sentences"
    output = replicate.run(
        "mistralai/mixtral-8x7b-instruct-v0.1",
        input={
            "top_k": 0,
            "top_p": 1,
            # The template below has no {system_prompt} placeholder, so the
            # instruction is stated in the prompt itself (same wording as the
            # GPT cell) to make sure the model is actually asked to summarize.
            # NOTE(review): presumably the hosted model does not also prepend
            # `system_prompt` on its own; confirm against the model's schema.
            "prompt": f"{instruction}: {x}",
            "temperature": 0.75,
            "system_prompt": instruction,
            "length_penalty": 1,
            "max_new_tokens": 800,
            "prompt_template": "<s>[INST] {prompt} [/INST] ",
            "presence_penalty": 0,
        },
    )
    # The result is an iterable of text pieces; join them into one string.
    output = "".join(output)
    mixtral_outputs.append(output)

In [None]:
# Persist the Mixtral summaries so later cells can be re-run without new API calls.
with open('mixtral_outputs.pkl', mode='wb') as pkl_file:
    pickle.dump(mixtral_outputs, pkl_file)

### Calculate ROUGE Scores

In [None]:
# Per-example ROUGE-L recall of every model's summary against the reference.
gpt_scores, llama_scores, mixtral_scores = [], [], []
outputs_and_scores = (
    (gpt_outputs, gpt_scores),
    (llama_outputs, llama_scores),
    (mixtral_outputs, mixtral_scores),
)
for idx, reference in enumerate(ys):
    # Models are scored in a fixed order (GPT, Llama, Mixtral) for each example.
    for model_outputs, model_scores in outputs_and_scores:
        rouge = calculate_rouge_scores(model_outputs[idx], reference)
        model_scores.append(rouge['rouge-l']['r'])

In [None]:
# Mean ROUGE-L recall per model, displayed as (GPT, Llama, Mixtral).
tuple(np.mean(scores) for scores in (gpt_scores, llama_scores, mixtral_scores))

In [None]:
# Standard deviation of ROUGE-L recall per model, displayed as (GPT, Llama, Mixtral).
tuple(np.std(scores) for scores in (gpt_scores, llama_scores, mixtral_scores))