In [4]:
import fireworks.client
import os
import dotenv
import chromadb
import json
from tqdm.auto import tqdm
import pandas as pd
import random

# Pull variables from a local .env file into the process environment
# (on Colab the same variables can be provided through Colab secrets).
dotenv.load_dotenv()

# Hand the Fireworks API key to the client; the value is None when the
# FIREWORKS_API_KEY variable is not set.
fireworks.client.api_key = os.environ.get("FIREWORKS_API_KEY")

In [5]:
def get_completion(prompt, model=None, max_tokens=50, temperature=0):
    """Return the text of one completion from the Fireworks completion API.

    Args:
        prompt: Text passed to the API unchanged; no instruction tags are
            added here, so callers include them when the model needs them.
        model: Model name. A short name such as "mistral-7b-instruct-4k" is
            resolved under "accounts/fireworks/models/"; a name that already
            starts with "accounts/" is used as given. ``None`` selects
            "llama-v2-7b".
        max_tokens: Value forwarded as the ``max_tokens`` request argument.
        temperature: Value forwarded as the ``temperature`` request argument.
            The default of 0 matches the previously hard-coded value.

    Returns:
        The ``text`` attribute of the first entry in ``completion.choices``.
    """
    fw_model_dir = "accounts/fireworks/models/"

    if model is None:
        model = "llama-v2-7b"

    # Only prefix short names; a fully-qualified id must not be prefixed twice.
    if not model.startswith("accounts/"):
        model = fw_model_dir + model

    completion = fireworks.client.Completion.create(
        model=model,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    return completion.choices[0].text

In [50]:
# Smoke test: no model is given, so get_completion() uses its default model.
get_completion(prompt="Hello, my name is?")

'\nI am a 20 year old female. I am a student at the University of North Carolina at Chapel Hill. I am a member of the UNC Dance Team. I am a member of the UNC Dance Team. I am'

In [51]:
# Short model name; get_completion() prepends the Fireworks model directory.
mistral_llm = "mistral-7b-instruct-4k"

# Same kind of open-ended prompt, now sent to the Mistral instruct model.
get_completion(prompt="Hello, my name is", model=mistral_llm)


' [Your Name]. I am a [Your Profession/Occupation]. I am writing to [Purpose of Writing].\n\nI am writing to [Purpose of Writing] because [Reason for Writing]. I believe that ['

In [52]:
# Mistral instruct model, addressed by its short name.
mistral_llm = "mistral-7b-instruct-4k"

# The instruction is sent as bare text, without [INST] tags.
get_completion(prompt="Tell me 2 jokes", model=mistral_llm)

".\n1. Why don't scientists trust atoms? Because they make up everything!\n2. Did you hear about the mathematician who’s afraid of negative numbers? He will stop at nothing to avoid them."

In [13]:
# Mistral instruct model, addressed by its short name.
mistral_llm = "mistral-7b-instruct-4k"

# The instruction is wrapped in [INST]...[/INST] tags this time.
get_completion(prompt="[INST]Tell me 2 jokes[/INST]", model=mistral_llm)

" Sure, here are two jokes for you:\n\n1. Why don't scientists trust atoms? Because they make up everything!\n2. Why did the tomato turn red? Because it saw the salad dressing!"

In [54]:
# Structured-data prompt: the guest record is embedded as JSON inside an
# [INST]...[/INST] block and the model is asked for a three-sentence letter
# that uses only that record.
prompt = """[INST]
Given the following wedding guest data, write a very short 3-sentences thank you letter:

{
  "name": "John Doe",
  "relationship": "Bride's cousin",
  "hometown": "New York, NY",
  "fun_fact": "Climbed Mount Everest in 2020",
  "attending_with": "Sophia Smith",
  "bride_groom_name": "Tom and Mary"
}

Use only the data provided in the JSON object above.

The senders of the letter is the bride and groom, Tom and Mary.
[/INST]"""

# Raise the token budget to 150 for this call so the letter has room to finish.
get_completion(prompt, model=mistral_llm, max_tokens=150)

" Dear John Doe,\n\nWe, Tom and Mary, would like to extend our heartfelt gratitude for your attendance at our wedding. It was a pleasure to have you there, and we truly appreciate the effort you made to be a part of our special day.\n\nWe were thrilled to learn about your fun fact - climbing Mount Everest is an incredible accomplishment! We hope you had a safe and memorable journey.\n\nThank you again for joining us on this special occasion. We hope to stay in touch and catch up on all the amazing things you've been up to.\n\nWith love,\n\nTom and Mary"

<h1>Load Dataset of ML papers of the week</h1>


In [6]:
# Read the ML-papers-of-the-week CSV (its first row holds the column names)
# and keep only the rows that have both a title and a description.
ml_papers = (
    pd.read_csv("mlpapers-otw.csv", header=0)
    .dropna(subset=["Title", "Description"])
)

In [56]:
# Preview the first five rows of the cleaned frame.
ml_papers.head(n=5)

Unnamed: 0,Title,Description,PaperURL,TweetURL,Abstract
0,Llemma,an LLM for mathematics which is based on conti...,https://arxiv.org/abs/2310.10631,https://x.com/zhangir_azerbay/status/171409802...,"We present Llemma, a large language model for ..."
1,LLMs for Software Engineering,a comprehensive survey of LLMs for software en...,https://arxiv.org/abs/2310.03533,https://x.com/omarsar0/status/1713940983199506...,This paper provides a survey of the emerging a...
2,Self-RAG,presents a new retrieval-augmented framework t...,https://arxiv.org/abs/2310.11511,https://x.com/AkariAsai/status/171511027707796...,"Despite their remarkable capabilities, large l..."
3,Retrieval-Augmentation for Long-form Question ...,explores retrieval-augmented language models o...,https://arxiv.org/abs/2310.12150,https://x.com/omarsar0/status/1714986431859282...,We present a study of retrieval-augmented lang...
4,GenBench,presents a framework for characterizing and un...,https://www.nature.com/articles/s42256-023-007...,https://x.com/AIatMeta/status/1715041427283902...,


In [7]:
# Despite the name this is a list: one dict per paper, keyed by column name.
ml_papers_dict = ml_papers.to_dict("records")

# Show the first record to check the available fields.
ml_papers_dict[0]

{'Title': 'Llemma',
 'Description': 'an LLM for mathematics which is based on continued pretraining from Code Llama on the Proof-Pile-2 dataset; the dataset involves scientific paper, web data containing mathematics, and mathematical code; Llemma outperforms open base models and the unreleased Minerva on the MATH benchmark; the model is released, including dataset and code to replicate experiments.',
 'PaperURL': 'https://arxiv.org/abs/2310.10631',
 'TweetURL': 'https://x.com/zhangir_azerbay/status/1714098025956864031?s=20',
 'Abstract': 'We present Llemma, a large language model for mathematics. We continue pretraining Code Llama on the Proof-Pile-2, a mixture of scientific papers, web data containing mathematics, and mathematical code, yielding Llemma. On the MATH benchmark Llemma outperforms all known open base models, as well as the unreleased Minerva model suite on an equi-parameter basis. Moreover, Llemma is capable of tool use and formal theorem proving without any further finet

In [9]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer
# Sentence-transformers model shared by the embedding function below and by
# any cell that calls embedding_model.encode() directly.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to the module-level ``embedding_model``."""

    def __call__(self, input: Documents) -> Embeddings:
        """Encode ``input`` with ``embedding_model`` and return the result via ``.tolist()``."""
        return embedding_model.encode(input).tolist()


# Single instance that is passed to Chroma as embedding_function.
embed_fn = MyEmbeddingFunction()

# Open (or create) the on-disk Chroma store under ./chromadb.
client = chromadb.PersistentClient(path="./chromadb")

# Create the collection with the same embedding function that the query cell
# uses when it reopens it, so the collection is never bound to a different
# embedder at creation time than at query time.
collection = client.get_or_create_collection(
    name="ml-papers",
    embedding_function=embed_fn,
)

In [10]:
# Embed the paper titles and index them in Chroma, one batch at a time.
batch_size = 50

# Each pass embeds and stores one slice of at most batch_size papers.
for i in tqdm(range(0, len(ml_papers_dict), batch_size)):

    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch = ml_papers_dict[i : i + batch_size]

    # Replace title with "No Title" if empty string
    batch_titles = [str(paper["Title"]) or "No Title" for paper in batch]

    # Use each paper's position in ml_papers_dict as its id. The ids used to be
    # built from random numbers, so they could collide and changed on every
    # run, which made upsert() add duplicates instead of updating records.
    # NOTE(review): a store already populated with the old random ids still
    # holds those records; drop the collection once before re-indexing.
    batch_ids = [str(i + offset) for offset in range(len(batch))]

    # NOTE(review): only Title/Description are filtered for missing values, so
    # "Abstract" may be NaN here -- confirm Chroma stores that as intended.
    batch_metadata = [
        dict(url=paper["PaperURL"], abstract=paper["Abstract"]) for paper in batch
    ]

    # generate embeddings
    batch_embeddings = embedding_model.encode(batch_titles)

    # upsert to chromadb
    collection.upsert(
        ids=batch_ids,
        metadatas=batch_metadata,
        documents=batch_titles,
        embeddings=batch_embeddings.tolist(),
    )

  0%|          | 0/9 [00:00<?, ?it/s]

In [11]:
# Re-open the "ml-papers" collection with the custom embedding function attached.
collection = client.get_or_create_collection(name="ml-papers", embedding_function=embed_fn)

# Ask for the two stored titles closest to a sample query text.
retriever_results = collection.query(query_texts=["Software Engineering"], n_results=2)

# "documents" holds one list of matches per query text.
print(retriever_results["documents"])

[['LLMs for Software Engineering', 'Communicative Agents for Software Development']]


In [14]:
# Paper title for which alternative titles should be suggested.
user_query = "S3Eval: A Synthetic, Scalable, Systematic Evaluation Suite for Large Language Models"

# Retrieve the 10 stored titles closest to the paper title; they serve as
# style examples in the prompt.
results = collection.query(
    query_texts=[user_query],
    n_results=10,
)

# concatenate titles into a single string (one title per line)
short_titles = '\n'.join(results['documents'][0])

# NOTE(review): the prompt text contains typos ("based for", "PAPER_TILE").
# It is kept verbatim because the recorded output was produced with exactly
# this wording; fix the text and re-run the cell together.
prompt_template = f'''[INST]

Your main task is to generate 5 SUGGESTED_TITLES based for the PAPER_TITLE

You should mimic a similar style and length as SHORT_TITLES but PLEASE DO NOT include titles from SHORT_TITLES in the SUGGESTED_TITLES, only generate versions of the PAPER_TILE.

PAPER_TITLE: {user_query}

SHORT_TITLES: {short_titles}

SUGGESTED_TITLES:

[/INST]
'''

# get_completion() returns a single string, so it is used directly; the old
# code re-joined it character by character, which was a no-op.
responses = get_completion(prompt_template, model=mistral_llm, max_tokens=2000)
suggested_titles = responses

# Print the suggestions.
print("Model Suggestions:")
print(suggested_titles)
print("\n\n\nPrompt Template:")
print(prompt_template)

Model Suggestions:

1. S3Eval: A Comprehensive Evaluation Suite for Large Language Models
2. Synthetic and Scalable Evaluation for Large Language Models
3. Systematic Evaluation of Large Language Models with S3Eval
4. S3Eval: A Synthetic and Scalable Approach to Language Model Evaluation
5. S3Eval: A Synthetic and Scalable Evaluation Suite for Large Language Models



Prompt Template:
[INST]

Your main task is to generate 5 SUGGESTED_TITLES based for the PAPER_TITLE

You should mimic a similar style and length as SHORT_TITLES but PLEASE DO NOT include titles from SHORT_TITLES in the SUGGESTED_TITLES, only generate versions of the PAPER_TILE.

PAPER_TITLE: S3Eval: A Synthetic, Scalable, Systematic Evaluation Suite for Large Language Models

SHORT_TITLES: Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling
ChemCrow: Augmenting large-language models with chemistry tools
A Survey of Large Language Models
LLaMA: Open and Efficient Foundation Language Models
Spars