In [2]:
import faiss
import cohere
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Indexing os.environ raises KeyError when the variable is unset, so a missing
# key surfaces here rather than at the first API call.
api_key = os.environ["COHERE_API_KEY"]
# Shared Cohere client; later cells use it for co.embed and co.chat.
co = cohere.Client(api_key)

In [3]:
# Raw passage about the film Interstellar; it serves as the toy corpus below.
text = """
Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.
Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007.
Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm.
Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles.
Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects.

Interstellar premiered on October 26, 2014, in Los Angeles.
In the United States, it was first released on film stock, expanding to venues using digital projectors.
The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014.
It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight.
It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics. Since its premiere, Interstellar gained a cult following,[5] and now is regarded by many sci-fi experts as one of the best science-fiction films of all time.
Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades"""
# Naive sentence segmentation: cut on every period, then trim the surrounding
# whitespace (including the newlines that separate lines and paragraphs).
texts = [sentence.strip() for sentence in text.split(".")]
texts

['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan',
 'It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine',
 'Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind',
 'Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007',
 'Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar',
 'Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm',
 'Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles',
 'Interstellar uses extensive practical a

In [4]:
# call api to get embeddings

# The corpus is embedded with input_type="search_document"; queries are
# embedded separately with "search_query" inside search().
response = co.embed(texts=texts, input_type="search_document").embeddings
# 2-D array: one row per sentence, one column per embedding dimension.
embeds = np.array(response)
embeds.shape

(15, 4096)

In [5]:
# build index
# The index is sized by the embedding dimension (second axis of `embeds`).
index = faiss.IndexFlatL2(embeds.shape[1])
# Show the index's `is_trained` flag before any vectors are added.
print(index.is_trained)

True


In [6]:
# Empty the index first so this cell is idempotent: re-running it would
# otherwise append the same vectors again, and faiss would then return ids
# beyond the end of the sentence array used by search().
index.reset()
# faiss expects float32 input, hence the explicit cast.
index.add(np.float32(embeds))

In [7]:
# NumPy copy of the sentence list so search() can index it with the id array
# returned by the faiss index.
texts_np = np.array(texts)

In [8]:
def search(query: str, k: int = 3) -> pd.DataFrame:
    """Return the sentences nearest to ``query`` in embedding space.

    The query is embedded with the Cohere client (``input_type="search_query"``)
    and looked up in the module-level faiss ``index``.

    Args:
        query: Free-text query.
        k: Maximum number of neighbours to return.

    Returns:
        A DataFrame with columns ``text`` (the matched sentence) and
        ``distance`` (the distance reported by the index), in the order the
        index returned them. It has fewer than ``k`` rows when the index holds
        fewer than ``k`` vectors.
    """
    query_embed = co.embed(texts=[query], input_type="search_query").embeddings[0]
    distances, indices = index.search(np.float32([query_embed]), k)
    # faiss pads missing neighbours with id -1. Without this mask,
    # texts_np[-1] would silently return the last sentence as a bogus hit.
    found = indices[0] >= 0
    df = pd.DataFrame(
        {"text": texts_np[indices[0][found]], "distance": distances[0][found]}
    )
    return df


# Smoke test with a single-word query, using the default k=3.
search("Interstellar")

Unnamed: 0,text,distance
0,Interstellar is a 2014 epic science fiction fi...,5754.339844
1,"Since its premiere, Interstellar gained a cult...",7780.146484
2,"It stars Matthew McConaughey, Anne Hathaway, J...",8318.056641


In [9]:
# Disable column-width truncation so full sentences are shown in result tables.
pd.options.display.max_colwidth = None

In [10]:
# Natural-language question; the word "precise" does not occur anywhere in the corpus.
search("how precise is the science")

Unnamed: 0,text,distance
0,It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics,11093.708984
1,"Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar",11558.036133
2,Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects,12309.347656


In [11]:
from rank_bm25 import BM25Okapi

from sklearn.feature_extraction import _stop_words
import string


def bm25_tokenizer(text):
    """Split ``text`` into lowercase keyword tokens for BM25.

    The text is lowercased and split on whitespace. Each piece has leading and
    trailing ASCII punctuation removed. Pieces that end up empty or that are
    English stop words are dropped.

    Returns:
        The surviving tokens, in their original order.
    """
    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word
        for word in stripped
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]


# Tokenize every sentence (tqdm shows progress), then fit BM25 on the result.
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(texts)]

bm25 = BM25Okapi(tokenized_corpus)


def keyword_search(query, top_k=3, num_candidates=15):
    """Print the best BM25 (lexical) matches for ``query``.

    Args:
        query: Free-text query. It is tokenized with ``bm25_tokenizer``.
        top_k: Number of hits to print.
        num_candidates: Number of highest-scoring sentences to shortlist before
            sorting. Values larger than the corpus are clamped to the corpus
            size.

    Returns:
        None. Results are written to stdout, one line per hit, as
        ``<score>\\t<sentence>``.
    """
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # np.argpartition raises ValueError when kth is out of bounds, so never ask
    # for more candidates than there are scored sentences.
    n_keep = min(num_candidates, len(bm25_scores))
    if n_keep > 0:
        top_n = np.argpartition(bm25_scores, -n_keep)[-n_keep:]
    else:
        top_n = []
    bm25_hits = [{"corpus_id": idx, "score": bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x["score"], reverse=True)

    # The header reflects the requested top_k instead of a hard-coded 3.
    print(f"Top-{top_k} lexical search (BM25) hits")
    for hit in bm25_hits[0:top_k]:
        print(
            "\t{:.3f}\t{}".format(
                hit["score"], texts[hit["corpus_id"]].replace("\n", " ")
            )
        )

100%|██████████| 15/15 [00:00<00:00, 21523.97it/s]


In [12]:
# Same question as the embedding search above, to compare BM25 against it.
keyword_search("how precise is the science")

Input question: how precise is the science
Top-3 lexical search (BM25) hits
	1.789	Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan
	1.373	Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar
	0.000	It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine


In [22]:
# mini-RAG


def rag_search(query: str):
    """Answer ``query`` with the chat model, grounded on retrieved sentences.

    The sentences returned by ``search(query)`` are passed to ``co.chat`` as
    documents, each wrapped as ``{"text": ...}``.

    Returns:
        A ``(text, citations)`` tuple taken from the chat response.
    """
    retrieved = search(query)
    documents = []
    for passage in retrieved["text"]:
        documents.append({"text": passage})
    chat_response = co.chat(message=query, documents=documents)
    return chat_response.text, chat_response.citations


# Ask about the film's revenue; the corpus contains a sentence with the worldwide gross.
query = "What is the revenue of the movie?"
rag_search(query)

('The movie had a worldwide gross of $677 million, and $773 million with subsequent re-releases.',
 [ChatCitation(start=35, end=47, text='$677 million', document_ids=['doc_0'], type='TEXT_CONTENT'),
  ChatCitation(start=53, end=94, text='$773 million with subsequent re-releases.', document_ids=['doc_0'], type='TEXT_CONTENT')])