In [5]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
import uuid
import os
from openai import OpenAI

In [2]:
def chunk_text(text, chunk_size=512):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

def process_and_add_to_chromadb(input_file='data/medium.csv'):
    df = pd.read_csv(input_file)

    documents = []
    metadatas = []
    ids = []

    for index, row in df.iterrows():
        title = row['Title']
        text_chunks = chunk_text(row['Text'])

        for chunk in text_chunks:
            documents.append(chunk)
            metadatas.append({"title": title})
            ids.append(str(uuid.uuid4()))

    chroma_client = chromadb.PersistentClient(path="medium_db")

    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

    collection = chroma_client.get_or_create_collection(name="medium_articles", embedding_function=sentence_transformer_ef)

    collection.add(documents=documents, metadatas=metadatas, ids=ids)

process_and_add_to_chromadb()


In [6]:
def query_db(query_text, n_results=5):
    collection_name = "medium_articles"

    # Initialize ChromaDB client and access the collection
    chroma_client = chromadb.PersistentClient(path="medium_db")
    collection = chroma_client.get_collection(name=collection_name)

    # Perform the query
    results = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        include=['documents', 'metadatas']
    )

    # Prepare the documents and titles for return
    documents = [doc for doc in results['documents'][0]]
    titles = [meta['title'] for meta in results['metadatas'][0]]

    return documents, titles


In [15]:
os.environ["OPENAI_API_KEY"] = "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"

def generate_answer_with_openai(question, n_results=5):
    documents, titles = query_db(question, n_results)

    # Initialize the OpenAI client
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    # Construct a detailed prompt from the question and the retrieved documents
    articles = ""
    prompt = f"Question: {question}\n\n"
    for title, doc in zip(titles, documents):
        prompt += f"Article Title: {title}\n{doc}\n\n"
        if articles == "":
            articles += f"Sources: {title}"
        else:
            articles += f", {title}"
    prompt += "Based on the above articles, please answer the question."

    # Check for max token limit and truncate if necessary
    max_tokens = 4096
    if len(prompt) > max_tokens:
        prompt = prompt[:max_tokens]

    # Generate an answer using the OpenAI API
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",  # Adjust the model as necessary
        messages=[
            {"role": "system", "content": "You are a knowledgeable assistant who provides answers based on the provided articles."},
            {"role": "user", "content": prompt}
        ],
    )

    # Extract the generated answer
    answer = response.choices[0].message.content + f"\n\n{articles}"

    return answer.strip()


In [16]:
print(query_db("What is NLP?", 5))

(['So, What is Natural Language Processing (NLP)? NLP is an interdisciplinary field concerned with the interactions between computers and natural human languages (e.g. English) — speech or text. NLP-powered software helps us in our daily lives in various ways, for example: Personal assistants : Siri, Cortana, and Google Assistant. : Siri, Cortana, and Google Assistant. Auto-complete : In search engines (e.g. Google). : In search engines (e.g. Google). Spell checking : Almost everywhere, in your browser, your IDE (e.g. Visual Studio), desktop apps (e.g. Microsoft Word). : Almost everywhere, in your browser, your IDE (e.g. Visual Studio), desktop apps (e.g. Microsoft Word). Machine Translation: Google Translate. Okay, now we get it, NLP plays a significant role in our daily computer interactions; let’s take a look at some example business-related use-cases for NLP: Fast-food chains receive a vast amount of orders and complaints daily; manually handling this will be tiresome and repetitiv

In [18]:
question = "What is Word2Vec?"
answer = generate_answer_with_openai(question, 5)
print("Answer:", answer)


Answer: Word2Vec is a technique used to learn word embeddings by using a two-layer neural network. It takes a text corpus as input and generates a set of vectors as output. These word embeddings make natural language computer-readable, allowing mathematical operations to be performed on words to detect similarities. With Word2Vec, similar words tend to cluster together in a vector space. There are two main training algorithms for Word2Vec: continuous bag of words (CBOW) and skip-gram. Skip-gram is often preferred because it can capture multiple semantics for a single word, resulting in more accurate representations. The Gensim Python library is commonly used for implementing Word2Vec models for word embeddings.

Sources: A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model, Sentiment Analysis — A how-to guide with movie reviews, Word Embeddings for NLP, Wine Embeddings and a Wine Recommender, Spam Filtering System With Deep Learning
