In [None]:
# !pip install -r requirements.txt

In [None]:
# !mkdir data
# !mv stack-help_data.json data/

In [1]:
import os
import json
import pandas as pd
import datetime
import re
import string
import numpy as np

def wrangle_data(filepath: str) -> pd.DataFrame:
    """
    This function reads in the data from the filepath, cleans it and returns a pandas dataframe

    Args:
    filepath: str: a path to the json file

    Returns:
    df: pd.DataFrame: a pandas dataframe with the cleaned data

    """
    # load data
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Get article body, urls and titles
    doc = {
        'article_title': data['article_link_title'],
        'article_links': data['article_links'],
        'article_body': data['article_body'],
    }


    # Store the data in a pandas dataframe
    df = pd.DataFrame(doc)


    ## Clean article title
    # Remove the forward-slash character
    df['article_title_cleaned'] = df['article_title'].str.replace(r"\/","")

    # Remove punctuation
    df['article_title_cleaned'] = df['article_title_cleaned'].str.translate(string.punctuation)

    # Remove digits
    df['article_title_cleaned'] = df['article_title_cleaned'].str.replace(r"\d+","")

    # Remove running spaces
    df['article_title_cleaned'] = df['article_title_cleaned'].str.replace(r"\s{2,}","")

    # Make the text lowercase
    df['article_title_cleaned'] = df['article_title_cleaned'].str.lower()

    ## Clean article body

    # Remove the forward-slash character
    df['article_body_cleaned'] = df['article_body'].str.replace(r"\/","")

    # Remove punctuation
    df['article_body_cleaned'] = df['article_body_cleaned'].str.translate(string.punctuation)

    # Remove digits
    df['article_body_cleaned'] = df['article_body_cleaned'].str.replace(r"\d+","")

    # Remove running spaces
    df['article_body_cleaned'] = df['article_body_cleaned'].str.replace(r"\s{2,}","")

    # Make the text lowercase
    df['article_body_cleaned'] = df['article_body_cleaned'].str.lower()

    # Remove double white space
    df['article_body_cleaned'] = df['article_body_cleaned'].str.replace('  ', '')

    # Fills empty article with word Empty
    df['article_body_cleaned'] = df['article_body_cleaned'].replace('', 'Empty')

    return df


doc_df = wrangle_data('data/stack-help_data.json')

In [2]:
# Import DataFrameLoader
from langchain.document_loaders import DataFrameLoader

new_df = doc_df[['article_title_cleaned', 'article_body_cleaned', 'article_links']]
new_df.rename(columns={'article_title_cleaned': 'title', 'article_body_cleaned': 'page_content', 'article_links': 'urls'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.rename(columns={'article_title_cleaned': 'title', 'article_body_cleaned': 'page_content', 'article_links': 'urls'}, inplace=True)


In [3]:
# The page content column is 'page_content'
docs = DataFrameLoader(
    new_df,
    page_content_column="page_content",
).load()

# Print the first 3 documents and the number of documents
docs[-1]
# display(f"Number of documents: {len(docs)}")

Document(metadata={'title': '21 collyer quay, singapore 049320', 'urls': 'https://stackuphelpcentre.zendesk.com/hc/en-us/articles/21537922604441-21-Collyer-Quay-Singapore-049320'}, page_content='Empty')

In [4]:
# Import os and pinecone
import pinecone

# Set the pinecone api key from the environment variable. Assign to api_key.
api_key=os.getenv("PINECONE_API_KEY")

# Initialize pinecone using the `PINECONE_API_KEY` variable.
pc = pinecone.Pinecone(api_key)

# Import os and pinecone
# Use this index name
index_name = "stackragapp"

# List the names of available indexes. Assign to existing_index_names.
existing_index_names = [idx.name for idx in pc.list_indexes().indexes]

# First check that the given index does not exist yet
if index_name not in existing_index_names:
    # Create the 'imbd-movies' index with cosine metric, 1536 dims, serverless spec: aws in us-east-1
    pc.create_index(
        name=index_name,
        metric='cosine',
        dimension=768,
        spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    )

  from tqdm.autonotebook import tqdm


In [5]:
from langchain_together import TogetherEmbeddings

# Create the embeddings object
embeddings = TogetherEmbeddings()

# From the langchain_pinecone package, import PineconeVectorStore
from langchain_pinecone import PineconeVectorStore

# Create an index from its name
index = pc.Index(index_name)

In [6]:
# Count the number of vectors in the index
n_vectors = index.describe_index_stats()['total_vector_count']
print(f"There are {n_vectors} vectors in the index already.")

# Check if there is already some data in the index on Pinecone
if n_vectors > 0:
    # If there is, get the documents to search from the index. Assign to docsearch.
    docsearch = PineconeVectorStore.from_existing_index(index_name, embeddings)
else:
    # If not, fill the index from the documents and return those docs to assign to docsearch
    docsearch = PineconeVectorStore.from_documents(docs, embeddings, index_name=index_name)

There are 74 vectors in the index already.


In [12]:
# Define a question about movies to ask
question = "help me"
    
# Convert the vector database to a retriever and get the relevant documents for a question
print("These are the documents most relevant to the question:")
docsearch.as_retriever().invoke(question)

These are the documents most relevant to the question:


[Document(metadata={'title': '21 collyer quay, singapore 049320', 'urls': 'https://stackuphelpcentre.zendesk.com/hc/en-us/articles/21537922604441-21-Collyer-Quay-Singapore-049320'}, page_content='Empty'),
 Document(metadata={'title': 'want to stay updated on campaign drops and quest starts on discord?', 'urls': 'https://stackuphelpcentre.zendesk.com/hc/en-us/articles/32834633225113-Want-to-stay-updated-on-campaign-drops-and-quest-starts-on-Discord'}, page_content='join this channel on discord! \u2060🔔｜campaign-quest-notifications'),
 Document(metadata={'title': '📕 need help for a quest?', 'urls': 'https://stackuphelpcentre.zendesk.com/hc/en-us/articles/15062396510745--Need-help-for-a-quest'}, page_content='our stackup team believes in stackies finding their solution by constructive collaboration. to better facilitate this process, we have created a forum in our discord server, in 🆘｜quest-help-forum. please read through the rules template for easier navigation of the space. moreover, yo

In [16]:
DOCUMENT_PROMPT = """
Title: {title}
page_content: {page_content}
Help Center link: {urls}
========="""

QUESTION_PROMPT = """Given the following extracted parts of a help center data and a question, create a final answer with the Help Center link as source ("SOURCE").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCE" part in your answer. YOU CAN RETURN MULTIPLE 'SOURCES' relevant to the question asked

QUESTION: Tell me about bounty?
=========
Title: What is bounty?
Description: The StackUp bounty program offers an additional opportunity for Stackies to engage in more advanced learning activities with higher expectations for their output. This program presents a new level of challenge compared to quests, allowing Stackies to tackle more complex challenges in exchange for a larger reward amount.
Help Center link: https://stackuphelpcentre.zendesk.com/hc/en-us/articles/18932072999065-What-is-Bounty
=========
FINAL ANSWER: The StackUp bounty program provides Stackies with advanced learning opportunities and more challenging tasks than quests, offering higher rewards. Submissions are evaluated for quality and alignment with criteria, ensuring that only the best submissions receive rewards.
SOURCE: https://stackuphelpcentre.zendesk.com/hc/en-us/categories/35260449941529-Earn-App

QUESTION: {question}
=========
{summaries}
FINAL ANSWER:"""


In [17]:
# Import PromptTemplate
from langchain.prompts import PromptTemplate
document_prompt = PromptTemplate.from_template(DOCUMENT_PROMPT)
question_prompt = PromptTemplate.from_template(QUESTION_PROMPT)

In [18]:
# From the langchain.chains module, import RetrievalQAWithSourcesChain
from langchain.chains import RetrievalQAWithSourcesChain

# Querying chat models with Together AI
from langchain_together import ChatTogether

# choose from our 50+ models here: https://docs.together.ai/docs/inference-models
llm = ChatTogether(
    # together_api_key="YOUR_API_KEY",
    model="meta-llama/Llama-3-70b-chat-hf",
    temperature=0
)

In [19]:
# Create the QA bot LLM chain
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    chain_type="stuff",
    llm=llm,
    chain_type_kwargs={
        "document_prompt": document_prompt,
        "prompt": question_prompt,
    },
    retriever=docsearch.as_retriever(),
)

# Invoke qa_with_sources to ask the LLM the question about movies
qa_with_sources.invoke('tell me about hackathons')

{'question': 'tell me about hackathons',
 'answer': "Hackathons are events with varying participation guidelines, and it's necessary to review the individual hackathon listing to determine if participating as an individual is permitted. Additionally, suspended users are not allowed to join hackathons due to fraud policy enforcement. \n\n",
 'sources': 'https://stackuphelpcentre.zendesk.com/hc/en-us/articles/33323886215193-Is-it-possible-to-participate-in-a-hackathon-individually-without-team-members, https://stackuphelpcentre.zendesk.com/hc/en-us/articles/35134432316185-I-was-suspended-from-another-StackUp-app-Can-I-still-join-a-Hackathon'}