In [None]:
# !pip install -r requirements.txt

In [None]:
# !mkdir data
# !mv stack-help_data.json data/

In [10]:
import os
import json
import pandas as pd
import datetime
import re
import string
import numpy as np

def wrangle_data(filepath: str) -> pd.DataFrame:
    """
    This function reads in the data from the filepath, cleans it and returns a pandas dataframe

    Args:
    filepath: str: a path to the json file

    Returns:
    df: pd.DataFrame: a pandas dataframe with the cleaned data

    """
    # load data
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Get article body, urls and titles
    doc = {
        'article_title': data['article_link_title'],
        'article_links': data['article_links'],
        'article_body': data['article_body'],
    }


    # Store the data in a pandas dataframe
    df = pd.DataFrame(doc)


    ## Clean article title
    # Remove the forward-slash character
    df['article_title_cleaned'] = df['article_title'].str.replace(r"\/","")

    # Remove punctuation
    df['article_title_cleaned'] = df['article_title_cleaned'].str.translate(string.punctuation)

    # Remove digits
    df['article_title_cleaned'] = df['article_title_cleaned'].str.replace(r"\d+","")

    # Remove running spaces
    df['article_title_cleaned'] = df['article_title_cleaned'].str.replace(r"\s{2,}","")

    # Make the text lowercase
    df['article_title_cleaned'] = df['article_title_cleaned'].str.lower()

    ## Clean article body

    # Remove the forward-slash character
    df['article_body_cleaned'] = df['article_body'].str.replace(r"\/","")

    # Remove punctuation
    df['article_body_cleaned'] = df['article_body_cleaned'].str.translate(string.punctuation)

    # Remove digits
    df['article_body_cleaned'] = df['article_body_cleaned'].str.replace(r"\d+","")

    # Remove running spaces
    df['article_body_cleaned'] = df['article_body_cleaned'].str.replace(r"\s{2,}","")

    # Make the text lowercase
    df['article_body_cleaned'] = df['article_body_cleaned'].str.lower()

    # Remove double white space
    df['article_body_cleaned'] = df['article_body_cleaned'].str.replace('  ', '')

    # Fills empty article with word Empty
    df['article_body_cleaned'] = df['article_body_cleaned'].replace('', 'Empty')

    return df


doc_df = wrangle_data('data/stack-help_data.json')

Unnamed: 0,article_title,article_links,article_body,article_title_cleaned,article_body_cleaned
73,"21 Collyer Quay, Singapore 049320",https://stackuphelpcentre.zendesk.com/hc/en-us...,,"21 collyer quay, singapore 049320",Empty


In [11]:
# Import DataFrameLoader
from langchain.document_loaders import DataFrameLoader

new_df = doc_df[['article_title_cleaned', 'article_body_cleaned', 'article_links']]
new_df.rename(columns={'article_title_cleaned': 'title', 'article_body_cleaned': 'page_content', 'article_links': 'urls'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.rename(columns={'article_title_cleaned': 'title', 'article_body_cleaned': 'page_content', 'article_links': 'urls'}, inplace=True)


In [13]:
# The page content column is 'page_content'
docs = DataFrameLoader(
    new_df,
    page_content_column="page_content",
).load()

# Print the first 3 documents and the number of documents
docs[-1]
# display(f"Number of documents: {len(docs)}")

Document(metadata={'title': '21 collyer quay, singapore 049320', 'urls': 'https://stackuphelpcentre.zendesk.com/hc/en-us/articles/21537922604441-21-Collyer-Quay-Singapore-049320'}, page_content='Empty')

In [6]:
# Import os and pinecone
import pinecone

# Set the pinecone api key from the environment variable. Assign to api_key.
api_key=os.getenv("PINECONE_API_KEY")

# Initialize pinecone using the `PINECONE_API_KEY` variable.
pc = pinecone.Pinecone(api_key)

# Import os and pinecone
# Use this index name
index_name = "stackragapp"

# List the names of available indexes. Assign to existing_index_names.
existing_index_names = [idx.name for idx in pc.list_indexes().indexes]

# First check that the given index does not exist yet
if index_name not in existing_index_names:
    # Create the 'imbd-movies' index with cosine metric, 1536 dims, serverless spec: aws in us-east-1
    pc.create_index(
        name=index_name,
        metric='cosine',
        dimension=768,
        spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    )

  from tqdm.autonotebook import tqdm


In [17]:
from langchain_together import TogetherEmbeddings

# Create the embeddings object
embeddings = TogetherEmbeddings()

# From the langchain_pinecone package, import PineconeVectorStore
from langchain_pinecone import PineconeVectorStore

# Create an index from its name
index = pc.Index(index_name)

In [19]:
# Count the number of vectors in the index
n_vectors = index.describe_index_stats()['total_vector_count']
print(f"There are {n_vectors} vectors in the index already.")

# Check if there is already some data in the index on Pinecone
if n_vectors > 0:
    # If there is, get the documents to search from the index. Assign to docsearch.
    docsearch = PineconeVectorStore.from_existing_index(index_name, embeddings)
else:
    # If not, fill the index from the documents and return those docs to assign to docsearch
    docsearch = PineconeVectorStore.from_documents(docs, embeddings, index_name=index_name)

There are 74 vectors in the index already.


In [20]:
# Define a question about movies to ask
question = "What's a good movie about an epic viking?"
    
# Convert the vector database to a retriever and get the relevant documents for a question
print("These are the documents most relevant to the question:")
docsearch.as_retriever().invoke(question)

These are the documents most relevant to the question:


[Document(metadata={'title': "why can't i  join a quest?", 'urls': 'https://stackuphelpcentre.zendesk.com/hc/en-us/articles/13043680713625-Why-can-t-I-join-a-quest'}, page_content='1. you might be attempting to join a quest that has a pre-requisitesome quests may have pre-requisite quests. in such cases, you need to have at least joined the pre-requisite quest, before you can join the next quest. 2. you might be attempting to join a repeated quest if you already participated in a repeated quest and got rewarded, you will be unable to join this new repeated quest, to give other players a fair chance. check out this article on what are pre-requisite and repeated quests? 3. the quest has not started yetquests are only available for registration on the start date and time indicated.'),
 Document(metadata={'title': 'is it necessary to complete a prerequisite quest before joining a bounty?', 'urls': 'https://stackuphelpcentre.zendesk.com/hc/en-us/articles/19285174681881-Is-it-necessary-to-co