Chunking

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import regex as re
import pickle

In [2]:
article_URL = "https://www.npr.org/transcripts/153705721"
page = requests.get(article_URL)
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
# only the first p-tag has useful information we want, so we index to first element
p_tags = soup.find_all("p", id=False, class_=False)

In [4]:
# we replace newline characters and information within parentheses with blanks
interview_text = re.sub(r"\n|\(.+\)", "", p_tags[0].find("p").get_text()).replace("\\'", "'")

In [5]:
# we split the article into chunks where we split on the condiiton of ".{any capital letter}"
lines = re.split(r"\.(?=[A-Z])", interview_text)[:-2]

In [11]:
# putting all lines in a list to feed into chroma
article_list = []
for i in lines:
  article_list.append(i)

In [12]:
# save as pickle file for use later
# with open('data/chunks.pkl', 'wb') as file:
#     pickle.dump(lines, file)
# print("List saved successfully as a pickle file.")

In [13]:
# with open('data/chunks.pkl', 'rb') as file:
#     loaded_lines = pickle.load(file)
# loaded_lines

In [14]:
article_list[0]

'RENEE MONTAGNE, HOST: One of the first things Michelle Obama did as first lady, was to dig up part of the beautifully manicured South Lawn of the White House and plant a garden - a vegetable garden. Today, she\'s out with a book, "American Grown," a kind of diary of that garden through the seasons. We joined her on a hot day last week for a little tour'

Chroma

In [23]:
# Install
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.27.0-py3-none-any.whl.metadata (2.3 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)
  

In [24]:
# Create A chroma Client
import chromadb
chroma_client = chromadb.Client()

In [25]:
# Create a collection
collection = chroma_client.get_or_create_collection(name="my_collection")

In [26]:
# creating id for text doc in collection
id = []
for i in range(len(article_list)):
  id.append("id" + str(i))

In [28]:
# Add text doc to collection
collection.upsert(
    documents = article_list,
    ids = id
)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 34.7MiB/s]


In [31]:
# query collection
results = collection.query(
    query_texts = ["This article’s political stance is democratic."], # Chroma will embed this for you
    n_results = 5 # how many results to return
)
print(results)

{'ids': [['id22', 'id30', 'id23', 'id5', 'id27']], 'distances': [[1.3331794738769531, 1.3858851194381714, 1.3982348442077637, 1.4557957649230957, 1.4712979793548584]], 'metadatas': [[None, None, None, None, None]], 'embeddings': None, 'documents': [["OBAMA: It's a little bit of..", 'MONTAGNE: This is NPR News', "MONTAGNE: ...to do with habits. Like the kids go into the store and there might be apples there but they buy a bag of potato chips?OBAMA: Well, you know, in many of the convenience stores there isn't fruit. But one thing we also say with this initiative with Let's Move is that there are many reasons why we're here. And food deserts, and information, and knowledge and exercise, all of those are a part of the solution", 'OBAMA: Taste it', 'OBAMA: It is beautiful. I mean, here we are in the heart of Washington, D']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [32]:
# query collection
results = collection.query(
    query_texts = ["This article  is using sensational phrases or words"], # Chroma will embed this for you
    n_results = 5 # how many results to return
)
print(results)

{'ids': [['id5', 'id30', 'id23', 'id22', 'id7']], 'distances': [[1.445431113243103, 1.4627740383148193, 1.4776935577392578, 1.4928231239318848, 1.5337291955947876]], 'metadatas': [[None, None, None, None, None]], 'embeddings': None, 'documents': [['OBAMA: Taste it', 'MONTAGNE: This is NPR News', "MONTAGNE: ...to do with habits. Like the kids go into the store and there might be apples there but they buy a bag of potato chips?OBAMA: Well, you know, in many of the convenience stores there isn't fruit. But one thing we also say with this initiative with Let's Move is that there are many reasons why we're here. And food deserts, and information, and knowledge and exercise, all of those are a part of the solution", "OBAMA: It's a little bit of..", 'OBAMA: Very sweet, which is what I try to tell my daughter']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}
