In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model that every model.encode() call in this
# notebook uses. Its 384-dimensional output matches the 'dimension': 384
# reported by the Pinecone index stats printed further down.
model = SentenceTransformer("all-MiniLM-L6-v2")  # dimension is 384


In [None]:
import pandas as pd

# Read the scraped complaints file into a DataFrame.
df = pd.read_csv("2025_complaints_scraped.csv")

# Flatten every row into a single string: each cell is stringified and the
# cells are joined with " | " in column order.
texts = df.apply(
    lambda row: " | ".join(str(cell) for cell in row.values),
    axis=1,
).tolist()

In [None]:
# Embed the row-level strings in `texts` with the sentence-transformer model.
# NOTE(review): `vectors` is not referenced by any cell visible in this part of
# the notebook — confirm it is still needed before paying for this encode pass.
vectors = model.encode(texts, convert_to_numpy=True)


In [None]:
#!pip install pinecone

Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl.metadata (30 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting packaging<25.0,>=24.2 (from pinecone-plugin-assistant<2.0.0,>=1.6.0->pinecone)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-7.3.0-py3-none-any.whl (587 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.3/259.3 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Downloading packagin

In [None]:
from pinecone import Pinecone
import os
# Read the Pinecone API key from the environment so the secret is never typed
# into the notebook, where it would end up in saved files and version control.
if not os.environ.get("PINECONE_API_KEY"):
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this cell."
    )
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = "2025test"

# Handle used by the delete/upsert/query cells. This cell does not create the
# index, so "2025test" is expected to exist already.
index = pc.Index(index_name)

In [None]:
import math

def chunk_text(text, chunk_size=400, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. The final
    chunk may be shorter than ``chunk_size``; no extra chunk is emitted once
    a chunk has reached the end of the text.

    Args:
        text: The text to split. ``None`` and float NaN (how pandas reports a
            missing cell) are treated as empty; any other non-string value is
            converted with ``str()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must be
            smaller than ``chunk_size``.

    Returns:
        A list of ``(chunk, start_word, end_word)`` tuples, where ``chunk`` is
        the words re-joined with single spaces and ``start_word`` /
        ``end_word`` are inclusive 0-based word positions in the split text.
        Empty or missing text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        # A non-positive step would either crash range() with an unrelated
        # message or silently produce no chunks at all.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    words = str(text).split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        window = words[start:start + chunk_size]
        chunks.append((" ".join(window), start, start + len(window) - 1))
        # This window already reached the end of the text; another iteration
        # would only re-emit a suffix of it.
        if start + chunk_size >= len(words):
            break
    return chunks

In [None]:
def upsert_in_batches(index, items, batch_size=100):
    """Send ``items`` to ``index.upsert`` in consecutive slices.

    Args:
        index: Object exposing an ``upsert(batch)`` method.
        items: Sliceable sequence of records to upload.
        batch_size: Maximum number of records per ``upsert`` call.
    """
    batch_starts = range(0, len(items), batch_size)
    for start in batch_starts:
        stop = start + batch_size
        index.upsert(items[start:stop])

In [None]:
# Clear the index before re-ingesting, then print its stats to confirm the
# result (the recorded output shows 'total_vector_count': 0).
# NOTE: destructive — delete_all=True is passed with no namespace argument, so
# presumably this targets the default namespace; confirm before running against
# an index that holds data you want to keep.
index.delete(delete_all=True)
print(index.describe_index_stats())


{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Stage 1: turn every complaint row into one record per text chunk. Each record
# carries the chunk text plus the metadata stored next to its vector.
items = []
for row_idx, row in df.iterrows():
    text = row["Text"]
    # An empty "Text" cell is read from CSV as NaN, which cannot be chunked;
    # skip the row rather than crash.
    if pd.isna(text):
        continue

    doc_id = f"r{row_idx}"
    row_metadata = {
        "doc_id": doc_id,
        "row": row_idx,
        "date": row["Date"],
        "url": row["URLs"],
        "url_text": row["URL_Text"],
        "p_tags": row["P_tags"],
        "l_tags": row["LI_tags"],
    }
    # Pinecone metadata does not accept null values, so drop fields whose CSV
    # cell was empty instead of sending NaN.
    row_metadata = {
        key: value for key, value in row_metadata.items() if not pd.isna(value)
    }

    chunks = chunk_text(str(text), chunk_size=300, overlap=50)
    for chunk_idx, (chunk, start, end) in enumerate(chunks):
        metadata = {
            **row_metadata,
            "start": start,
            "end": end,
            # Short preview of the chunk for display in query results.
            "excerpt": chunk[:250],
        }
        items.append({
            "doc_id": doc_id,
            "row_idx": row_idx,
            "chunk_idx": chunk_idx,
            "text": chunk,
            "metadata": metadata,
        })

# Stage 2: embed the chunk texts in slices and build the Pinecone payloads.
payloads = []
batch_size_encode = 64
for i in range(0, len(items), batch_size_encode):
    batch = items[i:i + batch_size_encode]
    # Named batch_texts so the row-level `texts` list built in an earlier cell
    # is not overwritten.
    batch_texts = [it["text"] for it in batch]
    embs = model.encode(batch_texts, batch_size=32, convert_to_numpy=True)
    for it, emb in zip(batch, embs):
        vector_id = f"{it['doc_id']}-{it['chunk_idx']}"
        values = emb.tolist()
        # The previous check compared an int length to a list and so never
        # fired. Skip an empty vector instead of upserting it.
        if not values:
            print(f"Error: empty embedding for {vector_id}; skipping")
            continue
        payloads.append({
            "id": vector_id,
            "values": values,
            "metadata": it["metadata"],
        })

# Stage 3: upload the vectors in small batches.
upsert_in_batches(index, payloads, batch_size=25)

In [None]:
# Embed the search phrase with the same model used for the stored chunks, then
# ask the index for the 10 closest vectors along with their metadata.
q_vec = model.encode(
    "Abusive language",
    convert_to_numpy=True,
).tolist()
res = index.query(
    vector=q_vec,
    top_k=10,
    include_metadata=True,
)

In [None]:
res

{'matches': [{'id': 'r25-1',
              'metadata': {'date': '08-May-2025',
                           'doc_id': 'r25',
                           'end': 432.0,
                           'excerpt': 'a finding of “Sustained” and '
                                      'recommends, as penalty, a 4-Day '
                                      'Suspension and Training on APD’s '
                                      'policies governing Use of Force. 3. The '
                                      'allegation of Abusive Language be '
                                      'assigned a finding of “Sustained” and '
                                      'recommends, as penalty, a Written '
                                      'Reprimand ',
                           'l_tags': "['Appropriate Action Required', "
                                     "'Excessive Force', 'Abusive Language']",
                           'p_tags': "['08-May-2025', 'Case #24-157', 'Board "
                           