In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/final_dataset.csv')
df.head()

Unnamed: 0,text,emotion
0,i feel rather funny ending with so many dupes ...,fun
1,i feel surprised by the result,surprise
2,i am officially feeling festive,neutral
3,i suddenly found myself standing before this w...,surprise
4,i look at the meager pile of food i purchased ...,enthusiasm


In [3]:
test_rows = df.head(20)
test_rows

Unnamed: 0,text,emotion
0,i feel rather funny ending with so many dupes ...,fun
1,i feel surprised by the result,surprise
2,i am officially feeling festive,neutral
3,i suddenly found myself standing before this w...,surprise
4,i look at the meager pile of food i purchased ...,enthusiasm
5,"I, for one, am thrilled that Christ works outs...",happiness
6,i hate that she has the power to make me feel ...,hate
7,i feel like i missed it,neutral
8,i have personally experienced this gut wrenchi...,sadness
9,i hate feeling that people see me as ugly but ...,hate


In [4]:
import re

In [5]:
test_rows['text'] = test_rows['text'].apply(
    lambda x: re.sub(r'[^\w\s]', '', str(x)) if isinstance(x, str) else None
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rows['text'] = test_rows['text'].apply(


In [6]:
sents = test_rows['text'].tolist()
labels = test_rows['emotion'].tolist()

In [7]:
import torch
from transformers import AutoTokenizer, AutoModel

In [8]:
model_ckpt = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [9]:
encoded_input = tokenizer(sents, padding=True, truncation=True, return_tensors="pt")

In [10]:
with torch.no_grad():
    model_output = model(**encoded_input)

In [11]:
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

In [12]:
sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
print(f"Sentence embeddings shape: {sentence_embeddings.size()}")

Sentence embeddings shape: torch.Size([20, 384])


In [13]:
final_embeddings = sentence_embeddings.numpy().tolist()

In [14]:
import psycopg2

In [15]:
conn = psycopg2.connect(
    dbname="vector_db",
    user="postgres",
    password="test",
    host="localhost",
    port=5432
)

In [16]:
conn.autocommit = True  # required for CREATE EXTENSION

with conn.cursor() as cur:
    cur.execute("""
        CREATE EXTENSION IF NOT EXISTS vector;

        CREATE TABLE IF NOT EXISTS documents (
            id SERIAL PRIMARY KEY,
            content TEXT,
            embedding VECTOR(384),
            metadata TEXT
        );

        CREATE INDEX IF NOT EXISTS documents_embedding_idx
        ON documents
        USING ivfflat (embedding vector_cosine_ops)
        WITH (lists = 100);
    """)

# conn.close()

In [17]:
def insert_documents(sents, embeddings, labels):

    with conn.cursor() as cur:
        for s, emb, meta in zip(sents, embeddings, labels):
            cur.execute(
                """
                INSERT INTO documents (content, embedding, metadata)
                VALUES (%s, %s, %s)
                """,
                (s, emb, meta)
            )
    conn.commit()

In [18]:
insert_documents(sents, final_embeddings, labels)

In [23]:
conn = psycopg2.connect(
    dbname="vector_db",
    user="postgres",
    password="test",
    host="localhost",
    port=5432
)

In [32]:
def similarity_search(query_emb, k=5):

    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT content, metadata, embedding <=> %s::vector AS distance
            FROM documents
            ORDER BY embedding <=> %s::vector
            LIMIT %s;
            """,
            (query_emb, query_emb, k)
        )

        return cur.fetchall()



In [25]:
query = "I really hate going to the beach so much"

encoded_query = tokenizer(
    query,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

with torch.no_grad():
    model_output = model(**encoded_query)

query_emb = mean_pooling(
    model_output,
    encoded_query["attention_mask"]
)

query_emb = F.normalize(query_emb, p=2, dim=1)
query_emb = query_emb.squeeze(0).cpu().numpy().tolist()

In [33]:
conn.rollback()

In [36]:
results = similarity_search(query_emb, k=5)

for content, metadata, distance in results:
    print(f"[DIST={distance:.4f}] {content}")

In [37]:
conn.close()