In [44]:
import pandas as pd
import numpy as np

In [45]:
df = pd.read_csv('../data/final_dataset.csv')
df.head()

Unnamed: 0,text,emotion
0,i feel rather funny ending with so many dupes ...,fun
1,i feel surprised by the result,surprise
2,i am officially feeling festive,neutral
3,i suddenly found myself standing before this w...,surprise
4,i look at the meager pile of food i purchased ...,enthusiasm


In [3]:
np.unique(df['emotion'])

array(['anger', 'empty', 'enthusiasm', 'fun', 'happiness', 'hate', 'love',
       'neutral', 'relief', 'sadness', 'surprise'], dtype=object)

In [46]:
test_rows = df.head(20)
test_rows

Unnamed: 0,text,emotion
0,i feel rather funny ending with so many dupes ...,fun
1,i feel surprised by the result,surprise
2,i am officially feeling festive,neutral
3,i suddenly found myself standing before this w...,surprise
4,i look at the meager pile of food i purchased ...,enthusiasm
5,"I, for one, am thrilled that Christ works outs...",happiness
6,i hate that she has the power to make me feel ...,hate
7,i feel like i missed it,neutral
8,i have personally experienced this gut wrenchi...,sadness
9,i hate feeling that people see me as ugly but ...,hate


In [5]:
!pip install psycopg2-binary pgvector



In [6]:
# --- Now let me gen embeddings for first 20 rows so that I can connect and dump in db ---

In [47]:
import re

In [48]:
test_rows['text'] = test_rows['text'].apply(
    lambda x: re.sub(r'[^\w\s]', '', str(x)) if isinstance(x, str) else None
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rows['text'] = test_rows['text'].apply(


In [9]:
import torch
from transformers import AutoTokenizer, AutoModel

In [10]:
model_ckpt = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [49]:
sents = test_rows['text'].tolist()
labels = test_rows['emotion'].tolist()

In [12]:
encoded_input = tokenizer(sents, padding=True, truncation=True, return_tensors="pt")

In [13]:
with torch.no_grad():
    model_output = model(**encoded_input)

In [14]:
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

In [15]:
sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
print(f"Sentence embeddings shape: {sentence_embeddings.size()}")

Sentence embeddings shape: torch.Size([20, 384])


In [16]:
# --- setting up db connection with langchain PGVector Object ---

In [17]:
!pip install langchain



In [18]:
!pip install -qU langchain-postgres

In [19]:
!pip install --upgrade langchain langchain-core langchain-postgres typing_extensions




In [20]:
pip install --upgrade typing_extensions

Note: you may need to restart the kernel to use updated packages.


In [21]:
pip install langchain-huggingface

Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [50]:
from langchain_postgres import PGVector

In [51]:
CONNECTION_STRING = "postgresql+psycopg2://postgres:test@localhost:5432/vector_db"
COLLECTION_NAME = "test_vectors"

In [52]:
final_embeddings = sentence_embeddings.numpy().tolist()

In [53]:
from langchain_huggingface import HuggingFaceEmbeddings

In [54]:
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [56]:
vector_store = PGVector(
    embeddings=embedding_model,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
    use_jsonb=True,
)

In [57]:
from langchain_core.documents import Document

In [58]:
docs = []

for i in range(len(sents)):
    doc = Document(
        page_content=sents[i],
        metadata={"label": labels[i]},
    )
    docs.append(doc)

In [59]:
vector_store.add_documents(docs)

['27fd290d-23c7-44aa-805f-583ddde2bf2c',
 'b8fbcce3-7641-43a4-8529-85e5aa64fa4a',
 '9c30bb36-fbf7-483e-a9b5-126e8c7a30b1',
 '49e55d51-f26c-4877-94dd-021b990d7856',
 'f71d1cb1-8f73-4ced-9fd8-2e5a0897e5fd',
 '07fb5a1d-a6db-4608-a3ca-b6a7fef5058f',
 'b7a52719-29f5-43cb-81d4-65a91c8b808a',
 '7783ddd0-4dbc-49b0-bb08-020997fb9f37',
 'bf25e9cc-208e-42c2-acf4-b76d7ec93661',
 '13db85ce-6d63-4a7d-844c-8e2784043143',
 'ebe016f9-4a27-4774-8ff4-b3ee9200c08b',
 'a6206d16-5b34-4fc4-83bd-c55bff96c7db',
 '0a371fff-0991-4729-9a31-0178ca60dfab',
 '6c5b0d47-80a5-4ddd-b6b7-1b82e8f60717',
 '942d239b-019c-4b50-b7aa-99767cfee104',
 '1d01fe77-dc63-4ef0-829e-88bc9b9a8843',
 '61042370-1479-4cdb-878a-cc7ee1c0a8a6',
 'f3613b9c-cfdf-4c7b-8374-15bc6647669b',
 '85c43b88-d930-407f-b05e-8b51ca336fff',
 '4af82102-ad91-4531-934b-f78cd6d37ecf']

In [60]:
results = vector_store.similarity_search(
    "Wow I did not think that was going to happen", k=5
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* i feel surprised by the result [{'label': 'surprise'}]
* i feel like i missed it [{'label': 'neutral'}]
* I for one am thrilled that Christ works outside of our timeline He is so faithful And sovereign [{'label': 'happiness'}]
* i suddenly found myself standing before this woman dressed like a priestess with white robes and egyptian jewellery long black hair and realized that i was feeling amorous and passionate and noticed to my surprise that my breath was green i was breathing out clouds of green energy [{'label': 'surprise'}]
* i was working with one of my heroes so it was a combination of feeling intimidated excited and thrilled [{'label': 'enthusiasm'}]


In [63]:
results2 = vector_store.similarity_search_with_score(
    "I really hate going to the beach so much", k=5
)
for doc, score in results2:
    print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")

* [SIM=0.684024] i always feel like i am hated among my friends [{'label': 'hate'}]
* [SIM=0.697759] i hate that she has the power to make me feel such ugly things [{'label': 'hate'}]
* [SIM=0.752931] i wanna feel absolutely lovely [{'label': 'love'}]
* [SIM=0.775491] i look at the meager pile of food i purchased for the week i am feeling pretty apprehensive [{'label': 'enthusiasm'}]
* [SIM=0.812117] ive become in a mere years yeah right o but i really just feel so eager for it all [{'label': 'enthusiasm'}]
