In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/final_dataset.csv')
df.head()

Unnamed: 0,text,emotion
0,i feel rather funny ending with so many dupes ...,fun
1,i feel surprised by the result,surprise
2,i am officially feeling festive,neutral
3,i suddenly found myself standing before this w...,surprise
4,i look at the meager pile of food i purchased ...,enthusiasm


In [3]:
np.unique(df['emotion'])

array(['anger', 'empty', 'enthusiasm', 'fun', 'happiness', 'hate', 'love',
       'neutral', 'relief', 'sadness', 'surprise'], dtype=object)

In [4]:
test_rows = df.head(20)
test_rows

Unnamed: 0,text,emotion
0,i feel rather funny ending with so many dupes ...,fun
1,i feel surprised by the result,surprise
2,i am officially feeling festive,neutral
3,i suddenly found myself standing before this w...,surprise
4,i look at the meager pile of food i purchased ...,enthusiasm
5,"I, for one, am thrilled that Christ works outs...",happiness
6,i hate that she has the power to make me feel ...,hate
7,i feel like i missed it,neutral
8,i have personally experienced this gut wrenchi...,sadness
9,i hate feeling that people see me as ugly but ...,hate


In [5]:
!pip install psycopg2-binary pgvector



In [6]:
# --- Now let me gen embeddings for first 20 rows so that I can connect and dump in db ---

In [7]:
import re

In [8]:
test_rows['text'] = test_rows['text'].apply(
    lambda x: re.sub(r'[^\w\s]', '', str(x)) if isinstance(x, str) else None
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rows['text'] = test_rows['text'].apply(


In [9]:
import torch
from transformers import AutoTokenizer, AutoModel

In [10]:
model_ckpt = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [11]:
sents = test_rows['text'].tolist()
labels = test_rows['emotion'].tolist()

In [12]:
encoded_input = tokenizer(sents, padding=True, truncation=True, return_tensors="pt")

In [13]:
with torch.no_grad():
    model_output = model(**encoded_input)

In [14]:
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

In [15]:
sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
print(f"Sentence embeddings shape: {sentence_embeddings.size()}")

Sentence embeddings shape: torch.Size([20, 384])


In [16]:
# --- setting up db connection with langchain PGVector Object ---

In [17]:
!pip install langchain



In [18]:
!pip install -qU langchain-postgres

In [19]:
!pip install --upgrade langchain langchain-core langchain-postgres typing_extensions




In [20]:
pip install --upgrade typing_extensions

Note: you may need to restart the kernel to use updated packages.


In [21]:
pip install langchain-huggingface

Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [23]:
from langchain_postgres import PGVector

In [24]:
CONNECTION_STRING = "postgresql+psycopg2://postgres:test@localhost:5432/vector_db"
COLLECTION_NAME = "test_vectors"

In [25]:
final_embeddings = sentence_embeddings.numpy().tolist()

In [26]:
from langchain_huggingface import HuggingFaceEmbeddings

In [35]:
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [36]:
vector_store = PGVector(
    embeddings=embedding_model,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
    use_jsonb=True,
)

In [37]:
from langchain_core.documents import Document

In [40]:
docs = []

for i in range(len(sents)):
    doc = Document(
        page_content=sents[i],
        metadata={"label": labels[i]},
    )
    docs.append(doc)

In [41]:
vector_store.add_documents(docs)

['d88b1b03-b3b4-4160-b7e2-6cd72e4a6f25',
 'e207bee0-badc-471f-b497-a262f42498e9',
 '93cb8c66-69b0-4a72-a6f9-15bc2d6839d0',
 'cd6c31b5-e6bc-4c0d-b815-6308208bff5e',
 'b9a2f0ad-949f-47ea-ab6c-db0a6ca1c36d',
 'a1d4cd58-680f-453c-a591-d04ecba70205',
 'acf1af61-1cd3-47ca-b2a7-d31b25905d13',
 'e4a94d55-796c-49b2-b831-1a3075a0b05e',
 'e6285db3-d00c-433e-b98a-1b734cf36a4f',
 '0eb9eeda-65a4-4305-bb88-53a9af749f06',
 'bb349d01-ff8d-4b04-8ab7-336de3f4e355',
 'd9cd3b31-1850-418a-8b4f-091bc9f3bdc6',
 '39037d79-6943-49a2-ac10-a57556ce263b',
 '3b0bbeb0-c313-42e2-b0f0-f6ef8f84df80',
 'cda8fbe1-a349-4645-b2f4-ac2fdb95d93b',
 '772cfb3d-f9ec-474f-8b73-def0421cda6b',
 'e3067fe4-1046-49d3-a2f7-0d0a3d66717a',
 '0c4d2a6f-de93-4b98-b4ab-1442223ffa13',
 'a707c2f3-dc35-4351-9894-046a2aba39c7',
 'd1b476c4-9099-4a84-a1ea-4b5e5f6c93b6']

In [43]:
results = vector_store.similarity_search(
    "Wow I did not think that was going to happen", k=5
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

OperationalError: (psycopg2.OperationalError) could not receive data from server: Software caused connection abort (0x00002745/10053)

[SQL: SELECT langchain_pg_collection.uuid AS langchain_pg_collection_uuid, langchain_pg_collection.name AS langchain_pg_collection_name, langchain_pg_collection.cmetadata AS langchain_pg_collection_cmetadata 
FROM langchain_pg_collection 
WHERE langchain_pg_collection.name = %(name_1)s 
 LIMIT %(param_1)s]
[parameters: {'name_1': 'test_vectors', 'param_1': 1}]
(Background on this error at: https://sqlalche.me/e/20/e3q8)