In [1]:
import pandas as pd

from sqlalchemy import Column, Integer, String, create_engine, text, select
from sqlalchemy.sql import func
from sqlalchemy.orm import sessionmaker, declarative_base
from pgvector.sqlalchemy import Vector

In [2]:
csv_path = "celebs_embeddings.csv"

In [3]:
# Load the embeddings from the .csv
embeddings_df = pd.read_csv(csv_path)
embeddings_df.shape, embeddings_df.columns

((3366, 5),
 Index(['filename', 'filepath', 'celebname', 'embedding', 'target'], dtype='object'))

In [4]:
embeddings_df.sample(3)

Unnamed: 0,filename,filepath,celebname,embedding,target
2663,James_Kelly_02.jpg,raw_dataset_PROCESSED/Faces_in_the_Wild/James_...,James_Kelly,"[0.07445473968982697, 0.01706155762076378, -0....",0
180,Tom_Cruise_44.jpg,raw_dataset_PROCESSED/Celebrity_Faces_Dataset/...,Tom_Cruise,"[0.0582943819463253, -0.048728812485933304, 0....",1
445,Denzel_Washington_67.jpg,raw_dataset_PROCESSED/Celebrity_Faces_Dataset/...,Denzel_Washington,"[0.05230841785669327, 0.05908048525452614, 0.0...",1


In [5]:
# Define connection parameters
params = {
    "host": "localhost",
    "port": 5432,
    "database": "celebsdb",
    "user": "docker",
    "password": "docker",
    "table": "celeb_embeddings",
    "server": "proper_door_database_1"
}

# Define the connection string
# Format: dialect+driver://username:password@host:port/database
connection_string = 'postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}'.format(**params)

# Create an engine
engine = create_engine(connection_string)

# Create a session
Session = sessionmaker(bind=engine)
session = Session()

# Enable the pgvector extension
session.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))
session.commit()

In [6]:
Base = declarative_base()

class CelebEmbedding(Base):
    __tablename__ = 'celeb_embeddings'

    id = Column(Integer, primary_key=True)
    filename = Column(String)
    filepath = Column(String)
    celebname = Column(String)
    target = Column(String)
    embedding = Column(Vector(512))  

# Create the table
Base.metadata.create_all(engine)

In [7]:
# Store the DataFrame into the table
embeddings_df.to_sql(name=CelebEmbedding.__tablename__, con=engine, if_exists='append', index=False)

session.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))

<sqlalchemy.engine.cursor.CursorResult at 0x7fc81ab401c0>

In [8]:
# Now you can use this model to query the "celeb_embeddings" table
random_celeb_embedding = session.query(CelebEmbedding).order_by(func.random()).first()

# Print the random celeb's name and embedding
print(random_celeb_embedding.celebname, random_celeb_embedding.embedding[:10])

Recep_Tayyip_Erdogan [ 6.6561259e-02  1.0950807e-02 -2.8160920e-02 -5.9094649e-02
 -2.8493429e-05 -6.2583722e-02 -4.6132430e-03  4.8698183e-02
 -5.3937636e-02  8.3850883e-02]


In [9]:
# session.scalars(select(CelebEmbedding).order_by(CelebEmbedding.embedding.l2_distance(random_celeb_embedding.embedding)).limit(5))

In [10]:
# Query the table
query = select(CelebEmbedding).order_by(CelebEmbedding.embedding.l2_distance(random_celeb_embedding.embedding)).limit(5)
results = session.scalars(query).all()

# Print the celebname for each result
for result in results:
    print(result.celebname)

Recep_Tayyip_Erdogan
Recep_Tayyip_Erdogan
Recep_Tayyip_Erdogan
Recep_Tayyip_Erdogan
Recep_Tayyip_Erdogan


In [11]:
# Close the session
session.close()
# Dispose the engine
engine.dispose()