In [1]:
import pandas as pd

from sqlalchemy import Column, Integer, String, create_engine, text, select
from sqlalchemy.sql import func
from sqlalchemy.orm import sessionmaker, declarative_base
from pgvector.sqlalchemy import Vector

In [2]:
csv_path = "celebs_embeddings.csv"

In [3]:
# Load the embeddings from the .csv
embeddings_df = pd.read_csv(csv_path)
embeddings_df.shape, embeddings_df.columns

((3366, 5),
 Index(['filename', 'filepath', 'celebname', 'embedding', 'target'], dtype='object'))

In [4]:
embeddings_df.sample(3)

Unnamed: 0,filename,filepath,celebname,embedding,target
436,Denzel_Washington_16.jpg,raw_dataset_PROCESSED/Celebrity_Faces_Dataset/...,Denzel_Washington,"[0.06667304039001465, 0.04279664158821106, 0.0...",1
1594,Jean_Chretien_17.jpg,raw_dataset_PROCESSED/Faces_in_the_Wild/Jean_C...,Jean_Chretien,"[0.028942354023456573, 0.04327131435275078, -0...",0
390,Megan_Fox_71.jpg,raw_dataset_PROCESSED/Celebrity_Faces_Dataset/...,Megan_Fox,"[-0.036409199237823486, 0.0006065219640731812,...",1


In [5]:
# Define connection parameters
params = {
    "host": "localhost",
    "port": 5432,
    "database": "celebsdb",
    "user": "docker",
    "password": "docker",
    "table": "celeb_embeddings",
    "server": "proper_door_database_1"
}

# Define the connection string
# Format: dialect+driver://username:password@host:port/database
connection_string = 'postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}'.format(**params)

# Create an engine
engine = create_engine(connection_string)

# Create a session
Session = sessionmaker(bind=engine)
session = Session()

# Enable the pgvector extension
session.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))
session.commit()

In [6]:
Base = declarative_base()

class CelebEmbedding(Base):
    __tablename__ = 'celeb_embeddings'

    id = Column(Integer, primary_key=True)
    filename = Column(String)
    filepath = Column(String)
    celebname = Column(String)
    target = Column(String)
    embedding = Column(Vector(512))  

# Create the table
Base.metadata.create_all(engine)

In [7]:
# Store the DataFrame into the table
embeddings_df.to_sql(name=CelebEmbedding.__tablename__, con=engine, if_exists='append', index=False)

session.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))

<sqlalchemy.engine.cursor.CursorResult at 0x7fee392f6f20>

In [8]:
# Now you can use this model to query the "celeb_embeddings" table
random_celeb_embedding = session.query(CelebEmbedding).order_by(func.random()).first()

# Print the random celeb's name and embedding
print(random_celeb_embedding.celebname, random_celeb_embedding.embedding[:10])

Will_Smith [-0.00677141  0.004841   -0.05397051 -0.04053113 -0.00108151 -0.02236098
  0.0430319  -0.02607022 -0.03225573 -0.02051348]


In [None]:
# session.scalars(select(CelebEmbedding).order_by(CelebEmbedding.embedding.l2_distance(random_celeb_embedding.embedding)).limit(5))

In [9]:
# Query the table
query = select(CelebEmbedding).order_by(CelebEmbedding.embedding.l2_distance(random_celeb_embedding.embedding)).limit(5)
results = session.scalars(query).all()

# Print the celebname for each result
for result in results:
    print(result.celebname)

Will_Smith
Will_Smith
Will_Smith
Will_Smith
Will_Smith


In [10]:
# Close the session
session.close()
# Dispose the engine
engine.dispose()