In [1]:
# %%
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from IPython.core.getipython import get_ipython
import os

# Get the current working directory
cwd = os.getcwd()

dotenv_path = os.path.join(cwd, '..', '.env')
load_dotenv(dotenv_path)

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# model = SentenceTransformer("BAAI/bge-small-en-v1.5")

In [3]:
file_path = '../data/pg1513.txt'
loader = TextLoader(file_path)
documents = loader.load()

In [4]:
print(len(documents))

1


In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)


In [6]:
texts = text_splitter.split_documents(documents)


In [7]:
print(texts[0])
print(len(texts))

page_content='\ufeffThe Project Gutenberg eBook of Romeo and Juliet\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: Romeo and Juliet\n\n\nAuthor: William Shakespeare\n\nRelease date: November 1, 1998 [eBook #1513]\n                Most recently updated: June 27, 2023\n\nLanguage: English\n\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***\n\n\n\nTHE TRAGEDY OF ROMEO AND JULIET\n\nby William Shakespeare\n\n\n\n\nContents\n\nTHE PROLOGUE.\n\nACT I\nScene I. A public place.\nScene II. A Street.\nScene III. Room in Capulet’s House.\nScene IV. A Street.\nScene V. 

In [8]:
HF_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

In [8]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
doc_vectors = embeddings.embed_documents([t.page_content for t in texts])

In [9]:
doc_vectors[0]

[0.00766746373847127,
 -0.052848972380161285,
 0.08701644837856293,
 0.01128862239420414,
 0.04267968609929085,
 0.021348267793655396,
 -0.07447012513875961,
 0.05210120603442192,
 0.028560617938637733,
 0.014492927119135857,
 -0.01018309686332941,
 0.09782399982213974,
 0.0049798632971942425,
 -0.035418473184108734,
 0.07265106588602066,
 -0.04062623903155327,
 0.0012055092956870794,
 -0.0019085939275100827,
 0.06241024658083916,
 0.04830075800418854,
 0.08555746078491211,
 0.001338187837973237,
 0.047592271119356155,
 -0.0058006104081869125,
 -0.05588964745402336,
 -0.04210874065756798,
 0.030639665201306343,
 0.008161087520420551,
 -0.08715153485536575,
 -0.03123178333044052,
 -0.028832273557782173,
 -0.07196307927370071,
 0.04718378558754921,
 -0.04661862179636955,
 0.029547052457928658,
 -0.0011739371111616492,
 0.018841546028852463,
 0.011335798539221287,
 0.036084841936826706,
 -0.008777371607720852,
 -0.013559538871049881,
 -0.010434328578412533,
 0.00447857566177845,
 0.074405

In [11]:
# # Assuming `texts` is a list of your documents
# doc_vectors = model.encode([t.page_content for t in texts[:5]])

In [12]:
# doc_vectors[0]


In [13]:
# embedding_list = doc_vectors.tolist()

In [14]:
# embedding_list[0]

In [10]:
from langchain.vectorstores.pgvector import PGVector
CONNECTION_STRING = "postgresql+psycopg2://postgres:test@localhost:5432/vector_db"
COLLECTION_NAME = 'state_of_the_union'

In [11]:
db = PGVector.from_documents(embedding=embeddings, documents=texts, collection_name=COLLECTION_NAME, connection_string=CONNECTION_STRING)


In [None]:
# class SentenceTransformerWrapper:
#     def __init__(self, model):
#         self.model = model

#     def embed_documents(self, texts):
#         return self.model.encode(texts)

# # Create a wrapper for your model
# model_wrapper = SentenceTransformerWrapper(model)

# # Use the wrapper to create a PGVector instance
# db = PGVector.from_documents(embedding=model_wrapper, documents=texts, collection_name=CONNECTION_STRING, connection_string=COLLECTION_NAME)

In [12]:
query = "Who is Juliet's father?"

similar = db.similarity_search_with_score(query, k=2)

for doc in similar:
    print(doc)

(Document(page_content='PARIS.\nImmoderately she weeps for Tybalt’s death,\nAnd therefore have I little talk’d of love;\nFor Venus smiles not in a house of tears.\nNow, sir, her father counts it dangerous\nThat she do give her sorrow so much sway;\nAnd in his wisdom, hastes our marriage,\nTo stop the inundation of her tears,\nWhich, too much minded by herself alone,\nMay be put from her by society.\nNow do you know the reason of this haste.\n\nFRIAR LAWRENCE.\n[_Aside._] I would I knew not why it should be slow’d.—\nLook, sir, here comes the lady toward my cell.\n\n Enter Juliet.\n\nPARIS.\nHappily met, my lady and my wife!\n\nJULIET.\nThat may be, sir, when I may be a wife.\n\nPARIS.\nThat may be, must be, love, on Thursday next.\n\nJULIET.\nWhat must be shall be.\n\nFRIAR LAWRENCE.\nThat’s a certain text.\n\nPARIS.\nCome you to make confession to this father?\n\nJULIET.\nTo answer that, I should confess to you.\n\nPARIS.\nDo not deny to him that you love me.\n\nJULIET.\nI will confes

In [14]:
print(embeddings.embed_query(query))

[-0.05212350934743881, -0.010337615385651588, 0.033933546394109726, 0.004484337288886309, -0.01979895681142807, 0.04183520749211311, 0.044246215373277664, 0.05790005996823311, 0.008366436697542667, 0.018237870186567307, 0.00514712231233716, -0.006419340614229441, -0.041555341333150864, -0.005851348862051964, -0.027940018102526665, 0.005596392787992954, -0.007709875237196684, 0.010461365804076195, -0.010739810764789581, 0.0791887640953064, -0.011798620223999023, -0.054482411593198776, 0.022976728156208992, -0.005365259945392609, 0.005747067742049694, 0.02852531708776951, 0.03425212204456329, 0.041572023183107376, -0.013946890830993652, -0.01171580608934164, -0.045744333416223526, -0.051674503833055496, 0.009281058795750141, 0.005332327447831631, -0.06281071901321411, -0.020621009171009064, 0.05716460198163986, 0.061911024153232574, 0.023762181401252747, -0.005879259202629328, -0.016468657180666924, -0.0024796021170914173, 0.004382645711302757, 0.05651291459798813, -0.021756045520305634,

: 

In [None]:
"""
SELECT document, (embedding <=> '[-0.05212350934743881, -0.010337615385651588, 0.033933546394109726, 0.004484337288886309, -0.01979895681142807, 0.04183520749211311, 0.044246215373277664, 0.05790005996823311, 0.008366436697542667, 0.018237870186567307, 0.00514712231233716, -0.006419340614229441, -0.041555341333150864, -0.005851348862051964, -0.027940018102526665, 0.005596392787992954, -0.007709875237196684, 0.010461365804076195, -0.010739810764789581, 0.0791887640953064, -0.011798620223999023, -0.054482411593198776, 0.022976728156208992, -0.005365259945392609, 0.005747067742049694, 0.02852531708776951, 0.03425212204456329, 0.041572023183107376, -0.013946890830993652, -0.01171580608934164, -0.045744333416223526, -0.051674503833055496, 0.009281058795750141, 0.005332327447831631, -0.06281071901321411, -0.020621009171009064, 0.05716460198163986, 0.061911024153232574, 0.023762181401252747, -0.005879259202629328, -0.016468657180666924, -0.0024796021170914173, 0.004382645711302757, 0.05651291459798813, -0.021756045520305634, -0.042381368577480316, 0.012229538522660732, 0.030165676027536392, 0.03875338286161423, 0.08400031179189682, -0.07128783315420151, 0.017837166786193848, -0.0850025936961174, -0.0725954994559288, -0.02100125513970852, 0.04899616166949272, 0.039723288267850876, 0.03062480501830578, 0.08209677785634995, 0.05395367369055748, -0.044569242745637894, 0.03668825328350067, -0.062102060765028, 0.026804665103554726, 0.0018385458970442414, 0.015894688665866852, 0.015797268599271774, -0.05061888322234154, -0.0687350481748581, 0.037524547427892685, 0.026279455050826073, -0.01311060693114996, 0.040577661246061325, -0.05519722402095795, -0.016347061842679977, 0.018799064680933952, -0.031435780227184296, -0.00878479890525341, -0.06315769255161285, -0.008611700497567654, -0.08435359597206116, -0.06592649966478348, -0.052892494946718216, 0.10641449689865112, -0.024976931512355804, -0.0071994103491306305, 0.08263921737670898, -0.026617759838700294, -0.02143964171409607, 0.03319283947348595, -0.04643759876489639, -0.10404635965824127, -0.04315786808729172, 0.04896826669573784, -0.11383327841758728, 0.06577373296022415, -0.027357067912817, 0.048374250531196594, -0.08239087462425232, 0.005905522033572197, -0.0003428938507568091, 0.08612111210823059, 0.06282750517129898, 0.0706491693854332, -0.03759843111038208, -0.007898107171058655, -0.025115452706813812, -0.010352913290262222, 0.009660734795033932, 0.008824861608445644, -0.027659794315695763, -0.08818916976451874, 0.02789321541786194, 0.0405513234436512, 0.03749769181013107, -0.004397010896354914, 0.04765026643872261, -0.03029584139585495, -0.03168633207678795, 0.0020704653579741716, 0.08929325640201569, 0.08309854567050934, 0.012538348324596882, 0.00845444668084383, -0.058337558060884476, -0.018336482346057892, -0.022845245897769928, -6.08740806567059e-33, -0.03394746407866478, 0.02421823889017105, 0.04485902190208435, 0.06912323832511902, -0.018935633823275566, 0.05473102629184723, -0.02507621794939041, 0.07070969045162201, -0.09535062313079834, -0.04653673991560936, -0.03917478770017624, -0.11354642361402512, -0.030110109597444534, -0.054346006363630295, -0.0386943444609642, 0.05570237338542938, 0.01797213964164257, -0.0016107967821881175, 0.036276888102293015, 0.05356072634458542, 0.0015882938168942928, 0.09267588704824448, -0.027394581586122513, -0.0368276946246624, -0.05957729369401932, 0.03555478900671005, 0.05680283531546593, 0.10139920562505722, -0.013128269463777542, -0.0010209586471319199, -0.020458480343222618, 0.03829586133360863, 0.051061000674963, -0.00730497669428587, 0.05504433438181877, 0.01077357493340969, -0.10130061954259872, -0.06371156871318817, 0.04517978057265282, 0.07016010582447052, -0.0418388694524765, 0.026031723245978355, -0.012024607509374619, 0.0034768630284816027, -0.04134642705321312, -0.08291976153850555, -0.025925379246473312, -0.024003155529499054, 0.08129504323005676, -0.011984573677182198, 0.021209625527262688, -0.03003181517124176, -0.03361848369240761, 0.02020936645567417, 0.09126971662044525, 0.08183272927999496, -0.03156649321317673, 0.05845704674720764, 0.044735509902238846, -0.03834887221455574, 0.11663205921649933, -0.04775824025273323, 0.06243472546339035, 0.09781795740127563, 0.02910977602005005, -0.009925254620611668, 0.02935759164392948, -0.05134338140487671, 0.023856567218899727, -0.059531766921281815, -0.0324581153690815, -0.018481744453310966, 0.0042444271966814995, 0.004102929029613733, -0.041432447731494904, 0.04527757316827774, 0.010578712448477745, -0.004873898345977068, -0.06532800197601318, -0.08701610565185547, -0.10443370044231415, -0.023683305829763412, 0.018717894330620766, 0.019163623452186584, -0.09107967466115952, -0.06276010721921921, -0.017469117417931557, -0.017098046839237213, -0.08553391695022583, 0.025226367637515068, -0.017664534971117973, -0.05560842156410217, -0.07690312713384628, -0.13341277837753296, -0.023365654051303864, 3.063135872809349e-33, 0.0065019926987588406, -0.018982460722327232, 0.04527172073721886, 0.022694820538163185, 0.0011937564704567194, -0.08676525205373764, 0.0009367907186970115, 0.048014286905527115, 0.0540943369269371, 0.06122688949108124, -0.055036962032318115, -0.07949787378311157, 0.03246443718671799, -0.0666620209813118, 0.01965547911822796, 0.0793556198477745, 0.09975157678127289, -0.015137327834963799, 0.007913252338767052, -0.0669422373175621, -0.09460736066102982, 0.055341869592666626, -0.016278568655252457, -0.005966901779174805, 0.019186943769454956, 0.003327179467305541, 0.002754191169515252, -0.003958836663514376, -0.09057001769542694, -0.016031533479690552, -0.042585041373968124, 0.07473650574684143, 0.037549085915088654, -0.04586237668991089, -0.03380904346704483, 0.09396025538444519, 0.04772963374853134, 0.02079184353351593, 0.07668808847665787, 0.020645327866077423, 0.0919744223356247, 0.0005599879659712315, 0.04483315348625183, 0.07794304937124252, 0.039159223437309265, 0.03259200230240822, 0.01879953034222126, 0.07819188386201859, 0.047805946320295334, 0.06183687224984169, -0.04264609515666962, 0.011694008484482765, 0.015850184485316277, -0.014515089802443981, 0.07100207358598709, -0.03480278328061104, 0.047241128981113434, -0.06676985323429108, -0.04236923158168793, -0.029584448784589767, 0.10173717886209488, 0.03585474193096161, -0.0754762515425682, 0.029251061379909515, -0.015282701700925827, 0.09873048216104507, -0.13959261775016785, 0.007741102017462254, -0.02924184314906597, 0.019561614841222763, 0.029504820704460144, -0.06252725422382355, -0.009811478666961193, 0.011517198756337166, -0.01052857842296362, 0.015271946787834167, -0.08627552539110184, -0.01982763223350048, -0.02401113510131836, -0.051039572805166245, -0.09253557026386261, -0.04155074059963226, 0.011440817266702652, -0.008895316161215305, -0.012291383929550648, -0.10965228080749512, -0.015525027178227901, -0.003773878561332822, 0.02438085339963436, -0.028880521655082703, 0.026283862069249153, -0.07528864592313766, 0.03565054014325142, -0.13418176770210266, 0.027960503473877907, -1.5635931660540336e-08, 0.04757577180862427, -0.014960864558815956, -0.0037621965166181326, -0.07127683609724045, -0.08138328790664673, 0.004862150643020868, 0.019414201378822327, 0.03761504217982292, -0.014965681359171867, 0.10760960727930069, -0.025596778839826584, 0.01710449904203415, 0.0018523609032854438, -0.03970811888575554, 0.09755636751651764, 0.01722886599600315, 0.08215437829494476, -0.06721220910549164, -0.060194484889507294, 0.031603969633579254, 0.06458482146263123, -0.0032491630408912897, 0.01900658570230007, 0.013129089958965778, -0.02792476676404476, -0.00052388955373317, 0.015025490894913673, 0.002343545900657773, -0.07888210564851761, 0.026389649137854576, 0.05386478453874588, 0.008539974689483643, -0.032421939074993134, -0.11720459163188934, -0.06665528565645218, 0.03947934880852699, 0.07614333927631378, -0.002650856738910079, 0.03642065450549126, -0.004107160959392786, 0.02708755061030388, 0.038036447018384933, -0.09045606851577759, 0.026063408702611923, 0.07517017424106598, 0.022580940276384354, 0.03070307895541191, 0.0576753169298172, -0.012930587865412235, 0.0014078165404498577, -0.06669620424509048, 0.034279417246580124, -0.0032792428974062204, -0.06370456516742706, -0.031638044863939285, -0.05305458605289459, 0.03456581011414528, 0.09758666157722473, -0.032755736261606216, 0.0007232898497022688, 0.04832566902041435, 0.06003280356526375, 0.18833354115486145, -0.08621963113546371]
') as cosine_distance
FROM langchain_pg_embedding
ORDER BY cosine_distance
LIMIT 2    
"""