### TESTING DATABASE CREATION AND CONNECTION

In [1]:
from database.neo4jConnection_utils import db_connection, get_credential
import database.create_db as create_db

cred = get_credential()
client_gds = db_connection(cred)

query_test = """
    MATCH (n)
    RETURN count(n) AS node_count
"""

node_count = client_gds.run_cypher(query_test).iloc[0, 0]
if node_count == 0:
    create_db.create_constraints(client_gds)
    create_db.create_db(client_gds)
else:
    print('LOG: db already created')






LOG connection to DB:
                                            key       value
0                                    gdsVersion       2.8.0
1                                    gdsEdition  Unlicensed
2                                  neo4jVersion      5.22.0
3                    minimumRequiredJavaVersion          17
4                      unavailableCompatibility  Neo4j 5.16
..                                          ...         ...
90                 server.memory.pagecache.size   536870912
91  server.memory.off_heap.transaction_max_size  2147483648
92            dbms.memory.transaction.total.max  8589934592
93              db.memory.transaction.total.max           0
94                    db.memory.transaction.max           0

[95 rows x 2 columns]
--------------> connection to db is OK
LOG: db already created


## TESTING CONNECTION WITH OLLAMA

In [4]:
from database.neo4jConnection_utils import get_credential

cred = get_credential()

In [5]:
# OLLAMA: SEE XTERM CONSOLE ABOVE TO CHECK FOR OTHER MODEL TO PULL
# Import Ollama module from Langchain
from langchain_community.llms import Ollama
LLM = cred['LLM']
print('--------->', LLM)
# Initialize an instance of the Ollama model
llm = Ollama(model=LLM)

test = True
if test:
  # Invoke the model to generate responses
  prompt = "Tell me a joke in two line"
  response = llm.invoke(prompt)
  print(f'***OLLAMA SANITY CHECK***\nprompt:{prompt}', '\nresponse:', response)

---------> llama3.1:8b
***OLLAMA SANITY CHECK***
prompt:Tell me a joke in two line 
response: Here's a two-line joke:

Why did the scarecrow win an award?
Because he was outstanding in his field.


In [6]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

if 'llama' in LLM:
  embedding_model = OllamaEmbeddings(model=LLM, base_url='http://localhost:11434')
  embedding_dimension = 4096
  print('USING OLLAMA')
else:
  print('WARNING not running on llama instance')

if test:
  test_embedding = embedding_model.embed_query("My query to look up")
  print('***OLLAMA EMBEDDING SANITY CHECK*****')
  print('len(test_embedding) -> ', len(test_embedding))
  print('test_embedding -> ', test_embedding[0:10])

USING OLLAMA
***OLLAMA EMBEDDING SANITY CHECK*****
len(test_embedding) ->  4096
test_embedding ->  [2.1036903858184814, -3.066297769546509, 0.3909342885017395, 0.4271194040775299, -0.624779224395752, -1.34486722946167, 2.5913209915161133, 1.1203879117965698, -3.3904855251312256, -0.4280732274055481]


### TESTING DESCRIPTION EMBEDDING

In [5]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from embeddings.utils import get_full_text
from embeddings.embed_description import embed_description

product_df = pd.read_csv('dataset/product.csv')

In [6]:
description_df = get_full_text(product_df)

In [7]:
print(description_df.head())

   productCode                                               text
0       108775  \n##Product\nName: Strap top\nType: Vest top\n...
1       110065  \n##Product\nName: OP T-shirt (Idro)\nType: Br...
2       111565  \n##Product\nName: 20 den 1p Stockings\nType: ...
3       111586  \n##Product\nName: Shape Up 30 den 1p Tights\n...
4       111593  \n##Product\nName: Support 40 den 1p Tights\nT...


- the description embedding method work but it is very long to compute on local so you can find a minimaliste code here: https://colab.research.google.com/drive/1rIt-Sn3rJQc4dbTLy9-mZc-1vveUJnIT?usp=sharing
that runs the function.

- you can then export the generated csv that has the following format:

productCode  |  text  |  embedding  |

In [8]:
# description_df = embed_description(description_df, embedding_model)

### LOADING THE EMBEDDINGS INTO THE DATABASE
- for this two steps:
    - run cypher query on every embbeding and push them to the matching product code
    - create a vector index wich allow you to run the neo4j built in function to retrieve vectors based on similarity coefficient

In [9]:
import ast
product_embedding = pd.read_csv('dataset/product_embedding.csv')



In [10]:
# Convert the string representation to a list of floats
product_embedding['textEmbedding'] = product_embedding['textEmbedding'].apply(lambda x: [float(i) for i in ast.literal_eval(x)])

# Verify the conversion
print(product_embedding['textEmbedding'].head())
print(type(product_embedding['textEmbedding'].iloc[0]))
print(type(product_embedding['textEmbedding'].iloc[0][0]))

0    [0.5340319275856018, -0.47775566577911377, -0....
1    [2.1701207160949707, -0.5895195603370667, 0.98...
2    [2.6462013721466064, -1.6008479595184326, 0.05...
3    [1.896992802619934, -1.3056349754333496, -0.85...
4    [2.645840644836426, -1.0651317834854126, 0.135...
Name: textEmbedding, dtype: object
<class 'list'>
<class 'float'>


In [11]:
product_embedding.head()

Unnamed: 0,productCode,text,textEmbedding
0,108775,\n##Product\nName: Strap top\nType: Vest top\n...,"[0.5340319275856018, -0.47775566577911377, -0...."
1,110065,\n##Product\nName: OP T-shirt (Idro)\nType: Br...,"[2.1701207160949707, -0.5895195603370667, 0.98..."
2,111565,\n##Product\nName: 20 den 1p Stockings\nType: ...,"[2.6462013721466064, -1.6008479595184326, 0.05..."
3,111586,\n##Product\nName: Shape Up 30 den 1p Tights\n...,"[1.896992802619934, -1.3056349754333496, -0.85..."
4,111593,\n##Product\nName: Support 40 den 1p Tights\nT...,"[2.645840644836426, -1.0651317834854126, 0.135..."


In [12]:
records_dict = product_embedding[['productCode', 'textEmbedding']].to_dict('records')
print(type(records_dict), len(records_dict), type(records_dict[0]), records_dict[0].keys())
print()


<class 'list'> 8018 <class 'dict'> dict_keys(['productCode', 'textEmbedding'])



In [13]:
from embeddings.push_embeddings import push_embeddings

push_embeddings(client_gds, records_dict)


staging 8,018 records
Set 100 of 8,018 text embeddings
Set 200 of 8,018 text embeddings
Set 300 of 8,018 text embeddings
Set 400 of 8,018 text embeddings
Set 500 of 8,018 text embeddings
Set 600 of 8,018 text embeddings
Set 700 of 8,018 text embeddings
Set 800 of 8,018 text embeddings
Set 900 of 8,018 text embeddings
Set 1,000 of 8,018 text embeddings
Set 1,100 of 8,018 text embeddings
Set 1,200 of 8,018 text embeddings
Set 1,300 of 8,018 text embeddings
Set 1,400 of 8,018 text embeddings
Set 1,500 of 8,018 text embeddings
Set 1,600 of 8,018 text embeddings
Set 1,700 of 8,018 text embeddings
Set 1,800 of 8,018 text embeddings
Set 1,900 of 8,018 text embeddings
Set 2,000 of 8,018 text embeddings
Set 2,100 of 8,018 text embeddings
Set 2,200 of 8,018 text embeddings
Set 2,300 of 8,018 text embeddings
Set 2,400 of 8,018 text embeddings
Set 2,500 of 8,018 text embeddings
Set 2,600 of 8,018 text embeddings
Set 2,700 of 8,018 text embeddings
Set 2,800 of 8,018 text embeddings
Set 2,900 of 8,0

#### Creating the vector index

- the ***create_vector_index*** functionis configurated to ease the workload of vector similarity search. I specified a similarity function into the query and neo4j take care of organising the index based on most similar vector according to the similarity function wich is a requirement for real time application

the ***CALL db.awaitIndex("product_text_embeddings", 300)*** ensure that the index is properly set up before making any further request on the DB that would request the vector index

In [14]:
from embeddings.push_embeddings import create_vector_index

create_vector_index(client_gds, 4096)


In [15]:
query_prompt = '''##Product
Name: Box 4p Kneehighs
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Four pairs of knee highs. 20 denier.'''

embedded_prompt = embedding_model.embed_query(query_prompt)

print('stats:')
print('query prompt : ', query_prompt)
print('len : ', len(query_prompt))
print('embedded prompt : ', embedded_prompt[0:10])
print('len embedded prompt : ', len(embedded_prompt))


stats:
query prompt :  ##Product
Name: Box 4p Kneehighs
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Four pairs of knee highs. 20 denier.
len :  158
embedded prompt :  [-0.28010207414627075, -2.592555522918701, 1.4025849103927612, -1.2416536808013916, 1.7908093929290771, -1.6643613576889038, -2.335472345352173, 0.42454540729522705, -0.8457595109939575, -1.1753787994384766]
len embedded prompt :  4096


In [16]:
print(embedded_prompt)
res = client_gds.run_cypher('''
CALL db.index.vector.queryNodes("product_text_embeddings", 10, $queryVector)
YIELD node AS product, score
RETURN product.productCode AS productCode,
    product.text AS text,
    score
''', params={'queryVector': embedded_prompt})

[-0.28010207414627075, -2.592555522918701, 1.4025849103927612, -1.2416536808013916, 1.7908093929290771, -1.6643613576889038, -2.335472345352173, 0.42454540729522705, -0.8457595109939575, -1.1753787994384766, 1.7315059900283813, -3.2220263481140137, 0.8991167545318604, 3.5488500595092773, 2.5905628204345703, -0.8063726425170898, -1.952748417854309, -0.8035945892333984, 0.3151274025440216, 1.1399579048156738, -1.5024257898330688, 0.1506577730178833, -1.9388126134872437, -1.7379331588745117, -2.8788483142852783, -0.8356017470359802, -0.7673285603523254, -2.5146324634552, -0.21937187016010284, 1.8730045557022095, 1.2212496995925903, -0.4759730100631714, -2.3299663066864014, -1.7475095987319946, 0.5399770140647888, 0.4694867432117462, -1.360768437385559, 0.8106553554534912, -2.62448787689209, 3.169856548309326, -1.5364270210266113, 0.6615755558013916, -0.7992470264434814, 2.6818816661834717, -0.1338631510734558, 1.5703526735305786, 0.7927000522613525, 0.23619654774665833, -0.385149389505386

In [17]:
for rows in res.iterrows():
    print(rows[1]['productCode'], rows[1]['text'], rows[1]['score'])
    print()


111609 ##Product
Name: 200 den 1p Tights
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Opaque matt tights. 200 denier. 0.7249389290809631

215589 ##Product
Name: Mama 40 den 2p Tights
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Tights with extra space for a growing tummy. 40 denier. 0.7235618233680725

156227 ##Product
Name: Box 4p Kneehighs
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Four pairs of knee highs. 20 denier. 0.7211600542068481

417951 ##Product
Name: Support 20 den 1p tights
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist. 20 denier. 0.7189300060272217

234421 ##Product
Name: Mama 200 den Tights
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Desc

##### Now using the langchain integration of neo4j

In [18]:
from langchain.vectorstores.neo4j_vector import Neo4jVector
from database.neo4jConnection_utils import get_credential

cred = get_credential()
vector_search = Neo4jVector.from_existing_index(
    embedding=embedding_model,
    url=cred['NEO4J_URI'],
    username=cred['NEO4J_USERNAME'],
    password=cred['NEO4J_PASSWORD'],
    index_name='product_text_embeddings'
)   


In [19]:
retr = vector_search.similarity_search('''##Product
Name: Box 4p Kneehighs
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Four pairs of knee highs. 20 denier.''', k=10)

print(len(retr), type(retr[0]))

10 <class 'langchain_core.documents.base.Document'>


In [21]:
for i, stuff in enumerate(retr):
    print(f''' ######## RETRIEVAL N{i+1} ########
    {stuff.page_content}
    ###########################################
    \n
    ''')

 ######## RETRIEVAL N1 ########3
    ##Product
Name: 200 den 1p Tights
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Opaque matt tights. 200 denier.
    {'prodName': '200 den 1p Tights', 'garmentGroupName': 'Socks and Tights', 'garmentGroupNo': 1021, 'productCode': 111609, 'productTypeName': 'Underwear Tights', 'productTypeNo': 304, 'detailDesc': 'Opaque matt tights. 200 denier.', 'productGroupName': 'Socks & Tights'}
    ###########################################
    

    
 ######## RETRIEVAL N2 ########3
    ##Product
Name: Mama 40 den 2p Tights
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Tights with extra space for a growing tummy. 40 denier.
    {'prodName': 'Mama 40 den 2p Tights', 'garmentGroupName': 'Socks and Tights', 'garmentGroupNo': 1021, 'productCode': 215589, 'productTypeName': 'Underwear Tights', 'productTypeNo': 304, 'detailDesc': 'Tights with extra space for a growing tummy. 40 den

### NODE EMBEDDING FOR GRAPH DATASCIENCE

In [22]:
from database.create_db import create_node_embedding

knn_stats = create_node_embedding(client_gds)

K-Nearest Neighbours:   0%|          | 0/100 [00:00<?, ?%/s]

In [23]:
print(knn_stats)

ranIterations                                                             7
didConverge                                                            True
nodePairsConsidered                                                 7068227
preProcessingMillis                                                       0
computeMillis                                                          3574
writeMillis                                                            1421
postProcessingMillis                                                      0
nodesCompared                                                         13296
relationshipsWritten                                                 125497
similarityDistribution    {'min': 0.750030517578125, 'p5': 0.87169265747...
configuration             {'writeProperty': 'score', 'writeRelationshipT...
Name: 0, dtype: object


In [7]:
%load_ext autoreload
%autoreload 2
import store_generator.personnalised_reco as requests

customer_id = '0132cd2eb3c6b1f66784f65f94ddd8352add2653e0caf5fc564fcfe4eb977863'
credentials = cred
res_sim = requests.kg_personalized_search_gen(customer_id, credentials, embedding_model)

print(type(res_sim))



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
<class 'langchain_community.vectorstores.neo4j_vector.Neo4jVector'>


In [26]:
res_gds = requests.kg_recommendations_app_dict({"customer_id":customer_id,
                                                "credentials": credentials,
                                                  "k":15})

print(type(res_gds))

<class 'str'>


In [36]:
search_prompt = '''type: swimsuit'''
ret = res_sim.similarity_search(search_prompt)

print(len(ret))

4


In [37]:
i = 0
while i < len(ret):
    print(ret[i])
    print()

    i += 1

page_content='##Product
Name: Greece Shape Swimsuit
Type: Bikini top
Group: Swimwear
Garment Type: Swimwear
Description: Fully lined swimsuit with a sculpting effect that holds in and shapes the tummy and bum. Cups with removable inserts, wide, adjustable shoulder straps and decorative gathers in the sides.
url: https://representative-domain/product/811777' metadata={'source': 'https://representative-domain/product/811777', 'purchaseScore': 2, 'searchScore': 0.6903656721115112}

page_content='##Product
Name: Lizzy Dancesuit
Type: Bodysuit
Group: Garment Upper body
Garment Type: Accessories
Description: Leotard in fast-drying functional fabric with decorative gathers at the top, crossover straps at the back and 3/4-length sleeves. Lined gusset.
url: https://representative-domain/product/705466' metadata={'source': 'https://representative-domain/product/705466', 'purchaseScore': 0, 'searchScore': 0.6937686204910278}

page_content='##Product
Name: Veronica swimsuit
Type: Swimsuit
Group: S

In [29]:
print(res_gds)

##Product
Name: SF Caprio blk short sleeve
Type: Dress
Group: Garment Full body
Garment Type: Special Offers
Description: Long, fitted, smocked dress in a crisp organic cotton weave. Square neckline and concealed elastication at the top to hold the dress in place. Wide, half-length sleeves with a voluminous puff at the shoulders and smocking at the cuffs. Gathered seam at the hem and a flared skirt. Unlined.
url: https://representative-domain/product/868544
score:30.432336390018463

##Product
Name: Roberto
Type: Sweater
Group: Garment Upper body
Garment Type: Knitwear
Description: Jumper in a soft cable knit containing some wool with a rib-knit stand-up collar and wide raglan sleeves with foldover ribbing at the cuffs. The polyester content of the jumper is recycled.
url: https://representative-domain/product/788651
score:27.913103342056274

##Product
Name: Price TEE TVP
Type: T-shirt
Group: Garment Upper body
Garment Type: Jersey Fancy
Description: T-shirt in soft cotton jersey with r

## TESTING THE FINAL STORE GENERATION ON A CUSTOMER ID
- customerID: '0132cd2eb3c6b1f66784f65f94ddd8352add2653e0caf5fc564fcfe4eb977863'

In [14]:
%load_ext autoreload
%autoreload 2

from store_generator.prompt_chain import chain_gen
from langchain_community.llms import Ollama
from database.neo4jConnection_utils import get_credential

cred = get_credential()
llm = Ollama(model=cred['LLM'], temperature=0.2)

customer_id = '0132cd2eb3c6b1f66784f65f94ddd8352add2653e0caf5fc564fcfe4eb977863'
gen = chain_gen(customer_id, llm, cred, embedding_model)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
search_prompt = 'cute swimsuit body'
print(gen.invoke({'searchPrompt':search_prompt, 
                  'customerName':'Mimi', 
                  'timeOfYear':'Jul, 2024'}))

=== Prompt to send to LLM ===
   System: 
You speak as you were a personal assistant named Sally, you are a funny and sexy a talker. 
you work for a fashion, home, and beauty company called HRM.
you write an email to a customer named: Mimi so you will start you e-mail adressing to her.
to promote and summarize relevant products for her given the current season / time of year: Jul, 2024.
Please only mention the products listed below. Do not come up with or add any new products to the list.
Each product comes with an https `url` field. Make sure to provide that https url with descriptive name text in markdown for each product.

---
# here is the last request from your customer Mimi
cute swimsuit body

# Relevant Products:
##Product
Name: Simple as That Triangle Top
Type: Bikini top
Group: Swimwear
Garment Type: Swimwear
Description: Lined, non-wired, triangle bikini top with a wide hem. Narrow, adjustable shoulder straps that can be fastened in different ways at the back and cups with re