### TESTING DATABASE CREATION AND CONNECTION

In [1]:
from database.neo4jConnection_utils import db_connection, get_credential
import database.create_db as create_db

DATA_PATH = "./dataset"
cred = get_credential()
client_gds = db_connection(cred)

query_test = """
    MATCH (n)
    RETURN count(n) AS node_count
"""

node_count = client_gds.run_cypher(query_test).iloc[0, 0]
if node_count == 0:
    create_db.load_csv(DATA_PATH)
    create_db.create_constraints(client_gds)
    create_db.create_db(client_gds)
else:
    print('LOG: db already created')






LOG connection to DB:
                                            key       value
0                                    gdsVersion      2.10.1
1                                    gdsEdition  Unlicensed
2                                  neo4jVersion      5.22.0
3                    minimumRequiredJavaVersion          17
4                        availableCompatibility  Neo4j 5.22
..                                          ...         ...
86                 server.memory.pagecache.size   536870912
87  server.memory.off_heap.transaction_max_size  2147483648
88            dbms.memory.transaction.total.max  8589934592
89              db.memory.transaction.total.max           0
90                    db.memory.transaction.max           0

[91 rows x 2 columns]
--------------> connection to db is OK
LOG: db already created


## TESTING CONNECTION WITH OLLAMA
- language model api
- embeddings


In [2]:
from database.neo4jConnection_utils import get_credential

cred = get_credential()

In [3]:
# OLLAMA: SEE XTERM CONSOLE ABOVE TO CHECK FOR OTHER MODEL TO PULL
# Import Ollama module from Langchain
from langchain_community.llms import Ollama
LLM = cred['LLM']
print('--------->', LLM)
# Initialize an instance of the Ollama model
llm = Ollama(model=LLM)

test = True
if test:
  # Invoke the model to generate responses
  prompt = "Tell me a joke in two line"
  response = llm.invoke(prompt)
  print(f'***OLLAMA SANITY CHECK***\nprompt:{prompt}', '\nresponse:', response)

---------> llama3.1:8b
***OLLAMA SANITY CHECK***
prompt:Tell me a joke in two line 
response: Here is a two-line joke:

What do you call a fake noodle?
An impasta.


In [4]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

if 'llama' in LLM:
  embedding_model = OllamaEmbeddings(model=LLM, base_url='http://localhost:11434')
  embedding_dimension = 4096
  print('USING OLLAMA')
else:
  print('WARNING not running on llama instance')

if test:
  test_embedding = embedding_model.embed_query("My query to look up")
  print('***OLLAMA EMBEDDING SANITY CHECK*****')
  print('len(test_embedding) -> ', len(test_embedding))
  print('test_embedding -> ', test_embedding[0:10])

USING OLLAMA


***OLLAMA EMBEDDING SANITY CHECK*****
len(test_embedding) ->  4096
test_embedding ->  [2.1392972469329834, -3.0549228191375732, 0.29549142718315125, 0.4371836483478546, -0.6623877286911011, -1.4102567434310913, 2.5391085147857666, 1.2455378770828247, -3.345935344696045, -0.5352067947387695]


### TESTING DESCRIPTION EMBEDDING

In [6]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from embeddings.utils import concat_description
from embeddings.embed_description import embed_description

product_df = pd.read_csv('dataset/product.csv')

In [7]:
description_df = concat_description(product_df)

In [8]:
print(description_df.head())

   productCode                                               text
0       108775  \n##Product\nName: Strap top\nType: Vest top\n...
1       110065  \n##Product\nName: OP T-shirt (Idro)\nType: Br...
2       111565  \n##Product\nName: 20 den 1p Stockings\nType: ...
3       111586  \n##Product\nName: Shape Up 30 den 1p Tights\n...
4       111593  \n##Product\nName: Support 40 den 1p Tights\nT...


- the description embedding method work but it is very long to compute on local so you can find a minimaliste code here: https://colab.research.google.com/drive/1rIt-Sn3rJQc4dbTLy9-mZc-1vveUJnIT?usp=sharing
that runs the function.

- you can then export the generated csv that has the following format:

productCode  |  text  |  embedding  |

In [9]:
# description_df = embed_description(description_df, embedding_model)

Embedded 500 of 8018


KeyboardInterrupt: 

### LOADING THE EMBEDDINGS INTO THE DATABASE
- for this two steps:
    - run cypher query on every embbeding and push them to the matching product code
    - create a vector index wich allow you to run the neo4j built in function to retrieve vectors based on similarity coefficient

In [6]:
import pandas as pd
import ast
product_embedding = pd.read_csv('dataset/product_embedding.csv')



In [7]:
# Convert the string representation to a list of floats
product_embedding['textEmbedding'] = product_embedding['textEmbedding'].apply(lambda x: [float(i) for i in ast.literal_eval(x)])

# Verify the conversion
print(product_embedding['textEmbedding'].head())
print(type(product_embedding['textEmbedding'].iloc[0]))
print(type(product_embedding['textEmbedding'].iloc[0][0]))

0    [0.5340319275856018, -0.47775566577911377, -0....
1    [2.1701207160949707, -0.5895195603370667, 0.98...
2    [2.6462013721466064, -1.6008479595184326, 0.05...
3    [1.896992802619934, -1.3056349754333496, -0.85...
4    [2.645840644836426, -1.0651317834854126, 0.135...
Name: textEmbedding, dtype: object
<class 'list'>
<class 'float'>


In [8]:
product_embedding.head()

Unnamed: 0,productCode,text,textEmbedding
0,108775,\n##Product\nName: Strap top\nType: Vest top\n...,"[0.5340319275856018, -0.47775566577911377, -0...."
1,110065,\n##Product\nName: OP T-shirt (Idro)\nType: Br...,"[2.1701207160949707, -0.5895195603370667, 0.98..."
2,111565,\n##Product\nName: 20 den 1p Stockings\nType: ...,"[2.6462013721466064, -1.6008479595184326, 0.05..."
3,111586,\n##Product\nName: Shape Up 30 den 1p Tights\n...,"[1.896992802619934, -1.3056349754333496, -0.85..."
4,111593,\n##Product\nName: Support 40 den 1p Tights\nT...,"[2.645840644836426, -1.0651317834854126, 0.135..."


In [9]:
records_dict = product_embedding[['productCode', 'textEmbedding']].to_dict('records')
print(type(records_dict), len(records_dict), type(records_dict[0]), records_dict[0].keys())
print()


<class 'list'> 8018 <class 'dict'> dict_keys(['productCode', 'textEmbedding'])



In [10]:
from embeddings.push_embeddings import push_embeddings

push_embeddings(client_gds, records_dict)


staging 8,018 records
Set 100 of 8,018 text embeddings
Set 200 of 8,018 text embeddings
Set 300 of 8,018 text embeddings
Set 400 of 8,018 text embeddings
Set 500 of 8,018 text embeddings
Set 600 of 8,018 text embeddings
Set 700 of 8,018 text embeddings
Set 800 of 8,018 text embeddings
Set 900 of 8,018 text embeddings
Set 1,000 of 8,018 text embeddings
Set 1,100 of 8,018 text embeddings
Set 1,200 of 8,018 text embeddings
Set 1,300 of 8,018 text embeddings
Set 1,400 of 8,018 text embeddings
Set 1,500 of 8,018 text embeddings
Set 1,600 of 8,018 text embeddings
Set 1,700 of 8,018 text embeddings
Set 1,800 of 8,018 text embeddings
Set 1,900 of 8,018 text embeddings
Set 2,000 of 8,018 text embeddings
Set 2,100 of 8,018 text embeddings
Set 2,200 of 8,018 text embeddings
Set 2,300 of 8,018 text embeddings
Set 2,400 of 8,018 text embeddings
Set 2,500 of 8,018 text embeddings
Set 2,600 of 8,018 text embeddings
Set 2,700 of 8,018 text embeddings
Set 2,800 of 8,018 text embeddings
Set 2,900 of 8,0

#### Creating the vector index

- the ***create_vector_index*** functionis configurated to ease the workload of vector similarity search. I specified a similarity function into the query and neo4j take care of organising the index based on most similar vector according to the similarity function wich is a requirement for real time application

the ***CALL db.awaitIndex("product_text_embeddings", 300)*** ensure that the index is properly set up before making any further request on the DB that would request the vector index

In [11]:
from embeddings.push_embeddings import create_vector_index

create_vector_index(client_gds, 4096)


In [12]:
query_prompt = '''##Product
Name: Box 4p Kneehighs
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Four pairs of knee highs. 20 denier.'''

embedded_prompt = embedding_model.embed_query(query_prompt)

print('stats:')
print('query prompt : ', query_prompt)
print('len : ', len(query_prompt))
print('embedded prompt : ', embedded_prompt[0:10])
print('len embedded prompt : ', len(embedded_prompt))


stats:
query prompt :  ##Product
Name: Box 4p Kneehighs
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Four pairs of knee highs. 20 denier.
len :  158
embedded prompt :  [-0.34917518496513367, -2.622393846511841, 1.4032275676727295, -1.2686604261398315, 1.7714356184005737, -1.5883234739303589, -2.3025829792022705, 0.37134072184562683, -0.9117132425308228, -1.2360694408416748]
len embedded prompt :  4096


In [13]:
print(embedded_prompt)
res = client_gds.run_cypher('''
CALL db.index.vector.queryNodes("product_text_embeddings", 10, $queryVector)
YIELD node AS product, score
RETURN product.productCode AS productCode,
    product.text AS text,
    score
''', params={'queryVector': embedded_prompt})

[-0.34917518496513367, -2.622393846511841, 1.4032275676727295, -1.2686604261398315, 1.7714356184005737, -1.5883234739303589, -2.3025829792022705, 0.37134072184562683, -0.9117132425308228, -1.2360694408416748, 1.7151645421981812, -3.230201244354248, 1.0329676866531372, 3.588286876678467, 2.56642746925354, -0.70607590675354, -1.8925795555114746, -0.828697144985199, 0.3444577157497406, 1.062218189239502, -1.488359808921814, 0.17563828825950623, -1.9500230550765991, -1.7601841688156128, -2.8800785541534424, -0.7455834150314331, -0.7225560545921326, -2.452418565750122, -0.23346614837646484, 2.0072968006134033, 1.1991820335388184, -0.4709888696670532, -2.2807953357696533, -1.6996557712554932, 0.6855659484863281, 0.34314727783203125, -1.3461915254592896, 0.8291614055633545, -2.4462389945983887, 3.237294912338257, -1.51541006565094, 0.6401684284210205, -0.8333976864814758, 2.798381805419922, -0.037843868136405945, 1.551081657409668, 0.7675260305404663, 0.1791975498199463, -0.3946724832057953, 

In [14]:
for rows in res.iterrows():
    print(rows[1]['productCode'], rows[1]['text'], rows[1]['score'])
    print()


111609 ##Product
Name: 200 den 1p Tights
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Opaque matt tights. 200 denier. 0.7259818315505981

215589 ##Product
Name: Mama 40 den 2p Tights
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Tights with extra space for a growing tummy. 40 denier. 0.7246626615524292

156227 ##Product
Name: Box 4p Kneehighs
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Four pairs of knee highs. 20 denier. 0.7221738696098328

417951 ##Product
Name: Support 20 den 1p tights
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist. 20 denier. 0.7201794385910034

234421 ##Product
Name: Mama 200 den Tights
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Desc

### Now using the langchain integration of neo4j

In [18]:
from langchain.vectorstores.neo4j_vector import Neo4jVector
from database.neo4jConnection_utils import get_credential

cred = get_credential()
vector_search = Neo4jVector.from_existing_index(
    embedding=embedding_model,
    url=cred['NEO4J_URI'],
    username=cred['NEO4J_USERNAME'],
    password=cred['NEO4J_PASSWORD'],
    index_name='product_text_embeddings'
)   


In [19]:
retr = vector_search.similarity_search('''##Product
Name: Box 4p Kneehighs
Type: Underwear Tights
Group: Socks & Tights
Garment Type: Socks and Tights
Description: Four pairs of knee highs. 20 denier.''', k=10)

print(len(retr), type(retr[0]))

10 <class 'langchain_core.documents.base.Document'>


In [None]:
for i, stuff in enumerate(retr):
    print(f''' ######## RETRIEVAL N{i+1} ########
    {stuff.page_content}
    ###########################################
    \n
    ''')

### NODE EMBEDDING FOR GRAPH DATASCIENCE

In [15]:
from database.create_db import create_node_embedding

knn_stats = create_node_embedding(client_gds)

K-Nearest Neighbours:   0%|          | 0/100 [00:00<?, ?%/s]

In [16]:
print(knn_stats)

ranIterations                                                             7
didConverge                                                            True
nodePairsConsidered                                                 7068227
preProcessingMillis                                                       0
computeMillis                                                          2910
writeMillis                                                            1089
postProcessingMillis                                                      0
nodesCompared                                                         13296
relationshipsWritten                                                 125497
similarityDistribution    {'min': 0.750030517578125, 'p5': 0.87169265747...
configuration             {'writeProperty': 'score', 'writeRelationshipT...
Name: 0, dtype: object


In [7]:
%load_ext autoreload
%autoreload 2
import store_generator.personnalised_reco as requests

customer_id = '0132cd2eb3c6b1f66784f65f94ddd8352add2653e0caf5fc564fcfe4eb977863'
credentials = cred
res_sim = requests.kg_personalized_search_gen(customer_id, credentials, embedding_model)

print(type(res_sim))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
<class 'langchain_community.vectorstores.neo4j_vector.Neo4jVector'>


In [26]:
res_gds = requests.kg_recommendations_app_dict({"customer_id":customer_id,
                                                "credentials": credentials,
                                                  "k":15})

print(type(res_gds))

<class 'str'>


In [36]:
search_prompt = '''type: swimsuit'''
ret = res_sim.similarity_search(search_prompt)

print(len(ret))

4


In [37]:
i = 0
while i < len(ret):
    print(ret[i])
    print()

    i += 1

page_content='##Product
Name: Greece Shape Swimsuit
Type: Bikini top
Group: Swimwear
Garment Type: Swimwear
Description: Fully lined swimsuit with a sculpting effect that holds in and shapes the tummy and bum. Cups with removable inserts, wide, adjustable shoulder straps and decorative gathers in the sides.
url: https://representative-domain/product/811777' metadata={'source': 'https://representative-domain/product/811777', 'purchaseScore': 2, 'searchScore': 0.6903656721115112}

page_content='##Product
Name: Lizzy Dancesuit
Type: Bodysuit
Group: Garment Upper body
Garment Type: Accessories
Description: Leotard in fast-drying functional fabric with decorative gathers at the top, crossover straps at the back and 3/4-length sleeves. Lined gusset.
url: https://representative-domain/product/705466' metadata={'source': 'https://representative-domain/product/705466', 'purchaseScore': 0, 'searchScore': 0.6937686204910278}

page_content='##Product
Name: Veronica swimsuit
Type: Swimsuit
Group: S

In [29]:
print(res_gds)

##Product
Name: SF Caprio blk short sleeve
Type: Dress
Group: Garment Full body
Garment Type: Special Offers
Description: Long, fitted, smocked dress in a crisp organic cotton weave. Square neckline and concealed elastication at the top to hold the dress in place. Wide, half-length sleeves with a voluminous puff at the shoulders and smocking at the cuffs. Gathered seam at the hem and a flared skirt. Unlined.
url: https://representative-domain/product/868544
score:30.432336390018463

##Product
Name: Roberto
Type: Sweater
Group: Garment Upper body
Garment Type: Knitwear
Description: Jumper in a soft cable knit containing some wool with a rib-knit stand-up collar and wide raglan sleeves with foldover ribbing at the cuffs. The polyester content of the jumper is recycled.
url: https://representative-domain/product/788651
score:27.913103342056274

##Product
Name: Price TEE TVP
Type: T-shirt
Group: Garment Upper body
Garment Type: Jersey Fancy
Description: T-shirt in soft cotton jersey with r

## TESTING THE FINAL STORE GENERATION ON A CUSTOMER ID
- customerID: '0132cd2eb3c6b1f66784f65f94ddd8352add2653e0caf5fc564fcfe4eb977863'

In [5]:
%load_ext autoreload
%autoreload 2

from store_generator.prompt_chain import chain_gen
from langchain_community.llms import Ollama
from database.neo4jConnection_utils import get_credential

cred = get_credential()
llm = Ollama(model=cred['LLM'], temperature=0.2)

customer_id = '0132cd2eb3c6b1f66784f65f94ddd8352add2653e0caf5fc564fcfe4eb977863'
gen = chain_gen(customer_id, llm, cred, embedding_model)

In [8]:
search_prompt = 'cute swimsuit body'
ret = gen["output"].invoke({'searchPrompt':search_prompt, 
                  'customerName':'Mimi', 
                  'timeOfYear':'Jul, 2024'})

In [9]:

print(type(gen))
print(type(ret))
print(type(gen['prompt']))

<class 'dict'>
<class 'str'>
<class 'langchain_core.runnables.base.RunnableSequence'>


In [10]:
prompt = gen['prompt'].invoke({'searchPrompt':search_prompt, 
                  'customerName':'Mimi', 
                  'timeOfYear':'Jul, 2024'})

In [11]:

RESULT_PATH = './result/'
with open(RESULT_PATH + "answer.txt", "w") as file1:
    file1.write(ret)

# Write the second string to another file
with open(RESULT_PATH + "prompt.txt", "w") as file2:
    file2.write(prompt)

# THE GRADIO INTERFACE

In [2]:
from database.create_db import get_credential
creds = get_credential()

In [None]:
%load_ext autoreload
%autoreload 2

from interface import gradio_interace as gri

model_ui = gri.ModelInterface(creds)
interface = model_ui.get_interface()
interface.launch(debug=True)

## THE GRADIO INTERFACE V2

In [17]:
from database.create_db import get_credential
creds = get_credential()

In [19]:
import interface.interfacev2 as uiv2

engine = uiv2.ModelInterface(creds)
gradio_ui = engine.get_interface()
gradio_ui.launch()


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Content written to ./result/prompt_00
Content written to ./result/out_00
Content written to ./result/prompt_01.txt
Content written to ./result/out_01.txt
