## 🐢 Weaviate vector database

Author: J. Visbeen  
Target: practice  
Followed: 101v Work with: Your own vectors (https://weaviate.io/developers/academy/py/starter_custom_vectors)  

Docker run command voor de container:
```
sudo docker-compose up
```

In [46]:
import weaviate
from dotenv import load_dotenv
import os
load_dotenv()

headers = {
    'X-OpenAI-Api-Key': os.getenv('OPENAI_APIKEY')
}

client = weaviate.connect_to_local(headers=headers)

In [27]:
# Test code, closes client connection!
try:
    assert client.is_live()
    pass
finally:
    client.close()

In [29]:
import json

metainfo = client.get_meta()
print(json.dumps(metainfo, indent=2))

{
  "hostname": "http://[::]:8080",
  "modules": {
    "generative-cohere": {
      "documentationHref": "https://docs.cohere.com/reference/generate",
      "name": "Generative Search - Cohere"
    },
    "generative-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "Generative Search - OpenAI"
    }
  },
  "version": "1.24.1"
}


In [None]:
import weaviate.classes.config as wc
client = weaviate.connect_to_local(headers=headers)

client.collections.create(
    name='amazon_products',
    properties=[
        wc.Property(name='title', data_type=wc.DataType.TEXT),
        wc.Property(name='imageUrl', data_type=wc.DataType.TEXT),
        wc.Property(name='productUrl', data_type=wc.DataType.TEXT),
        wc.Property(name='stars', data_type=wc.DataType.INT),
        wc.Property(name='reviews', data_type=wc.DataType.INT),
        wc.Property(name='price', data_type=wc.DataType.NUMBER),
        wc.Property(name='listPrice', data_type=wc.DataType.NUMBER),
        wc.Property(name='categoryName', data_type=wc.DataType.TEXT),
        wc.Property(name='isBestSeller', data_type=wc.DataType.BOOL),
        wc.Property(name='boughtLastMonth', data_type=wc.DataType.INT)
    ],
    vectorizer_config=wc.Configure.Vectorizer.none(),
    generative_config=wc.Configure.Generative.openai()
)

client.close()

In [31]:
import pandas as pd

df_categories = (
    pd.read_csv('../amazon_categories.csv')
    .dropna()
    .reset_index()
)

df_products = (
    pd.read_csv('../amazon_products.csv')
    .dropna()
    .sample(5000, random_state=42)
    .reset_index()
)

merged_df = pd.merge(df_products, df_categories, left_on='category_id', right_on='id', how='left')
merged_df.drop('category_id', axis=1)

Unnamed: 0,index_x,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,isBestSeller,boughtInLastMonth,index_y,id,category_name
0,1208301,B0BKZNNL1V,"LOVEVOOK Diaper Bag Backpack, Diaper Backpack ...",https://m.media-amazon.com/images/I/718pt0oLES...,https://www.amazon.com/dp/B0BKZNNL1V,4.6,0,36.99,0.00,False,0,35,36,Baby Travel Gear
1,1089181,B0968QPTK8,Seloom 148 PCS Reflective Mailbox Numbers and ...,https://m.media-amazon.com/images/I/71Sr2+igJf...,https://www.amazon.com/dp/B0968QPTK8,4.5,286,14.89,19.99,False,50,196,211,Hardware
2,466837,B00NIAULVC,"Spry Xylitol Toothpaste 5oz, Fluoride Toothpas...",https://m.media-amazon.com/images/I/51I687iehe...,https://www.amazon.com/dp/B00NIAULVC,4.7,0,8.49,8.99,False,900,112,126,Oral Care Products
3,39647,B0794ZFTDB,2 Pieces Rhinestone Bow Ties Banquet Bowties M...,https://m.media-amazon.com/images/I/81G1Dc1uZu...,https://www.amazon.com/dp/B0794ZFTDB,4.4,0,12.99,13.99,False,50,101,112,Men's Accessories
4,485619,B075PTHVKN,Mielle Organics Pomegranate & Honey Leave-In C...,https://m.media-amazon.com/images/I/71RNqb6TfJ...,https://www.amazon.com/dp/B075PTHVKN,4.7,0,12.52,14.99,False,10000,45,47,Hair Care Products
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1419223,B0C2PKGCRG,"Tote Handbag Crossbody Shoulder Bag, Canvas Ny...",https://m.media-amazon.com/images/I/41LJWAda2E...,https://www.amazon.com/dp/B0C2PKGCRG,3.9,0,18.99,0.00,False,0,105,118,Women's Handbags
4996,278230,B09WJFJX1W,Simpson Strong Tie APRTA2 Outdoor Accents® ZMA...,https://m.media-amazon.com/images/I/71edOXtsyM...,https://www.amazon.com/dp/B09WJFJX1W,0.0,0,40.39,0.00,False,0,127,141,Industrial Hardware
4997,1374809,B0CGLXS6QF,BRL136 Replacement Heads for Philips Beauty La...,https://m.media-amazon.com/images/I/71+-ZFLJnl...,https://www.amazon.com/dp/B0CGLXS6QF,0.0,0,17.99,0.00,False,0,51,53,Shaving & Hair Removal Products
4998,318330,B00F3F09NU,Arkon Mounts TAB188L22 Car or Truck Seat Rail ...,https://m.media-amazon.com/images/I/517pYYiwuW...,https://www.amazon.com/dp/B00F3F09NU,4.3,0,32.32,37.95,False,0,70,76,Accessories & Supplies


In [32]:
import requests
import os

def query(texts):
    
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    hf_token = os.getenv('HUGGINGFACE_APIKEY')
    print(hf_token)
    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}

    response = requests.post(
        api_url,
        headers=headers,
        json={'inputs': texts, 'options': {'wait_for_model': True}}
    )
    return response.json()

In [None]:
emb_dfs = list()
src_texts = list()
for i,row in enumerate(merged_df.itertuples(index=False)):
    src_text = 'Title' + row.title
    src_texts.append(src_text)

    if(len(src_texts) == 50) or (i+1 == len(merged_df)):
        output = query(src_texts)
        print(output)
        emb_df = pd.DataFrame(output)
        emb_dfs.append(emb_df)
        src_texts = list()

emb_df = pd.concat(emb_dfs)

In [37]:
emb_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.022937,0.050882,0.010885,0.020871,0.016065,0.015316,0.082416,0.026439,-0.009796,0.024806,...,0.076366,0.058996,-0.007883,-0.006861,0.037871,0.080777,0.003805,-0.043692,0.014962,-0.018252
1,-0.023748,-0.027829,0.039611,-0.026064,-0.026058,0.028148,0.02381,0.012893,-0.056767,-0.092917,...,0.006483,0.010279,-0.009074,0.025294,0.086963,-0.010997,-0.029916,0.041554,-0.076306,0.016453
2,-0.01706,-0.096994,0.080717,0.010154,0.057721,0.041141,-0.017184,0.091739,-0.024178,0.003997,...,-0.027514,0.080848,0.041183,-0.031503,0.061485,-0.034853,-0.007153,-0.021261,0.005128,0.033874
3,-0.017216,0.029307,0.018561,0.03239,-0.092339,0.02825,0.054671,-0.058927,-0.030307,-0.07462,...,-0.019505,0.061174,0.100986,0.076957,0.026438,-0.038645,-0.002913,-0.062605,-0.01956,0.036482
4,-0.066412,0.04143,0.024168,-0.024835,-0.047251,0.004067,0.041826,0.067204,0.009152,-0.033587,...,-0.052011,0.04686,0.012766,-0.038181,-0.029821,-0.056746,-0.071247,-0.098143,0.056451,0.016827
5,-0.068404,0.073785,-0.007416,-0.057306,-0.052568,0.005146,0.009472,0.008225,-0.016795,-0.094406,...,0.044339,-0.002395,-0.042168,0.034828,-0.064521,0.027444,0.111388,-0.042831,0.028051,-0.014232
6,-0.061993,0.061445,0.046042,-0.075196,-0.014218,0.01735,0.041884,0.053792,-0.021357,-0.001327,...,-0.016652,0.112115,0.061334,-0.044994,0.053192,-0.08433,0.071888,0.096161,-0.074792,-0.001803
7,-0.034332,0.105377,0.000874,-0.024742,-0.035971,0.032677,0.01016,0.02733,-0.032866,0.02317,...,-0.040546,-0.02362,-0.015956,0.055884,0.09221,-0.021645,-0.005034,-0.005857,-0.026242,0.065983
8,-0.041522,0.020602,0.104403,-0.03196,-0.015903,0.041202,0.02735,-0.049866,-0.040989,0.024788,...,0.108091,0.078839,0.018033,-0.067747,0.047154,0.015166,-0.005292,0.017044,-0.012899,0.070454
9,-0.02654,0.009129,0.019572,-0.005191,-0.046588,-0.00419,0.133072,-0.007904,-0.038943,0.01615,...,-0.055557,-0.001495,-0.004238,0.001453,-0.029417,0.05355,-0.01316,-0.069037,0.014732,0.021122


In [35]:
emb_df.to_csv(
    f'amazon_products_2023_embeddings.csv',
    index=False
)

In [47]:
import weaviate
import pandas as pd
import requests
from datetime import datetime, timezone
import json
from weaviate.util import generate_uuid5
from tqdm import tqdm
import os

client = weaviate.connect_to_local(headers=headers)

amazon_products = client.collections.get('amazon_products')

with amazon_products.batch.dynamic() as batch:

    for i, amazon_product in enumerate(merged_df.itertuples(index=False)):

        amazon_product_obj = {
            'title': amazon_product.title,
            'imageUrl': amazon_product.imgUrl,
            'productUrl': amazon_product.productURL,
            'stars': amazon_product.stars,
            'reviews': amazon_product.reviews,
            'price': amazon_product.price,
            'listPrice': amazon_product.listPrice,
            'categoryName': amazon_product.category_name,
            'isBestSeller': amazon_product.isBestSeller,
            'boughtInLastMonth': amazon_product.boughtInLastMonth
        }

        vector = emb_df.iloc[i].to_list()

        batch.add_object(
            properties=amazon_product_obj,
            uuid=generate_uuid5(amazon_product.id),
            vector=vector
        )

if len(amazon_products.batch.failed_objects) > 0:
    print(f'Failed to import {len(amazon_products.batch.failed_objects)} objects')

client.close()

Failed to import 3934 objects


In [49]:
import weaviate
import weaviate.classes.query as wq
import os

client = weaviate.connect_to_local(headers=headers)

amazon_products = client.collections.get('amazon_products')

response = amazon_products.query.hybrid(
    query='xbox games',
    vector= query('xbox games'),
    limit=10,
    return_metadata=wq.MetadataQuery(score=True)
)
for o in response.objects:
    print(
        o.properties["title"]
    ) 
    print(
        f"Hybrid score: {o.metadata.score:.3f}\n"
    )  

client.close()

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
My Fantastic Ranch (Xbox Series X / Sbox One)
Hybrid score: 1.254

Razer Essential Duo Bundle for Xbox: Kaira X Wired Headset & Universal Quick Charging Stand for Xbox Controllers - Color Matches Official Xbox Series X|S Controllers (Sold Separately) - Electric Volt
Hybrid score: 1.121

Warner Bros Batman: Arkham City
Hybrid score: 1.089

Wired Pro Gaming Controller with ALPS high-precision joystick and ALPS trigger Dual Vibration Gamepad with Back 4 Programmable Keys Turbo for PC Switch Android (Black&LED)
Hybrid score: 0.816

Harry Potter and the Goblet of Fire - Nintendo DS
Hybrid score: 0.732

YODEL WAY 12 pack Reusable Magnetic Water Balloons Yard Games Toddler bath toys Pool Toys Beach Toys Kids Outdoor Games Family Games Toddler pool toys
Hybrid score: 0.695

Goosebumps the Game 3DS - Nintendo 3DS (Renewed)
Hybrid score: 0.668

Briarpatch | Pete The Cat Terrific Taco Game
Hybrid score: 0.618

Lazmin112 2Pcs Thumbstick Soft Silicone Cover Gri