In [2]:
# Prerequisites
# To complete this tutorial, you will need:
#
# - Docker: the easiest way to use Qdrant is to run a pre-built Docker image.
# - The raw parsed data from startups-list.com.
# - Python version >= 3.8.

In [4]:
#download the dataset.
!wget https://storage.googleapis.com/generall-shared-data/startups_demo.json

--2025-01-20 10:09:05--  https://storage.googleapis.com/generall-shared-data/startups_demo.json
Resolving storage.googleapis.com (storage.googleapis.com)... 2404:6800:4007:816::201b, 2404:6800:4007:815::201b, 2404:6800:4007:818::201b, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|2404:6800:4007:816::201b|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22205751 (21M) [application/json]
Saving to: ‘startups_demo.json’


2025-01-20 10:09:10 (5.93 MB/s) - ‘startups_demo.json’ saved [22205751/22205751]



In [13]:
# pip install sentence-transformers numpy pandas tqdm
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import json

# Embeddings come from the pre-trained all-MiniLM-L6-v2 checkpoint, a
# performance-optimized sentence embedding model.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Show which device the model was placed on.
model.device

device(type='mps', index=0)

In [17]:
# The dataset is line-delimited JSON (lines=True), so each line of the file
# is parsed as one record / DataFrame row.
startups_path = 'startups_demo.json'
df = pd.read_json(startups_path, lines=True)

# Display the frame as the cell output.
df

Unnamed: 0,name,images,alt,description,link,city
0,SaferCodes,https://safer.codes/img/brand/logo-icon.png,SaferCodes Logo QR codes generator system form...,QR codes systems for COVID-19.\nSimple tools f...,https://safer.codes,Chicago
1,Human Practice,https://d1qb2nb5cznatu.cloudfront.net/startups...,Human Practice - health care information tech...,Point-of-care word of mouth\nPreferral is a mo...,http://humanpractice.com,Chicago
2,StyleSeek,https://d1qb2nb5cznatu.cloudfront.net/startups...,StyleSeek - e-commerce fashion mass customiza...,Personalized e-commerce for lifestyle products...,http://styleseek.com,Chicago
3,Scout,https://d1qb2nb5cznatu.cloudfront.net/startups...,Scout - security consumer electronics interne...,Hassle-free Home Security\nScout is a self-ins...,http://www.scoutalarm.com,Chicago
4,Invitation codes,https://invitation.codes/img/inv-brand-fb3.png,Invitation App - Share referral codes community,The referral community\nInvitation App is a so...,https://invitation.codes,Chicago
...,...,...,...,...,...,...
40469,Drunken Moose,https://d1qb2nb5cznatu.cloudfront.net/startups...,Drunken Moose - digital media advertising des...,Branding and Marketing Consultancy Agency\nHel...,http://www.drunkenmoose.com.au,Sydney
40470,AA Adonis Rubbish Removals,https://d1qb2nb5cznatu.cloudfront.net/startups...,AA Adonis Rubbish Removals - cleaning,Rubbish Removals Sydney\nAA Adonis Rubbish Rem...,http://www.aaadonisrubbishremovals.com.au/,Sydney
40471,QualityTrade,https://d1qb2nb5cznatu.cloudfront.net/startups...,QualityTrade - B2B,Merit based wholesale trade platform. \nQualit...,https://www.qualitytrade.com/,Sydney
40472,The Myer Family Company,https://d1qb2nb5cznatu.cloudfront.net/startups...,The Myer Family Company -,MFCo is a family office specialising in design...,http://www.mfco.com.au/,Sydney


In [40]:
# Build one text per startup by joining its `alt` and `description` fields,
# then embed all texts in a single encode call with a progress bar.
texts = [startup.alt + ". " + startup.description for startup in df.itertuples()]
vectors = model.encode(texts, show_progress_bar=True)

Batches:   0%|          | 0/1265 [00:00<?, ?it/s]

In [41]:
# Sanity check on the embedding matrix; the expected result is (40474, 384).
vectors.shape

(40474, 384)

In [42]:
# Persist the embeddings to disk as a plain .npy array; pickling is disabled.
np.save(file="startup_vectors.npy", arr=vectors, allow_pickle=False)



In [43]:
# Import client library
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

# Address of the Qdrant instance this notebook talks to (local, port 6333).
QDRANT_URL = "http://localhost:6333"
client = QdrantClient(QDRANT_URL)

In [44]:
# Create the "startups" collection only when it is not already present, so
# this cell can be re-run without attempting to create it twice.
collection_missing = not client.collection_exists("startups")
if collection_missing:
    # 384-dimensional vectors compared with cosine distance.
    startup_vector_params = VectorParams(size=384, distance=Distance.COSINE)
    client.create_collection(
        collection_name="startups",
        vectors_config=startup_vector_params,
    )

In [45]:
# Read the startup records that accompany the vectors as payload.
# Each line of the file is parsed as one JSON document. The file is opened
# in a `with` block so the handle is always closed, and with an explicit
# UTF-8 encoding so parsing does not depend on the platform's default locale.
# The records are materialised into a list (rather than a lazy, one-shot
# iterator bound to the open file) so the upload cell can be re-run without
# silently receiving an already-exhausted payload.
with open("./startups_demo.json", encoding="utf-8") as fd:
    payload = [json.loads(line) for line in fd]

# Load all vectors into memory; a numpy array is itself iterable row by row.
# If you don't want to hold all the data in RAM, memory-mapping is an
# alternative, e.g. np.load(..., mmap_mode="r").
vectors = np.load("./startup_vectors.npy")

In [46]:
# Upload the vectors together with their payloads into "startups".
#   ids=None       -> vector ids will be assigned automatically
#   batch_size=256 -> number of vectors uploaded in a single request
client.upload_collection(
    collection_name="startups",
    vectors=vectors,
    payload=payload,
    ids=None,
    batch_size=256,
)