In [1]:
import pandas as pd
from nlp_toolkit import (
    set_proxy,
    SentenceEmbedder,
    make_chroma,
    insert_df_collection
)

import chromadb

# If your network access is restricted, route traffic through a VPN/proxy.
# set_proxy() configures the proxy for the current environment; as its printed
# message says, the VPN serving that proxy must be switched on.
set_proxy()

Using http proxy: http://127.0.0.1:8001
Turn on VPN with corresponding proxy.


In [2]:
# Load the App Store descriptions. The recorded preview shows the columns
# id, track_name, size_bytes and app_desc, plus an "Unnamed: 0" column.
description = pd.read_csv("appleStore_description.csv")
# Bare last expression so the notebook renders the (truncated) dataframe.
description

Unnamed: 0,id,track_name,size_bytes,app_desc
0,281656475,PAC-MAN Premium,100788224,"SAVE 20%, now only $3.99 for a limited time!\n..."
1,281796108,Evernote - stay organized,158578688,Let Evernote change the way you organize your ...
2,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,Download the most popular free weather app pow...
3,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,The eBay app is the best way to find anything ...
4,282935706,Bible,92774400,On more than 250 million devices around the wo...
...,...,...,...,...
7192,1187617475,Kubik,126644224,Place the falling blocks correctly in order to...
7193,1187682390,VR Roller-Coaster,120760320,A thrilling virtual reality roller coaster exp...
7194,1187779532,Bret Michaels Emojis + Lyric Keyboard,111322112,"Rock star Bret Michaels, winner of Celebrity A..."
7195,1187838770,VR Roller Coaster World - Virtual Reality,97235968,VR Roller Coaster World is an app for Google C...


1. Implement an embedding function.
2. Make the embedding function compatible with ChromaDB.
3. Insert the embeddings into ChromaDB (running with GPU support is preferred).
4. Query documents with the vector database.

In [3]:
# Raw description text, one string per dataframe row
sentences = description.app_desc.tolist()

# initialize an embedder
# NOTE(review): mps=True presumably selects Apple's Metal (MPS) backend — confirm in nlp_toolkit
embedder = SentenceEmbedder(mps=True, batch_size=256)

# wrap the embedder so it can be passed to Chroma as an embedding function
chroma_embedder = make_chroma(embedder)

# pre-compute the embeddings for every description
# (the slow step: 29 batches, about a minute in the recorded run)
embeded_sentences = embedder(sentences)

# create a client that persists the database under ./chroma_storage
chroma_client = chromadb.PersistentClient(path='./chroma_storage')

# Get the collection, creating it only if it does not exist yet.
# create_collection() raises when "app" is already present in the persisted
# store, so the original cell could not be re-run after a kernel restart.
# NOTE(review): when the collection already exists, how the metadata argument
# is treated depends on the chromadb version — confirm the cosine space is kept.
collection = chroma_client.get_or_create_collection(
    name="app",
    embedding_function=chroma_embedder,
    metadata={"hnsw:space": "cosine"},
)

# insert data into the chromadb collection: documents come from 'app_desc',
# record ids from 'id', and 'track_name' is stored as metadata
# NOTE(review): on a re-run this is called again with the same ids; confirm
# how insert_df_collection handles ids that are already in the collection.
insert_df_collection(
    collection,
    embeded_sentences,
    description,
    'app_desc',
    'id',
    meta_cols=['track_name'],
)


Embedding Batches: 100%|██████████| 29/29 [00:57<00:00,  1.98s/it]


Successful added to collection


In [4]:
import chromadb
from nlp_toolkit import (
    set_proxy,
    SentenceEmbedder,
    make_chroma,
    update_metafield
)

# Re-apply the proxy settings (same call as in the first cell).
set_proxy()

# Rebuild the embedder with the same arguments used when the collection was populated.
embedder = SentenceEmbedder(mps = True, batch_size=256)

# Chroma-compatible wrapper; the recorded "Embedding Batches" output of the
# query cells suggests it is what embeds query_texts at query time.
chroma_embedder = make_chroma(embedder)

# Open the on-disk store at the same path the collection was written to.
chroma_client = chromadb.PersistentClient(path='./chroma_storage')

# Fetch the existing "app" collection (no creation, no re-embedding of the corpus).
collection = chroma_client.get_collection(name="app", embedding_function=chroma_embedder)

Using http proxy: http://127.0.0.1:8001
Turn on VPN with corresponding proxy.


In [5]:
# Free-text query describing music / instrument apps
music_query = "music music score, instrument, guitar, piano, bass, band, sound, tunning, scale,"

# Ask the collection for the 20 nearest app descriptions
results = collection.query(query_texts=[music_query], n_results=20)

# Show only the metadata (track names) of the hits
results["metadatas"]

Embedding Batches: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]


[[{'track_name': 'TonalEnergy Chromatic Tuner and Metronome'},
  {'track_name': 'iReal Pro - Music Book & Play Along'},
  {'track_name': 'Easy Music - Give kids an ear for music'},
  {'track_name': 'PrestoBand Guitar and Piano'},
  {'track_name': 'Piano - Play Keyboard Music Games with Magic Tiles'},
  {'track_name': 'ABRSM Aural Trainer Grades 1-5'},
  {'track_name': 'Magic Piano by Smule'},
  {'track_name': 'Free Piano app by Yokee'},
  {'track_name': 'SOUND Canvas'},
  {'track_name': 'Mastering the piano with Lang Lang'},
  {'track_name': 'Final Guitar - absolute guitar app'},
  {'track_name': 'Guitar Suite - Metronome, Tuner, and Chords Library for Guitar, Bass, Ukulele'},
  {'track_name': 'Tongo Music - for kids and families'},
  {'track_name': 'QQ音乐-来这里“发现・音乐”'},
  {'track_name': 'OnSong'},
  {'track_name': 'Music Memos'},
  {'track_name': 'Cytus'},
  {'track_name': 'Musicloud Pro - MP3 and FLAC Music Player for Cloud Platforms.'},
  {'track_name': 'Musicloud - MP3 and FLAC Music

In [6]:
# Free-text query describing social / dating apps
social_query = "sex, meet friends in your city, social, make love, communication, marry, marriage, single"

# Ask the collection for the 30 nearest app descriptions
results = collection.query(query_texts=[social_query], n_results=30)

# Show only the metadata (track names) of the hits
results["metadatas"]

Embedding Batches: 100%|██████████| 1/1 [00:00<00:00,  1.38it/s]


[[{'track_name': 'A New Life'},
  {'track_name': 'Hey! VINA - Where Women Meet New Friends'},
  {'track_name': 'Live.me – Live Video Chat & Make Friends Nearby'},
  {'track_name': 'Match™ - #1 Dating App.'},
  {'track_name': 'Hero Generations'},
  {'track_name': 'Cougar Dating & Life Style App for Mature Women'},
  {'track_name': 'My Virtual Girlfriend - Deluxe Dating Sim'},
  {'track_name': 'Dice Mania - Play Free Online Classic Board Game with Friends'},
  {'track_name': 'Mystic Messenger'},
  {'track_name': 'HARVEST MOON: Seeds Of Memories'},
  {'track_name': 'Megapolis'},
  {'track_name': 'Suppin Detective: Expose their true visage!'},
  {'track_name': 'eHarmony™ Dating App - Meet Singles'},
  {'track_name': 'Beast Poker'},
  {'track_name': 'Face Up - The Selfie Game'},
  {'track_name': 'Village Life: Love, Marriage and Babies'},
  {'track_name': 'happn — Dating app — Find and meet your crush'},
  {'track_name': 'Toca Life: City'},
  {'track_name': 'Century City'},
  {'track_name':

In [7]:
from nlp_toolkit import mean_seed_query

In [15]:
# Ids (as strings) of the apps used as seeds for the similarity search.
# NOTE(review): presumably these match the 'id' column used as record ids at insert time — confirm.
seed_apps = ['1093776653', '1043337296', '1089836344', '305939712', '1072881532']

# Query the collection from the seed apps and display the hits' metadata.
# NOTE(review): the helper's name suggests the seeds' embeddings are averaged into a
# single query vector, and n is presumably the number of results — confirm in nlp_toolkit.
mean_seed_query(collection, seed_apps,n =50)['metadatas']

[[{'track_name': 'Hey! VINA - Where Women Meet New Friends'},
  {'track_name': 'Live.me – Live Video Chat & Make Friends Nearby'},
  {'track_name': 'A New Life'},
  {'track_name': 'Match™ - #1 Dating App.'},
  {'track_name': 'Hero Generations'},
  {'track_name': 'Cougar Dating & Life Style App for Mature Women'},
  {'track_name': 'Toca Life: City'},
  {'track_name': 'The Sims™ FreePlay'},
  {'track_name': 'Meipai'},
  {'track_name': 'Dice With Buddies: Fun New Social Dice Game'},
  {'track_name': 'LINE PLAY - Your Avatar World'},
  {'track_name': 'Super Fashion Show - Girls Makeup, Dressup Games'},
  {'track_name': 'Village Life: Love, Marriage and Babies'},
  {'track_name': 'Builder Buddies: 3D City Building Simulator'},
  {'track_name': 'ReRunners: Race for the World'},
  {'track_name': 'Toca Life: Town'},
  {'track_name': 'Grindr - Gay and same sex guys chat, meet and date'},
  {'track_name': 'The Game of Life'},
  {'track_name': 'Whispered Secrets: Into the Beyond - A Hidden Object

In [16]:
from nlp_toolkit import concat_text_query

# Alternative seed-based query using the seed_apps list defined in an earlier cell.
# NOTE(review): the helper's name suggests the seeds' description texts are concatenated
# and embedded as one query (the recorded output shows one embedding batch) — confirm.
concat_text_query(collection, seed_apps,n =50)['metadatas']

Embedding Batches: 100%|██████████| 1/1 [00:00<00:00,  1.34it/s]


[[{'track_name': 'Match™ - #1 Dating App.'},
  {'track_name': 'Hey! VINA - Where Women Meet New Friends'},
  {'track_name': 'Cougar Dating & Life Style App for Mature Women'},
  {'track_name': 'eHarmony™ Dating App - Meet Singles'},
  {'track_name': 'Bumble – Find a Date, Meet Friends & Network'},
  {'track_name': 'MeetMe - Chat and Meet New People'},
  {'track_name': 'Grindr - Gay and same sex guys chat, meet and date'},
  {'track_name': 'POF - Best Dating App for Conversations'},
  {'track_name': 'Tinder'},
  {'track_name': 'Badoo - Meet New People, Chat, Socialize.'},
  {'track_name': 'IAmNaughty – Dating App to Meet New People Online'},
  {'track_name': 'happn — Dating app — Find and meet your crush'},
  {'track_name': "Kitty Powers' Matchmaker"},
  {'track_name': 'It Girl Story - Styling, Fashion & Celebrity Life'},
  {'track_name': 'Live.me – Live Video Chat & Make Friends Nearby'},
  {'track_name': 'Tantan'},
  {'track_name': 'TETRIS® FREE'},
  {'track_name': 'LongStory: Choose 

In [19]:
# Free-text query for video / TV apps. n_results is not passed, so the library
# default applies (10 hits came back in the recorded output).
collection.query(query_texts=['videos, movies, tv shows, talk show'])['metadatas']

Embedding Batches: 100%|██████████| 1/1 [00:00<00:00,  8.28it/s]


[[{'track_name': 'TED'},
  {'track_name': 'Vlogger Go Viral - Clicker Game & Vlog Simulator'},
  {'track_name': 'BuzzFeed Video'},
  {'track_name': 'Talking Tom & Ben News for iPad'},
  {'track_name': 'Disney XD – Watch Full Episodes, Movies & Live TV'},
  {'track_name': 'YouTube Kids'},
  {'track_name': 'Disney Channel – Watch Full Episodes, Movies & TV'},
  {'track_name': 'Fullscreen'},
  {'track_name': 'Hoodclips'},
  {'track_name': 'Disney LOL'}]]

In [20]:
# Write label="unknown" into the records' metadata.
# NOTE(review): ids=None presumably targets every record in the collection — confirm in nlp_toolkit.
update_metafield(collection, ids = None, label = "unknown")

Update metdata successfully.


In [22]:
# Peek at the metadata of 10 records to check the metadata update above.
collection.get(limit = 10)['metadatas']

[{'label': 'unknown',
  'track_name': 'PAC-MAN Premium',
  'update_time': '2023-10-31 22:43:48'},
 {'label': 'unknown',
  'track_name': 'Evernote - stay organized',
  'update_time': '2023-10-31 22:43:48'},
 {'label': 'unknown',
  'track_name': 'WeatherBug - Local Weather, Radar, Maps, Alerts',
  'update_time': '2023-10-31 22:43:48'},
 {'label': 'unknown',
  'track_name': 'eBay: Best App to Buy, Sell, Save! Online Shopping',
  'update_time': '2023-10-31 22:43:48'},
 {'label': 'unknown',
  'track_name': 'Bible',
  'update_time': '2023-10-31 22:43:48'},
 {'label': 'unknown',
  'track_name': 'Shanghai Mahjong',
  'update_time': '2023-10-31 22:43:48'},
 {'label': 'unknown',
  'track_name': 'PayPal - Send and request money safely',
  'update_time': '2023-10-31 22:43:48'},
 {'label': 'unknown',
  'track_name': 'Pandora - Music & Radio',
  'update_time': '2023-10-31 22:43:48'},
 {'label': 'unknown',
  'track_name': 'PCalc - The Best Calculator',
  'update_time': '2023-10-31 22:43:48'},
 {'labe