In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Find the local .env file, then read it; the return value is deliberately discarded.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

In [2]:
import cohere
# Create the Cohere client using the key stored in the COHERE_API_KEY environment variable.
# A missing variable raises KeyError here rather than failing later on the first API call.
co = cohere.Client(api_key=os.environ['COHERE_API_KEY'])

In [3]:
import pandas as pd

### Word Embeddings

In [4]:
# Three single words used as a small test input for the embedding call below.
three_words = pd.DataFrame(
    {'text': ['mac', 'linux', 'windows']}
)
three_words

Unnamed: 0,text
0,mac
1,linux
2,windows


Create embeddings for the three words as a quick test.

In [5]:
# Request embeddings for the three words from the 'embed-english-v2.0' model
# and keep only the `.embeddings` attribute of the response.
three_words_emb = co.embed(
    model='embed-english-v2.0',
    texts=three_words['text'].tolist(),
).embeddings

In [6]:
# Bind the first three embedding vectors to individual names for inspection.
word_1, word_2, word_3 = (
    three_words_emb[0],
    three_words_emb[1],
    three_words_emb[2],
)

In [7]:
# Peek at the first 10 components of the first word's embedding vector.
word_1[:10]

[2.203125,
 -0.30615234,
 -0.7167969,
 -1.2265625,
 0.31054688,
 -1.4736328,
 0.3876953,
 -1.5556641,
 -0.15014648,
 0.8671875]

### Sentence Embeddings

In [8]:
# Eight short sentences arranged as four question/answer pairs
# (each question is immediately followed by its answer).
sentences = pd.DataFrame(
    [
        'Where is the world cup?',
        'The world cup is in Qatar',
        'What color is the sky?',
        'The sky is blue',
        'Where does the bear live?',
        'The bear lives in the the woods',
        'What is an apple?',
        'An apple is a fruit',
    ],
    columns=['text'],
)

sentences

Unnamed: 0,text
0,Where is the world cup?
1,The world cup is in Qatar
2,What color is the sky?
3,The sky is blue
4,Where does the bear live?
5,The bear lives in the the woods
6,What is an apple?
7,An apple is a fruit


In [9]:
# Embed every sentence with the 'embed-english-v2.0' model, then print the
# first three components of each returned vector as a quick preview.
emb = co.embed(
    model='embed-english-v2.0',
    texts=sentences['text'].tolist(),
).embeddings
for vector in emb:
    print(vector[:3])

[0.27319336, -0.37768555, -1.0273438]
[0.49804688, 1.2236328, 0.4074707]
[-0.23571777, -0.9375, 0.9614258]
[0.08300781, -0.32080078, 0.9272461]
[0.49780273, -0.35058594, -1.6171875]
[1.2294922, -1.3779297, -1.8378906]
[0.15686035, -0.92041016, 1.5996094]
[1.0761719, -0.7211914, 0.9296875]


In [10]:
# Length of the first sentence embedding, i.e. the number of components per vector.
len(emb[0])

4096

In [11]:
from utils import umap_plot

In [12]:
# Build a chart from the sentences and their embeddings using the local `utils.umap_plot` helper.
# NOTE(review): presumably this projects the embeddings to 2D with UMAP — confirm in utils.py.
chart = umap_plot(sentences, emb)

In [13]:
# Display the sentence chart; the result of `.interactive()` is the cell output.
# NOTE(review): assumes `chart` is an Altair-style chart where this enables pan/zoom — confirm against utils.umap_plot.
chart.interactive()

Notice that each question and its answer end up close to each other in the embedding space. This similarity can be used for question answering; the same approach also works if you replace this dataset with search queries and the documents that match them.

### Wikipedia dump articles embeddings

In [14]:
# Load the Wikipedia article table (including its `emb` column) from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load 'wikipedia.pkl' from a trusted source.
wiki_articles = pd.read_pickle('wikipedia.pkl')
wiki_articles

Unnamed: 0,id,title,text,url,wiki_id,views,paragraph_id,langs,emb
0,0,24-hour clock,The 24-hour clock is a way of telling the time...,https://simple.wikipedia.org/wiki?curid=9985,9985,2450.625488,0,30,"[0.07711287587881088, 0.3197174072265625, -0.2..."
1,14,The Dark Knight Trilogy,"The ""Dark Knight"" Series is a set of three Chr...",https://simple.wikipedia.org/wiki?curid=377304,377304,1897.982666,0,13,"[0.2566547989845276, -0.17023412883281708, 0.1..."
2,19,Abella Danger,"Abella Danger (born November 19, 1995) is an A...",https://simple.wikipedia.org/wiki?curid=797944,797944,1748.024170,0,30,"[-0.20083625614643097, -0.14190533757209778, -..."
3,24,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro (born 5 Fe...,https://simple.wikipedia.org/wiki?curid=65655,65655,1564.210449,0,156,"[-0.4638298451900482, -0.0351627953350544, 0.7..."
4,61,Carles Puigdemont,Carles Puigdemont i Casamajó (born 29 December...,https://simple.wikipedia.org/wiki?curid=540154,540154,1542.519531,0,81,"[-0.12089978903532028, 0.06191902980208397, 0...."
...,...,...,...,...,...,...,...,...,...
1995,34633,1st century,"During this period Europe, North Africa and th...",https://simple.wikipedia.org/wiki?curid=24589,24589,122.295425,0,133,"[-0.33043625950813293, -0.234648197889328, -0...."
1996,34653,Operation Restore Hope,The Operation Restore Hope was an operation of...,https://simple.wikipedia.org/wiki?curid=427200,427200,122.192032,0,12,"[0.5195494294166565, -0.06794795393943787, 0.2..."
1997,34655,Rumi,Jalal ad-Din Muhammad Rumi (30 September 1207 ...,https://simple.wikipedia.org/wiki?curid=64995,64995,122.192032,0,101,"[-0.254226416349411, 0.6597043871879578, -0.00..."
1998,34669,Korean War,"The Korean War (Korean: 한국전잴, Russian: Корейск...",https://simple.wikipedia.org/wiki?curid=7537,7537,122.175140,0,120,"[0.22879508137702942, -0.1242295652627945, -0...."


In [15]:
import numpy as np
from utils import umap_plot_big

In [16]:
# Keep only the title and text columns; these are passed to the plot helper with the embeddings.
articles = wiki_articles[['title', 'text']]
# Collect the per-row `emb` entries into a single NumPy array.
wiki_embeds = np.array(wiki_articles['emb'].tolist())

# Rebinds `chart`, replacing the sentence chart created earlier in the notebook.
chart = umap_plot_big(articles, wiki_embeds)

In [17]:
# Display the Wikipedia articles chart; the result of `.interactive()` is the cell output.
# NOTE(review): assumes `chart` is an Altair-style chart where this enables pan/zoom — confirm against utils.umap_plot_big.
chart.interactive()