**Importing text lib**

To install gensim, run `pip install gensim`.

In [49]:
import gensim.downloader
import numpy as np
from gensim.models import Word2Vec

**Defining text**

In [57]:
# Ten short example sentences, one per line.
para = '''The cat chased the mouse.
I enjoy eating pizza.
The sun is shining brightly.
She played the piano skillfully.
The car engine revved as it sped down the highway.
The raindrops tapped gently on the windowpane.
He was reading an engaging novel.
The children laughed and played in the playground.
I love going for long walks on the beach.
The teacher explained the concept with clarity and enthusiasm.'''
# Drop the full stops so they do not stay glued to the last word of a sentence.
para = para.replace(".", "")

# One string per line of the paragraph.
sent = para.split("\n")

# Lower-case each sentence and tokenise it on whitespace.
corpus = [sentence.lower().split() for sentence in sent]
corpus

[['the', 'cat', 'chased', 'the', 'mouse'],
 ['i', 'enjoy', 'eating', 'pizza'],
 ['the', 'sun', 'is', 'shining', 'brightly'],
 ['she', 'played', 'the', 'piano', 'skillfully'],
 ['the',
  'car',
  'engine',
  'revved',
  'as',
  'it',
  'sped',
  'down',
  'the',
  'highway'],
 ['the', 'raindrops', 'tapped', 'gently', 'on', 'the', 'windowpane'],
 ['he', 'was', 'reading', 'an', 'engaging', 'novel'],
 ['the', 'children', 'laughed', 'and', 'played', 'in', 'the', 'playground'],
 ['i', 'love', 'going', 'for', 'long', 'walks', 'on', 'the', 'beach'],
 ['the',
  'teacher',
  'explained',
  'the',
  'concept',
  'with',
  'clarity',
  'and',
  'enthusiasm']]

In [50]:
# Load the pretrained 'glove-twitter-25' model through gensim's downloader;
# it provides one 25-dimensional vector per word.
glove_vectors = gensim.downloader.load('glove-twitter-25')



In [62]:
# Sanity check: does the pretrained model return sensible neighbours?

In [58]:
# Ask the model for the words most similar to the lower-cased tokens of a
# sample sentence.
glove_vectors.most_similar("I enjoy eating pizza".lower().split())

[('food', 0.9221873879432678),
 ('eat', 0.921718180179596),
 ('fun', 0.9110204577445984),
 ('home', 0.9097210168838501),
 ('instead', 0.9087727069854736),
 ('drink', 0.906402587890625),
 ('some', 0.9060022830963135),
 ("'m", 0.9031291604042053),
 ('made', 0.901669979095459),
 ('making', 0.8984105587005615)]

In [72]:
# Inspect the raw embedding stored for a single word.
glove_vectors["enjoy"]

array([-0.8138  ,  0.69989 , -0.23549 , -0.096892, -0.35515 ,  0.44054 ,
        1.7738  ,  0.03799 , -0.37699 ,  0.16381 , -0.81882 ,  0.080593,
       -3.5741  ,  0.11813 , -0.092855,  0.25156 ,  0.33449 , -1.0556  ,
       -0.47396 , -0.36944 , -0.79883 , -0.62922 , -1.0395  ,  0.70612 ,
        0.15686 ], dtype=float32)

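Since we want to query sentences rather than single words, we combine the word vectors of each sentence into one sentence vector. The function below sums them; a sum points in the same direction as the average, so cosine similarity ranks sentences identically either way.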

In [None]:
# Map each sentence's position in `sent` to its text, so that a numeric id
# can be turned back into the original sentence.
id_to_sent = dict(enumerate(sent))

In [114]:
def sent_to_vector(corpus, keyed_vectors=None):
    """Turn tokenised sentences into ``(sentence_text, vector)`` pairs.

    A sentence vector is the element-wise sum of the embeddings of its words.
    The sum points in the same direction as the mean, so cosine similarity
    between sentence vectors is not affected by the choice.

    Parameters
    ----------
    corpus : iterable of list of str
        Sentences, each already split into tokens.
    keyed_vectors : optional
        Word-to-vector lookup supporting ``word in keyed_vectors`` and
        ``keyed_vectors[word]``. Defaults to the notebook-level
        ``glove_vectors``.

    Returns
    -------
    list of (str, list of float)
        One pair per sentence: the tokens joined by single spaces, and the
        summed vector as a plain Python list.

    Raises
    ------
    ValueError
        If none of a sentence's words has an embedding (this includes an
        empty sentence), because no vector can be built for it.
    """
    if keyed_vectors is None:
        keyed_vectors = glove_vectors

    vector_data = []
    for tokens in corpus:
        # Words without an embedding are skipped instead of aborting the
        # whole run with a KeyError.
        known = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
        if not known:
            raise ValueError(
                "no known words in sentence: {!r}".format(" ".join(tokens))
            )
        # Accumulate in float64 so the result is independent of the dtype
        # the embeddings are stored in.
        summed = np.sum(np.asarray(known, dtype=np.float64), axis=0)
        vector_data.append((" ".join(tokens), summed.tolist()))
    return vector_data

In [115]:
# Build one (sentence text, vector) pair for every sentence in the corpus.
vector_data = sent_to_vector(corpus)

In [116]:
# Preview the first (sentence text, vector) pair.
vector_data[0]

('the cat chased the mouse',
 [-2.097753996029496,
  -0.6997420080006123,
  3.74414986371994,
  1.3985699713230133,
  -1.0467499792575836,
  -0.4337830077856779,
  5.812720134854317,
  -1.2122599333524704,
  1.1291100680828094,
  0.2251499891281128,
  1.4787900149822235,
  1.618649959564209,
  -20.55910015106201,
  -1.4676549565047026,
  -0.6231200098991394,
  0.39038002490997314,
  3.4229598939418793,
  -0.6336942254565656,
  1.2204300314188004,
  -0.3917349874973297,
  -1.6268289387226105,
  2.1921599209308624,
  2.0557399690151215,
  -3.919379949569702,
  -0.6049499809741974])

**Importing libs for pinecone**

In [94]:
import json
import numpy as np
import pinecone

Reading the API info saved in a JSON file

In [5]:
# The credentials live in a JSON file next to the notebook; the first entry
# of its top-level list holds the connection settings.
with open("./api_info.json", "r") as file:
    api_info = json.load(file)[0]

**Connecting to pinecone**

In [7]:
# Initialise the Pinecone client with the API key and environment read from
# the JSON file.
pinecone.init(api_key=api_info["api_key"], environment=api_info["environment"])

**Creating index**

The default value for `metric` is cosine.

Index names cannot contain an underscore (`_`).

The following example creates an index without a metadata configuration. By default, Pinecone indexes all metadata.

In [104]:
# Create the index only when it is not there yet, so that re-running the
# notebook does not try to create an index that already exists.
# dimension=25 matches the 25-dimensional glove-twitter-25 vectors.
if "index-op" not in pinecone.list_indexes():
    pinecone.create_index("index-op", dimension=25)

**Connecting to index**

In [105]:
# Get a handle to the "index-op" index; it is used for the upsert and the query.
index = pinecone.Index("index-op")

**Inserting data**

In [106]:
# Each item is a (sentence text, vector) tuple, so the sentence text itself
# serves as the id of the stored vector.
index.upsert(vector_data)

{'upserted_count': 10}

**Querying DB**

In [120]:
# Embed an unseen query sentence with the same function used for the corpus.
# NOTE(review): this rebinds `sent`, which earlier held the list of corpus
# sentences, to a one-element list of token lists.
sent = ["teacher and children are playing with cat".split()]
test_vector = sent_to_vector(sent)[0]  # the single (sentence text, vector) pair
test_vector

('teacher and children are playing with cat',
 [-3.2150900214910507,
  0.45599398016929626,
  2.628921963274479,
  -0.6650390140712261,
  0.2121719792485237,
  -1.426639936864376,
  10.166300058364868,
  -2.2912970362231135,
  0.021769024431705475,
  0.5260280072689056,
  0.22117400914430618,
  1.8445659428834915,
  -32.64740037918091,
  -1.6807800121605396,
  0.5909821628592908,
  -0.4874569969251752,
  2.169450044631958,
  -2.7750430554151535,
  1.8976910039782524,
  -1.7930880882777274,
  -0.30639907717704773,
  1.3508699834346771,
  -2.4706099033355713,
  -3.3444470278918743,
  -1.3290099650621414])

In [122]:
# Retrieve the 3 stored sentences closest to the query vector.
index.query(
    vector=test_vector[1],  # element 1 is the vector; element 0 is the sentence text
    top_k=3,
)

{'matches': [{'id': 'the children laughed and played in the playground',
              'score': 0.975262046,
              'values': []},
             {'id': 'the cat chased the mouse',
              'score': 0.959757924,
              'values': []},
             {'id': 'i love going for long walks on the beach',
              'score': 0.95922786,
              'values': []}],
 'namespace': ''}