In [9]:
# %pip install sentence-transformers pinecone-datasets pandas pinecone-client flask flask_cors

In [2]:
import pinecone
# Display the installed Pinecone client version so the run is reproducible.
pinecone.__version__

'3.2.2'

In [10]:
import pandas as pd

# Load the verse table from a local CSV file.
# The path is a raw string: as a normal literal, the backslashes in this
# Windows path form invalid escape sequences, which newer Python versions
# report with a SyntaxWarning. The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory.
dataset = pd.read_csv(r'E:\Documents\KYH\Data\Quran\sahihinternational.csv')

# The CSV carries a leftover 'Unnamed: 0' index column. It is kept for now;
# to drop it, enable the line below.
# dataset = dataset.drop(columns=['Unnamed: 0'])

In [11]:
# Report the number of rows, then show a preview of the first five rows.
print(dataset.shape[0])
dataset.head(5)

6236


Unnamed: 0,id,surah,verse,text
0,1:1,1,1,"In the name of Allah, the Entirely Merciful, t..."
1,1:2,1,2,"All praise is due to Allah, Lord of the worlds -"
2,1:3,1,3,"The Entirely Merciful, the Especially Merciful,"
3,1:4,1,4,Sovereign of the Day of Recompense.
4,1:5,1,5,It is You we worship and You we ask for help.


In [12]:
# Take an independent copy of the 'text' column so later steps do not touch
# `dataset`. Despite its name, `df` is a pandas Series of verse strings.
df = dataset.loc[:, 'text'].copy()

df.head(5)

0    In the name of Allah, the Entirely Merciful, t...
1     All praise is due to Allah, Lord of the worlds -
2      The Entirely Merciful, the Especially Merciful,
3                  Sovereign of the Day of Recompense.
4        It is You we worship and You we ask for help.
Name: text, dtype: object

In [13]:
# Spot-check ten consecutive verses (positions 70 through 79).
df.iloc[70:80]

70    Then you turned away after that. And if not fo...
71    And you had already known about those who tran...
72    And We made it a deterrent punishment for thos...
73    And recall when Moses said to his people, "Ind...
74    They said, "Call upon your Lord to make clear ...
75    They said, "Call upon your Lord to show us wha...
76    They said, "Call upon your Lord to make clear ...
77    He said, "He says, 'It is a cow neither traine...
78    And recall when you slew a man and disputed ov...
79    So, We said, "Strike the slain man with part o...
Name: text, dtype: object

In [14]:
# Optional truncation (disabled): cap every text at 255 characters.
# df = df.str[:255]

# Rebuild a clean 0..n-1 index; the upsert loop derives record IDs from row
# positions. Reassigning instead of using `inplace=True` keeps this cell
# idempotent and avoids mutating an object that earlier cells displayed.
df = df.reset_index(drop=True)

In [15]:
from sentence_transformers import SentenceTransformer
import torch

# Use a CUDA GPU when torch can see one; otherwise fall back to the CPU and
# warn, because encoding is much slower there.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
    print(f"You are using {device}. This is much slower than using "
          "a CUDA-enabled GPU. If on Colab you can change this by "
          "clicking Runtime > Change runtime type > GPU.")

# Load the sentence-embedding model on the selected device and display it.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
model

You are using cpu. This is much slower than using a CUDA-enabled GPU. If on Colab you can change this by clicking Runtime > Change runtime type > GPU.




SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [16]:
# Sample sentence used to sanity-check the embedding model.
# NOTE(review): `input` shadows the Python builtin of the same name. The next
# cell reads this variable too, so any rename must be applied in both places.
input = 'Satan leads mankind astray'

# Encode the sentence and display the shape of the resulting vector.
xq = model.encode(input)
xq.shape

(384,)

In [17]:
# Build one (id, values, metadata) record for the sample sentence.
_id = '0'
# Keep the original sentence alongside its vector.
metadata = {'text': input}

# NOTE(review): `vectors` is not referenced by any later cell visible here;
# looks like a leftover from an earlier experiment - confirm before removing.
vectors = [(_id, xq, metadata)]

In [18]:
from pinecone import Pinecone, ServerlessSpec
import api

# Pinecone client, authenticated with the key exposed by the local `api` module.
pc = Pinecone(
    api_key=api.API_KEY,
)
# Serverless deployment target passed to `create_index` when the index is built.
spec = ServerlessSpec(
    cloud='aws',
    region='us-east-1',
)

In [19]:
# Name of the Pinecone index that the following cells create, query, and
# (optionally) delete.
index_name = 'semantic-search'

In [20]:
import time

# Create the index only when it is not already listed; otherwise reuse it.
index_missing = index_name not in pc.list_indexes().names()
if index_missing:
    # The vector dimension comes from the embedding model, so the two always agree.
    pc.create_index(
        index_name,
        dimension=model.get_sentence_embedding_dimension(),
        metric='cosine',
        spec=spec
    )
    # Poll once per second until Pinecone reports the new index as ready.
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Open a handle to the index and display its current statistics.
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 6236}},
 'total_vector_count': 6236}

In [21]:
# Display the names of the indexes returned by the Pinecone client.
pc.list_indexes().names()

['semantic-search']

In [None]:
from tqdm.auto import tqdm

# Number of verses embedded and upserted per request.
batch_size = 128

for i in tqdm(range(0, len(df), batch_size)):
    # End of the batch (exclusive), clamped to the number of texts.
    i_end = min(i + batch_size, len(df))
    # Slice the batch once, by position. The earlier `df.loc[i:i_end]` was
    # label-based and therefore end-inclusive: every non-final batch encoded
    # one extra text that `zip` then silently discarded.
    texts = df.iloc[i:i_end].to_list()
    # Record IDs are the row positions as strings.
    ids = [str(x) for x in range(i, i_end)]
    # Store the raw text as metadata so query results are readable.
    metadatas = [{'text': text} for text in texts]
    # One embedding per text in the batch.
    xc = model.encode(texts)
    # Fail loudly if the three parallel sequences ever get out of step.
    assert len(ids) == len(metadatas) == len(xc)
    # Materialise the (id, values, metadata) records and upsert them.
    records = list(zip(ids, xc, metadatas))
    index.upsert(vectors=records)

    print('batch_upserted', i)

# check number of records in the index
index.describe_index_stats()

In [23]:
query = "Satan leads mankind astray"

# Embed the query, then convert the vector to a plain list for the request.
query_embedding = model.encode(query)
xq = query_embedding.tolist()

# Retrieve the five closest records together with their stored metadata.
xc = index.query(
    vector=xq,
    top_k=5,
    include_metadata=True,
)
xc

{'matches': [{'id': '2811',
              'metadata': {'text': 'O you who have believed, do not follow the '
                                   'footsteps of Satan. And whoever follows '
                                   'the footsteps of Satan - indeed, he '
                                   'enjoins immorality and wrongdoing. And if '
                                   'not for the favor of Allah upon you and '
                                   'His mercy, not one of you would have been '
                                   'pure, ever, but Allah purifies whom He '
                                   'wills, and Allah is Hearing and Knowing.'},
              'score': 0.610572815,
              'values': []},
             {'id': '174',
              'metadata': {'text': 'O mankind, eat from whatever is on earth '
                                   'that is lawful and good and do not follow '
                                   'the footsteps of Satan. Indeed, he is to '
              

In [33]:
from flask import jsonify

# Check which container type holds the 'matches' entry of the query response.
print(type(xc.get('matches')))

<class 'list'>


In [37]:
# NOTE(review): `output` is initialised here but never filled in this cell.
output = {}

# Print each match as "<score rounded to 2 decimals>: <verse text>".
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

0.61: O you who have believed, do not follow the footsteps of Satan. And whoever follows the footsteps of Satan - indeed, he enjoins immorality and wrongdoing. And if not for the favor of Allah upon you and His mercy, not one of you would have been pure, ever, but Allah purifies whom He wills, and Allah is Hearing and Knowing.
0.61: O mankind, eat from whatever is on earth that is lawful and good and do not follow the footsteps of Satan. Indeed, he is to you a clear enemy.
0.58: My Lord, indeed they have led astray many among the people. So whoever follows me - then he is of me; and whoever disobeys me - indeed, You are yet Forgiving and Merciful.
0.57: Whoever Allah sends astray - there is no guide for him. And He leaves them in their transgression, wandering blindly.
0.57: Indeed, those who disbelieve and avert people from the way of Allah have certainly gone far astray.
{'text': 'Indeed, those who disbelieve and avert people from the way of Allah have certainly gone far astray.', 's

In [None]:
# Second probe: a query phrased with wording unrelated to the first one.
query = "which metropolis has the highest number of people?"

# Embed the query, then convert the vector to a plain list for the request.
query_embedding = model.encode(query)
xq = query_embedding.tolist()

# Retrieve the five closest records and print "<rounded score>: <text>" for each.
xc = index.query(
    vector=xq,
    top_k=5,
    include_metadata=True,
)
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

0.45: And the people of the city came rejoicing.
0.45: And how many a city which was unjust have We shattered and produced after it another people.
0.42: I swear by this city, Makkah -
0.4: And there is no city but that We will destroy it before the Day of Resurrection or punish it with a severe punishment. That has ever been in the Register inscribed.
0.38: And how many a city have We destroyed that was insolent in its way of living, and those are their dwellings which have not been inhabited after them except briefly. And it is We who were the inheritors.


In [None]:
# pc.delete_index(index_name)