In [1]:
import re
import os
import ast
import json
import pickle
import pandas as pd
import qdrant_client
from rouge import Rouge
from dotenv import load_dotenv
from langchain.schema import Document
from langchain.vectorstores import Qdrant
from nltk.translate.bleu_score import corpus_bleu
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_text(self, text):
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

In [3]:
df = pd.read_csv('preprocessed_data.csv')

In [4]:
df.head(20)

Unnamed: 0,id,question,context,answers
0,1,ููู ูุนุจุฏ ุงููู,ูู ุนูู ุนููุง ููุณ ุนููู ุฃูุฑูุง ููู ุฑุฏ,"{'answer_start': [0], 'text': ['ูู ุนูู ุนููุง ูู..."
1,2,ูู ูุนุจุฏ ุงููู ุฎููุง ูุทูุนุง,ุฃุณุฃู ุงููู ุงูุฌูุฉ ูุฃุนูุฐ ุจู ูู ุงููุงุฑ,"{'answer_start': [0], 'text': ['ุฃุณุฃู ุงููู ุงูุฌู..."
2,3,ูุง ูู ุงูุฅุญุณุงู ูู ุงูุนุจุงุฏุฉ,ุงูุฅุญุณุงู ุฃู ุชุนุจุฏ ุงููู ูุฃูู ุชุฑุงู ูุฅู ูู ุชูู ุชุฑุงู...,"{'answer_start': [8], 'text': ['ุฃู ุชุนุจุฏ ุงููู ู..."
3,4,ูุง ูุนูู ูุง ุฅูู ุฅูุง ุงููู,ูู ูุงู ูุข ุฅูู ุฅูุง ุงููู ูููุฑ ุจูุง ูุนุจุฏ ูู ุฏูู ุงู...,"{'answer_start': [0], 'text': ['ูู ูุงู ูุข ุฅูู ..."
4,5,ูุง ูู ุงูุชูุญูุฏ ูู ุตูุงุช ุงููู,ููุฒู ุฑุจูุง ุชุจุงุฑู ูุชุนุงูู ูู ูู ูููุฉ ุฅูู ุงูุณูุงุก ุง...,"{'answer_start': [0], 'text': ['ููุฒู ุฑุจูุง ุชุจุงุฑ..."
5,6,ูุง ูู ูุงุฆุฏุฉ ุงูุชูุญูุฏ ูููุณูู,ุญู ุงูุนุจุงุฏ ุนูู ุงููู ุฃู ูุง ูุนุฐุจ ูู ูุง ูุดุฑู ุจู ุดูุฆุง,"{'answer_start': [19], 'text': ['ุฃู ูุง ูุนุฐุจ ูู..."
6,7,ุฃูู ุงููู,ุฅู ุงููู ูุชุจ ูุชุงุจุง ุฅู ุฑุญูุชู ุณุจูุช ุบุถุจู ููู ููุชูุจ...,"{'answer_start': [37], 'text': ['ููู ููุชูุจ ุนูุฏ..."
7,8,ูู ุงููู ูุนูุง ุจุฐุงุชู ุฃู ุจุนููู,ุฅููู ุชุฏุนูู ุณููุนุง ูุฑูุจุง ููู ูุนูู,"{'answer_start': [0], 'text': ['ุฅููู ุชุฏุนูู ุณูู..."
8,9,ูุง ูู ุฃุนุธู ุงูุฐููุจ,ุณุฆู ุตูู ุงููู ุนููู ูุณูู ุฃู ุงูุฐูุจ ุฃุนุธู ูุงู ุฃู ุชุฏ...,"{'answer_start': [41], 'text': ['ุฃู ุชุฏุนู ููู ู..."
9,10,ูุง ูู ุงูุดุฑู ุงูุฃูุจุฑ,ุฃูุจุฑ ุงููุจุงุฆุฑ ุงูุฅุดุฑุงู ุจุงููู,"{'answer_start': [13], 'text': ['ุงูุฅุดุฑุงู ุจุงููู']}"


In [4]:
langchain_data_dicts = [
    Document(page_content=item['context'],
             metadata={
                 'id':item['id'],
              'question':item['question'],
                 'answers':item['answers']})
    for i,item in df.iterrows()
]
langchain_data_dicts[31]

Document(page_content='ุฅููู ุชุฏุนูู ุณููุนุง ูุฑูุจุง ููู ูุนูู', metadata={'id': 32, 'question': 'ูู ูุญุชุงุฌ ุงูุฏุนุงุก ููุงุณุทุฉ ูุฎููู', 'answers': "{'answer_start': [0], 'text': ['ุฅููู ุชุฏุนูู ุณููุนุง ูุฑูุจุง ููู ูุนูู']}"})

In [5]:
_ = load_dotenv()

In [6]:
QDRANT_URL= os.getenv('QDRANT_URL')
QDRANT_API_KEY= os.getenv('QDRANT_API_KEY')

In [7]:
QDRANT_URL

'https://71ff8ef3-b516-4636-98fa-1c30709ac4b6.us-east4-0.gcp.cloud.qdrant.io:6333'

In [8]:
QDRANT_API_KEY

'xs05V-jNZf5dpkeXlWEsWW4pQDzeHCEPBn6hiMoxB5RwDORipYVYig'

In [9]:
client = qdrant_client.QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY
)
client

<qdrant_client.qdrant_client.QdrantClient at 0x20a895276d0>

In [10]:
collection_name = "Cluster0"

In [11]:
# collection_config = qdrant_client.http.models.VectorParams(
#     size = 384,
#     distance = qdrant_client.http.models.Distance.COSINE
# )

In [11]:
# #Setting up and createing Qdrant Which is Already Created
# client.recreate_collection(
#     collection_name = collection_name,
#     vectors_config = collection_config
# )

In [37]:
# we will use the embeddings of E5-Large
embeddings = HuggingFaceEmbeddings(
    model_name = "intfloat/multilingual-e5-small"
)

In [13]:
vectorStore = Qdrant(
    client = client,
    collection_name = collection_name,
    embeddings = embeddings
)
vectorStore

<langchain_community.vectorstores.qdrant.Qdrant at 0x20ab73ce440>

In [14]:
# #Run For only one time to upload the data on Qdrant Which is Already Uploaded
# vectorStore.add_documents(langchain_data_dicts)

In [15]:
q1 = "ููู ูุนุจุฏ ุงููู"
vectorStore.similarity_search_with_score(query=q1,k=4)

In [16]:
retriever = vectorStore.as_retriever()

In [17]:
q = "ูู ุงูุฏุนุงุก ุนุจุงุฏุฉ ููู ุชุนุงูู"
docs = retriever.get_relevant_documents(q)
docs

  warn_deprecated(


[Document(page_content='ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ', metadata={'answers': "{'answer_start': [0], 'text': ['ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ']}", 'id': 15, 'question': 'ูู ุงูุฏุนุงุก ุนุจุงุฏุฉ ููู ุชุนุงูู', '_id': '4b13ba82-c836-4f76-996d-3b4fe81ff864', '_collection_name': 'Cluster0'}),
 Document(page_content='ูุงู ุฑุณูู ุงููู ๏ทบ ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ', metadata={'answers': "{'answer_start': [16], 'text': ['ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ']}", 'id': 97, 'question': 'ูุง ุฃูููุฉ ุงูุฏุนุงุก', '_id': 'fdf255be-85f5-4208-9fb3-56d195f3250e', '_collection_name': 'Cluster0'}),
 Document(page_content='ูุงู ุงููุจู ๏ทบ ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ', metadata={'answers': "{'answer_start': [12], 'text': ['ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ']}", 'id': 1594, 'question': 'ูู ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ', '_id': '5062fd88-9d7c-4c03-8e0e-7473639e9b8c', '_collection_name': 'Cluster0'}),
 Document(page_content='ุงูููู ูู ุจูุบุช ุงูููู 

In [18]:
# unique elements from the 'question' column
unique_questions_df = df.drop_duplicates(subset=['question'])
len(unique_questions_df)

1356

In [19]:
def get_relevant_docs(question,k):
  relevant_docs = vectorStore.similarity_search_with_score(query=question,k=k)
  return relevant_docs

In [13]:
def evaluate_precision_OR_recall_at_top_k_docs(unique_questions_df,k):
    total_count = len(unique_questions_df)
    correct_count = sum(1 for _, row in unique_questions_df.iterrows() if any(doc[0].metadata['question']== row['question'] for doc in get_relevant_docs(row['question'],k=k)))
    precision_OR_recall = (correct_count / total_count) * 100
    return precision_OR_recall

In [18]:
# precision_OR_recall_at_top_8_docs = evaluate_precision_OR_recall_at_top_k_docs(unique_questions_df,k=8)
# print("Precision OR Recall at top 8 Documents:", precision_OR_recall_at_top_8_docs)

Precision OR Recall at top 8 Documents: 46.976401179941


In [None]:
# def extract_contexts(relevant_docs):
#   contexts = []

#   for doc in relevant_docs:
#     contexts.append(doc[0].page_content)

#   return contexts

In [None]:
# extract_contexts(get_relevant_docs("ูุง ุงูุฏุนุงุก ุงููุงุฑุฏ ูู ุตูุงุฉ ุงูุงุณุชุฎุงุฑุฉ",k=5))

['ูุงู ุฑุณูู ุงููู ๏ทบ ูุนูููุง ุงูุงุณุชุฎุงุฑุฉ ูู ุงูุฃููุฑ ูููุง ููุง ูุนูููุง ุงูุณูุฑุฉ ูู ุงููุฑุขู ูููู ุฅุฐุง ูู ุฃุญุฏูู ุจุงูุฃูุฑ ูููุฑูุน ุฑูุนุชูู ูู ุบูุฑ ุงููุฑูุถุฉ ุซู ูููู ุงูููู ุฅูู ุฃุณุชุฎูุฑู ุจุนููู ูุฃุณุชูุฏุฑู ุจูุฏุฑุชู ูุฃุณุฃูู ูู ูุถูู ุงูุนุธูู ูุฅูู ุชูุฏุฑ ููุง ุฃูุฏุฑ ูุชุนูู ููุง ุฃุนูู ูุฃูุช ุนูุงู ุงูุบููุจ ุงูููู ุฅู ููุช ุชุนูู ุฃู ูุฐุง ุงูุฃูุฑ ุฎูุฑ ูู ูู ุฏููู ููุนุงุดู ูุนุงูุจุฉ ุฃูุฑู ุฃู ูุงู ุนุงุฌู ุฃูุฑู ูุขุฌูู ูุงูุฏุฑู ูู ููุณุฑู ูู ุซู ุจุงุฑู ูู ููู ูุฅู ููุช ุชุนูู ุฃู ูุฐุง ุงูุฃูุฑ ุดุฑ ูู ูู ุฏููู ููุนุงุดู ูุนุงูุจุฉ ุฃูุฑู ุฃู ูุงู ูู ุนุงุฌู ุฃูุฑู ูุขุฌูู ูุงุตุฑูู ุนูู ูุงุตุฑููู ุนูู ูุงูุฏุฑ ูู ุงูุฎูุฑ ุญูุซ ูุงู ุซู ุฃุฑุถูู ูุงู ููุณูู ุญุงุฌุชู',
 'ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ',
 'ุงู

In [20]:
def create_template(question,k):

  relevant_docs = get_relevant_docs(question,k)

  template = f"""
  Engage in a conversation with the user, responding to their question:
  
    {question}
    
    within this documents of Hadiths:
    {relevant_docs} 
    
    Encourage the model to provide informative and culturally sensitive answers, reflecting Islamic teachings. Maintain a conversational tone and aim for clarity in responses and make sure they are restricted extracted from the provided contexts and i want you to answer me in arabic."""
  
  return template

In [21]:
question = "ูู ุซุจุช ุณุฌูุฏ ุงูุดูุฑ ูู ุงููุจู"
query = create_template(question,5)
query

'\n  Engage in a conversation with the user, responding to their question:\n  \n    ูู ุซุจุช ุณุฌูุฏ ุงูุดูุฑ ูู ุงููุจู\n    \n    within this documents of Hadiths:\n    [(Document(page_content=\'ูุงู ุณุฌุฏ ุงููุจู ๏ทบ ุจุงููุฌู\', metadata={\'answers\': "{\'answer_start\': [17], \'text\': [\'ุงููุฌู\']}", \'id\': 585, \'question\': \'ูุง ูู ุงูุณุฌุฏุงุช ุงูุชู ุณุฌุฏูุง ุงููุจู ๏ทบ\', \'_id\': \'099e0a88-2fbd-49bf-ac28-2b690e183b9e\', \'_collection_name\': \'Cluster0\'}), 0.8963401), (Document(page_content=\'ูุงู ุณุฌุฏูุง ูุน ุงููุจู ๏ทบ ูู ุฅุฐุง ุงูุณูุงุก ุงูุดูุช ูุงูุฑุฃ ุจุงุณู ุฑุจู\', metadata={\'answers\': "{\'answer_start\': [24], \'text\': [\'ุฅุฐุง ุงูุณูุงุก ุงูุดูุช ูุงูุฑุฃ ุจุงุณู ุฑุจู\']}", \'id\': 586, \'question\': \'ูุง ูู ุงูุณุฌุฏุงุช ุงูุชู ุณุฌุฏูุง ุงููุจู ๏ทบ\', \'_id\': \'83f6b074-96b8-414d-b235-5d33cba2cfd7\', \'_collection_name\': \'Cluster0\'}), 0.88707983), (Document(page_content=\'ุ

In [25]:
GOOGLE_API_KEY= os.getenv('GOOGLE_API_KEY')

In [26]:
GOOGLE_API_KEY

'AIzaSyAXlAY4M7InDTbiW8OZIGmSWpimQef-Ias'

In [42]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest",google_api_key="AIzaSyAXlAY4M7InDTbiW8OZIGmSWpimQef-Ias")

In [43]:
query = create_template(q,5)

In [44]:
result = llm.invoke(query)

In [45]:
clean_text(result.content)

'ูุนู ุงูุฏุนุงุก ุนุจุงุฏุฉ ููู ุชุนุงูู ูููู ุฑุณูู ุงููู ุตูู ุงููู ุนููู ูุณูู ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ ููุฐุง ูุนูู ุฃู ุงูุฏุนุงุก ูู ุดูู ูู ุฃุดูุงู ุงูุนุจุงุฏุฉ ุงูุชู ูููู ุจูุง ููู ุชุนุงูู'

In [None]:
# unique elements from the 'question' column
unique_questions_df[:20]

Unnamed: 0,id,question,context,answers
0,1,ููู ูุนุจุฏ ุงููู,ูู ุนูู ุนููุง ููุณ ุนููู ุฃูุฑูุง ููู ุฑุฏ,"{'answer_start': [0], 'text': ['ูู ุนูู ุนููุง ูู..."
1,2,ูู ูุนุจุฏ ุงููู ุฎููุง ูุทูุนุง,ุฃุณุฃู ุงููู ุงูุฌูุฉ ูุฃุนูุฐ ุจู ูู ุงููุงุฑ,"{'answer_start': [0], 'text': ['ุฃุณุฃู ุงููู ุงูุฌู..."
2,3,ูุง ูู ุงูุฅุญุณุงู ูู ุงูุนุจุงุฏุฉ,ุงูุฅุญุณุงู ุฃู ุชุนุจุฏ ุงููู ูุฃูู ุชุฑุงู ูุฅู ูู ุชูู ุชุฑุงู...,"{'answer_start': [8], 'text': ['ุฃู ุชุนุจุฏ ุงููู ู..."
3,4,ูุง ูุนูู ูุง ุฅูู ุฅูุง ุงููู,ูู ูุงู ูุข ุฅูู ุฅูุง ุงููู ูููุฑ ุจูุง ูุนุจุฏ ูู ุฏูู ุงู...,"{'answer_start': [0], 'text': ['ูู ูุงู ูุข ุฅูู ..."
4,5,ูุง ูู ุงูุชูุญูุฏ ูู ุตูุงุช ุงููู,ููุฒู ุฑุจูุง ุชุจุงุฑู ูุชุนุงูู ูู ูู ูููุฉ ุฅูู ุงูุณูุงุก ุง...,"{'answer_start': [0], 'text': ['ููุฒู ุฑุจูุง ุชุจุงุฑ..."
5,6,ูุง ูู ูุงุฆุฏุฉ ุงูุชูุญูุฏ ูููุณูู,ุญู ุงูุนุจุงุฏ ุนูู ุงููู ุฃู ูุง ูุนุฐุจ ูู ูุง ูุดุฑู ุจู ุดูุฆุง,"{'answer_start': [19], 'text': ['ุฃู ูุง ูุนุฐุจ ูู..."
6,7,ุฃูู ุงููู,ุฅู ุงููู ูุชุจ ูุชุงุจุง ุฅู ุฑุญูุชู ุณุจูุช ุบุถุจู ููู ููุชูุจ...,"{'answer_start': [37], 'text': ['ููู ููุชูุจ ุนูุฏ..."
7,8,ูู ุงููู ูุนูุง ุจุฐุงุชู ุฃู ุจุนููู,ุฅููู ุชุฏุนูู ุณููุนุง ูุฑูุจุง ููู ูุนูู,"{'answer_start': [0], 'text': ['ุฅููู ุชุฏุนูู ุณูู..."
8,9,ูุง ูู ุฃุนุธู ุงูุฐููุจ,ุณุฆู ุตูู ุงููู ุนููู ูุณูู ุฃู ุงูุฐูุจ ุฃุนุธู ูุงู ุฃู ุชุฏ...,"{'answer_start': [41], 'text': ['ุฃู ุชุฏุนู ููู ู..."
9,10,ูุง ูู ุงูุดุฑู ุงูุฃูุจุฑ,ุฃูุจุฑ ุงููุจุงุฆุฑ ุงูุฅุดุฑุงู ุจุงููู,"{'answer_start': [13], 'text': ['ุงูุฅุดุฑุงู ุจุงููู']}"


In [56]:
predictions = []
references = []

for _,row in unique_questions_df[:25].iterrows():

  query = create_template(row['question'],k=10)
  predictions.append(clean_text(chat.invoke(query).content))

  documents_answers = []
  relevant_docs = get_relevant_docs(row['question'],k=10)

  for doc in relevant_docs:
    answers = ast.literal_eval(doc[0].metadata['answers'])['text']
    for ans in answers:
      documents_answers.append(ans)
  references.append(documents_answers)

In [59]:
data = {'predictions': predictions, 'references': references}
with open('predictions_references.json', 'w') as json_file:
    json.dump(data, json_file)

In [11]:
# Now you can load the predictions and references from the JSON file anytime later
with open('predictions_references.json', 'r') as json_file:
    loaded_data = json.load(json_file)

predictions = loaded_data['predictions']
references = loaded_data['references']

In [12]:
predictions[0]

'ููู ูุนุจุฏ ุงููู ุงูุนุจุงุฏุฉ ููู ุชุนุงูู ุชุฃุชู ูู ุฎูุงู ุงูุนุฏูุฏ ูู ุงูุฃุนูุงู ูุงูุฃุฐูุงุฑ ุงูุชู ุชูุฑุจูุง ุฅููู ูู ุจูู ูุฐู ุงูุฃุนูุงู ูู ุงูุฏุนุงุก ูุงูุฏุนุงุก ูู ุนุจุงุฏุฉ ููู ุชุนุงูู ููุง ุฌุงุก ูู ุงูุญุฏูุซ ุงูุดุฑูู ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ ุจุงูุฅุถุงูุฉ ุฅูู ุฐูู ุงูุฅุญุณุงู ูู ุงูุนุจุงุฏุฉ ูุนูู ุฃู ุชุนุจุฏ ุงููู ูุฃูู ุชุฑุงู ูุฅู ูู ุชูู ุชุฑุงู ูุฅูู ูุฑุงู ูุฐุง ูุฌุจ ุนูููุง ุฃู ูููู ูุฎูุตูู ูู ุนุจุงุฏุชูุง ูุฃู ูุนุจุฏ ุงููู ุจูู ุฅุฎูุงุต ูุชููู'

In [13]:
references[0]

['ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ',
 'ุฃู ุชุนุจุฏ ุงููู ูุฃูู ุชุฑุงู ูุฅู ูู ุชูู ุชุฑุงู ูุฅูู ูุฑุงู',
 'ุงูููู',
 'ุงูููู ูู ุจูุบุช ุงูููู ุงุดูุฏ',
 'ูุนู ุงููู ูู ุฐุจุญ ูุบูุฑ ุงููู',
 'ุณุจุญุงูู ุงูููู ุฑุจูุง ูุจุญูุฏู ุงูููู ุงุบูุฑ ูู',
 'ุณุจุญุงู ุฑุจู ุงูุนุธูู',
 'ูุง ูุนูู ุงูุบูุจ ุฅูุง ุงููู',
 'ุงูุฏุนุงุก ูู ุงูุนุจุงุฏุฉ',
 'ุงููู ูู ุงูุญูู']

In [14]:
# Calculate BLEU-1 score
bleu_2_score = corpus_bleu(references, predictions, weights=(1, 0, 0, 0))
print("BLEU-1 Score:", bleu_2_score)

BLEU-1 Score: 0.24643030597377372


In [15]:
# Calculate BLEU-2 score
bleu_2_score = corpus_bleu(references, predictions, weights=(0.5, 0.5))
print("BLEU-2 Score:", bleu_2_score)

BLEU-2 Score: 0.2521173975250671


In [16]:
concatencated_refrences = []
for reference in references:
    concatencated_refrences.append(' '.join(reference))

In [17]:
# Calculate ROUGE-L score
rouge = Rouge()
scores = rouge.get_scores(predictions, concatencated_refrences, avg=True)
rouge_l_score = scores["rouge-l"]["f"]
print("ROUGE-L Score:", rouge_l_score)

ROUGE-L Score: 0.14742646354664704


In [20]:
from datasets import load_metric
squad_v2 = False
# Load the SQuAD metric
metric = load_metric("squad_v2" if squad_v2 else "squad",trust_remote_code=True)

# Format predictions
formatted_predictions = [{"id": i, "prediction_text": prediction} for i, prediction in enumerate(predictions)]

# Format references
formatted_references = [{"id": i, "answers": {"text": [reference]}} for i, reference in enumerate(references)]

# Compute metrics
metrics = metric.compute(predictions=formatted_predictions, references=formatted_references)

print(metrics)

  from .autonotebook import tqdm as notebook_tqdm
  metric = load_metric("squad_v2" if squad_v2 else "squad",trust_remote_code=True)


{'exact_match': 0.0, 'f1': 21.068373499501504}


In [31]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load a pre-trained model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Encode predictions and references into embeddings
prediction_embeddings = [model.encode(prediction) for prediction in predictions]
reference_embeddings = [model.encode(reference) for reference in concatencated_refrences]

# Compute cosine similarity matrix
cos_sim_matrix = cosine_similarity(prediction_embeddings, reference_embeddings)

# Calculate the average cosine similarity across all pairs
avg_cos_similarity = cos_sim_matrix.mean()

# Print the average cosine similarity
print("Average Cosine Similarity:", avg_cos_similarity)

Average Cosine Similarity: 0.9731169
