@ajaz_ahmed

In [None]:
import os
import re

import nltk
import numpy as np
import PyPDF2
import tqdm
from docx import Document
from pinecone import Pinecone, PodSpec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Document used as the chatbot's context; the extension decides which reader is used.
filepath =  '../../../../AJAZ_AHMED_3YOE.pdf'  # alternative sample input: '../../../../file-sample_1MB.docx'

## Project 1
```
Simple Contextual Chat Bot
1. Read a long PDF/ Word Document. 
2. Build a chat bot that will use the document as a context to answer the question. 
3. If the answer is not found in the document - it should say I don't know the answer. 
```

### 1. Read a long PDF/ Word Document. 

In [3]:
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    The pages are read in document order and their text is concatenated
    without any separator between pages.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages.
    """
    # The reader pulls data from the stream lazily, so every page has to be
    # extracted while the file is still open.
    with open(file_path, 'rb') as stream:
        reader = PyPDF2.PdfReader(stream)
        page_texts = [page.extract_text() for page in reader.pages]

    return "".join(page_texts)

def read_word_document(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        file_path: Path of the document to open with ``docx.Document``.

    Returns:
        The text of all paragraphs joined with newline characters.
    """
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(para.text for para in paragraphs)

In [4]:
# Choose the reader from the file extension. The comparison is case-insensitive
# so that a name such as 'REPORT.PDF' is not handed to the Word reader by mistake.
if filepath.lower().endswith('.pdf'):
    word_content = read_pdf(filepath)
else:
    # Anything that is not a PDF is read as a Word document
    word_content = read_word_document(filepath)

### 2. Build a chat bot that will use the document as a context to answer the question.
### 3. If the answer is not found in the document - it should say I don't know the answer. 

In [5]:
# https://stackoverflow.com/a/47091490/4084039- reference

def preprocess_text(phrase):
    """Expand common English contractions and flatten whitespace.

    The substitutions are applied one after another in a fixed order: the two
    irregular forms ("won't", "can't") first, then the generic suffix rules,
    and finally every single whitespace character (tab, newline, ...) is
    replaced by one space. Runs of whitespace are not collapsed.

    Args:
        phrase: The text to normalise.

    Returns:
        The normalised text.
    """
    # (pattern, replacement) pairs; the order matters because the irregular
    # forms would otherwise be caught by the generic "n't" rule.
    substitutions = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        # whitespace
        (r'\s', ' '),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase


# Normalise the raw document text; at this point `sentences` is still one string
sentences = preprocess_text(word_content)

In [6]:
# Split the cleaned, lower-cased document into sentences. The text is derived
# from `word_content` directly instead of reassigning `sentences` from its own
# previous value, so re-running this cell always produces the same result
# (the old form called `.lower()` on a list the second time it was run).
sentences = nltk.sent_tokenize(preprocess_text(word_content).lower())
# Fit TF-IDF on the sentences; each row of the matrix represents one sentence
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

In [7]:
def get_answer(user_input, similarity_threshold=0.2):
    """Return the document sentence that best matches ``user_input``.

    The question is normalised the same way the document text was before it
    was vectorised (``preprocess_text`` followed by lower-casing), projected
    into the fitted TF-IDF space and compared with every document sentence by
    cosine similarity.

    Args:
        user_input: The question typed by the user.
        similarity_threshold: Score the best match has to exceed (strictly)
            for it to be returned as the answer. Defaults to 0.2.

    Returns:
        The best-matching entry of ``sentences``, or the string
        "I don't know the answer." when no sentence scores above the threshold.
    """
    # Apply the same clean-up the document went through so that contractions
    # in the question map onto the same tokens as in the document.
    query = preprocess_text(user_input).lower()

    # Vectorize the question with the vectorizer fitted on the document
    query_vector = tfidf_vectorizer.transform([query])

    # One similarity score per document sentence
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    best_index = similarity_scores.argmax()
    if similarity_scores[best_index] > similarity_threshold:
        return sentences[best_index]

    # 3. If the answer is not found in the document - it should say I don't know the answer.
    return "I don't know the answer."

In [8]:
# Example usage: keep answering questions until the user types "exit"
while True:
    user_input = input("User: ")
    # Ignore surrounding whitespace and case so that e.g. " Exit " also ends
    # the chat instead of being sent to the bot as a question.
    if user_input.strip().lower() == 'exit':
        break
    answer = get_answer(user_input)
    print("Chatbot:", answer)

User: ajaz ahmed
Chatbot: ajaz ahmed +919955546834 |ajazahmeddnr@gmail.com |linkedin |github |leetcode experience machine learning engineer aug 2021 – till date sony india software centre - on the payroll of ignitarium bengalore, india •object detection :- successfully deployed and custom trained yolov3 to aws sagemaker.
User: python
Chatbot: •python(basic) from hackerrank.
User: deep learning
Chatbot: •gotfeatured onthewalloffame atiotiot.courses & certificates •appliedaicourseassignments •supervised machine learning course from scaler •keras & tensorflow for deep learning from scaler •deep learning course: deep dive into deep learning from scaler •aws foundations: machine learning basics •aws machine learning terminology and process •deep learning with pytorch course by jovian.
User: sony
Chatbot: •object detection :- worked on sony alpha 9 iii dslr on autofocus.
User: classification
Chatbot: I don't know the answer.
User: exit


## Project 2

```
Advanced Challenge:
- Break down the document into multiple chunks/ paragraphs. 
- Store them in a vector database like pinecone.  
- When you ask a question find out the top 3 chunks that will likely have the answer to the question using semantic similarity search. 
```

- Break down the document into multiple chunks/ paragraphs.
     - This part is already done: the document has been split into chunks (sentences) and converted into a TF-IDF matrix. From here on, the rows of that matrix are pushed to Pinecone.

In [9]:
def push_to_pinecone(embedded_vectors, index_name, api_key):
    '''
    This function pushes the vectors to pinecone.

    The index is created only when no index called ``index_name`` exists yet,
    so the function can be run again against an index it created earlier.
    Row ``i`` of ``embedded_vectors`` is stored under the id ``"vector_<i>"``.

    embedded_vectors: vectors generated using embedding, type: sparse_matrix
        (every row is converted to a dense list before it is uploaded).
    index_name: name of the index to create or reuse.
    api_key: api_key to access pinecone account.
    '''
    # Initialize Pinecone client
    client = Pinecone(api_key=api_key)

    # Size the index from the matrix that is actually being uploaded
    # (previously the global `tfidf_matrix` was read instead of the argument).
    n_rows, dimension = embedded_vectors.shape

    # NOTE(review): "us-central-1" looks like a region name rather than a pod
    # type - confirm the intended PodSpec against the Pinecone documentation.
    spec = PodSpec(environment="gcp-starter", pod_type="us-central-1")

    # Creating an index whose name is already taken fails, so only create it
    # when it is missing. Keyword arguments keep the call independent of the
    # positional parameter order of the installed client version.
    if index_name not in client.list_indexes().names():
        client.create_index(name=index_name, dimension=dimension, spec=spec)

    # Connect to the existing or newly created index
    index = client.Index(index_name)

    # Upsert the rows one by one, converting each sparse row to a dense list
    # only when it is about to be sent.
    for row_number in tqdm.tqdm(range(n_rows)):
        dense_row = embedded_vectors[row_number].toarray().flatten().tolist()
        index.upsert(vectors=[{'id': f"vector_{row_number}", 'values': dense_row}])


- Store them in a vector database like pinecone.
    - This part is done by the `push_to_pinecone` function defined above, which is called in the next cell.

In [10]:
# upsert all the vectors to pinecone
# The key is read from the PINECONE_API_KEY environment variable so that a real
# credential never has to be typed into (and saved with) the notebook; the
# placeholder is only used when the variable is not set.
api_key = os.environ.get('PINECONE_API_KEY', 'enter_your_api_key')
index_name = 'demo'
push_to_pinecone(tfidf_matrix, index_name, api_key)

100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:16<00:00,  1.53it/s]


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

def text_to_vectors(text, vectorizer=None):
    """Convert ``text`` into a single dense TF-IDF vector.

    The text is lower-cased and transformed as one document with an already
    fitted vectorizer, so the result has one value per vocabulary term and is
    comparable with vectors produced by that same vectorizer. (The earlier
    version built a brand-new, unfitted ``TfidfVectorizer`` and called
    ``transform`` on it, which cannot work without a prior ``fit``.)

    Args:
        text: The text to vectorize.
        vectorizer: Fitted object exposing ``transform``; defaults to the
            notebook-level ``tfidf_vectorizer`` fitted on the document.

    Returns:
        The vector as a flat list of floats.
    """
    if vectorizer is None:
        # Reuse the vocabulary and weights learned from the document
        vectorizer = tfidf_vectorizer

    # The whole text is one document; vectorizing it sentence by sentence and
    # flattening would yield a list several times the vocabulary size.
    query_matrix = vectorizer.transform([text.lower()])
    return query_matrix.toarray().flatten().tolist()


- When you ask a question find out the top 3 chunks that will likely have the answer to the question using semantic similarity search. 
    - This part is done in the cell below by `find_similar_chunks`.

In [12]:
from pinecone import Pinecone
import numpy as np

# Initialize Pinecone client with the `api_key` assigned earlier in the notebook
pinecone = Pinecone(api_key=api_key)

# Connect to the existing or newly created index; `find_similar_chunks` queries
# through this module-level `index` object
index = pinecone.Index(index_name)

# Function to perform semantic similarity search
def find_similar_chunks(question, top_k=3):
    """Return the ids of the ``top_k`` stored vectors closest to ``question``.

    The question is cleaned with ``preprocess_text``, vectorized with the
    TF-IDF vectorizer fitted on the document and sent to the Pinecone index as
    a dense query vector.

    Args:
        question: The question to search for.
        top_k: Number of matches to request from the index. Defaults to 3.

    Returns:
        A list with the ``id`` of each match, in the order the index returns
        them. The list is empty when none of the question's terms occur in
        the fitted vocabulary.
    """
    # Convert question to vector using the same vectorization method used for sentences
    sentence = preprocess_text(question)
    query_matrix = tfidf_vectorizer.transform([sentence])

    # No stored values means the question shares no term with the document:
    # the query would be an all-zero vector, for which cosine similarity is
    # undefined, so there is nothing meaningful to ask the index for.
    # NOTE(review): assumes the index uses the cosine metric - confirm.
    if query_matrix.nnz == 0:
        return []

    vectors = query_matrix.toarray().flatten().tolist()

    # Perform semantic similarity search
    results = index.query(vector=vectors, top_k=top_k)

    # Extract the ids of the top_k chunks
    return [match.id for match in results.matches]


# Example usage
question = "who is ajaz ahmed? Is he deep learning engineer?"
top_chunks = find_similar_chunks(question)

# `top_chunks` holds the ids of the matching vectors, not the sentence text
print("Top 3 Chunks:", top_chunks)


Top 3 Chunks: ['vector_16', 'vector_0', 'vector_17']
