In [None]:
from google.colab import drive
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the PDF is later opened through the relative path "HML.pdf",
# not a path under /content/drive — confirm whether this mount is still needed.
drive.mount('/content/drive')

In [None]:
%%capture
!pip install transformers rank_bm25 PyMuPDF python-telegram-bot==13.7

In [None]:
import re
import fitz
import json
import pandas as pd

In [None]:
def extract_text_with_fonts(pdf_path):
    """Collect every text span of a PDF together with its font attributes.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        A list with one dict per span, in the order the pages, blocks, lines
        and spans are returned by ``page.get_text("dict")``. Each dict has the
        keys ``"text"``, ``"size"``, ``"flags"`` and ``"color"``, copied
        unchanged from the span.
    """
    text_data = []

    # The context manager closes the document (and its file handle) even if a
    # page fails to parse; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" key (presumably image blocks) carry
                # no spans and are skipped.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        text_data.append({
                            "text": span["text"],
                            "size": span["size"],
                            "flags": span["flags"],
                            "color": span["color"],
                        })
    return text_data

def parse_text_with_fonts(text_data):
    """Group font-annotated spans into rows keyed by their heading hierarchy.

    Spans with ``flags == 16`` and an integer size of at least 14 are treated
    as heading candidates and classified, in this order, as:

    * chapter  - size > 18, color 0, and either the text matches
      ``Chapter <n>. <name>`` or the previous span did;
    * title    - size >= 18 and non-zero color;
    * comment  - all-uppercase text;
    * subtitle - 14 <= size < 18 and color 0;
    * headline - 15 <= size < 16 and non-zero color.

    A heading candidate that fits none of these is ignored. Any other span
    whose size is at most 16 is appended to the running body text. Whenever a
    heading is recognised, the body collected so far is emitted as a row
    (provided a chapter is set and the body is not the empty string) and all
    deeper heading levels are cleared.

    Args:
        text_data: Iterable of dicts with ``"text"``, ``"size"``, ``"flags"``
            and ``"color"`` entries.

    Returns:
        A list of ``[chapter, title, subtitle, headline, comment, contents]``
        lists.
    """
    chapter_pattern = re.compile(r'Chapter (\d+)\. (.+)')
    # Heading levels ordered from outermost to innermost.
    levels = ["chapter", "title", "subtitle", "headline", "comment"]
    heading = dict.fromkeys(levels)

    rows = []
    body = ""
    prev_size = None
    prev_flags = None
    prev_was_chapter_line = False

    def flush():
        # Emit the pending body under the current headings, if there is one.
        if heading["chapter"] and body != "":
            rows.append([heading[name] for name in levels] + [body.strip()])

    for span in text_data:
        text = span["text"].strip()
        size = int(span["size"])
        flags = int(span["flags"])
        color = int(span["color"])
        is_chapter_line = chapter_pattern.match(text)
        is_black = color == 0

        if flags == 16 and size >= 14:
            # Classification order matters: the first matching rule wins.
            if (is_chapter_line or prev_was_chapter_line) and size > 18 and is_black:
                level = "chapter"
            elif size >= 18 and not is_black:
                level = "title"
            elif text.isupper():
                level = "comment"
            elif 14 <= size < 18 and is_black:
                level = "subtitle"
            elif 15 <= size < 16 and not is_black:
                level = "headline"
            else:
                level = None

            if level is not None:
                flush()

                # A heading split across consecutive spans of the same size and
                # flags is joined back together; comments are never joined.
                continues_previous = (
                    level != "comment"
                    and size == prev_size
                    and prev_flags == flags
                    and heading[level]
                )
                heading[level] = heading[level] + " " + text if continues_previous else text

                # A new heading invalidates everything nested below it.
                for deeper in levels[levels.index(level) + 1:]:
                    heading[deeper] = None

                body = ""
                prev_size = size

        elif size <= 16:
            body = body + " " + text
            prev_size = size

        prev_was_chapter_line = bool(is_chapter_line)
        prev_flags = flags

    flush()
    return rows

# Parse the book PDF into one row per run of body text, labelled with the
# chapter/title/subtitle/headline/comment it appears under.
section_columns = ['Chapter', 'Title', "Subtitle", "Headline", "Comment", 'Contents']

text_data = extract_text_with_fonts("HML.pdf")
parsed_data = parse_text_with_fonts(text_data)

df = pd.DataFrame(data=parsed_data, columns=section_columns)

In [None]:
# Preview the first 15 parsed rows to sanity-check the heading hierarchy.
df.head(15)

In [None]:
def _collect_outline_rows(sections):
    """Build "outline" rows that list the child headings of each heading.

    For every distinct title, one row is produced whose ``Contents`` is the
    comma-separated list of subtitles found under that title. For every
    distinct subtitle, one row is produced whose ``Contents`` is the
    comma-separated list of headlines found under that subtitle. Headings
    without children produce no row.

    NOTE(review): headings are matched by name only, so the same title or
    subtitle text occurring in several chapters is merged into a single row
    attributed to its first occurrence — confirm this is intended.

    Args:
        sections: DataFrame with ``Chapter``, ``Title``, ``Subtitle`` and
            ``Headline`` columns.

    Returns:
        A list of dicts with the keys ``Chapter``, ``Title``, ``Subtitle``,
        ``Headline``, ``Comment`` and ``Contents``; title rows first, then
        subtitle rows.
    """
    rows = []

    for title in sections["Title"].dropna().unique():
        # Compute the mask once and reuse the filtered frame.
        under_title = sections[sections["Title"] == title]
        child_subtitles = under_title["Subtitle"].dropna().unique()
        if child_subtitles.size > 0:
            rows.append({
                'Chapter': under_title['Chapter'].iloc[0],
                'Title': title,
                'Subtitle': None,
                "Headline": None,
                "Comment": None,
                "Contents": ", ".join(child_subtitles),
            })

    for subtitle in sections["Subtitle"].dropna().unique():
        under_subtitle = sections[sections["Subtitle"] == subtitle]
        child_headlines = under_subtitle["Headline"].dropna().unique()
        if child_headlines.size > 0:
            rows.append({
                'Chapter': under_subtitle['Chapter'].iloc[0],
                'Title': under_subtitle['Title'].iloc[0],
                'Subtitle': subtitle,
                "Headline": None,
                "Comment": None,
                "Contents": ", ".join(child_headlines),
            })

    return rows


new_rows = _collect_outline_rows(df)

# Passing the column list keeps `new` well-formed (and filterable by column)
# even when no outline rows were generated.
new = pd.DataFrame(new_rows, columns=df.columns)

# Only append when there is something to add; concatenating an empty frame
# changes nothing and can trigger dtype warnings.
if not new.empty:
    df = pd.concat([df, new], ignore_index=True)

In [None]:
# Spot-check: the generated outline row(s) for the "Training Supervision" subtitle.
new[new['Subtitle'] == "Training Supervision"]

In [None]:
# Spot-check: every row (parsed sections plus generated outline rows) titled "Boosting".
df[df['Title'] == "Boosting"]

In [None]:
# Serialize the frame to JSON text and parse it straight back, which yields a
# plain dict of {column name: {row label: value}} where missing values are None.
js_data = df.to_json()
json_data = json.loads(js_data)

In [None]:
def _blank_repeats(values):
    """Return ``values`` as a list, keeping only the first occurrence of each.

    ``None`` entries and every repeated occurrence of a value are replaced by
    the empty string, so the result has the same length and order as the
    input. A set tracks what has been seen, which avoids rescanning the
    growing output list for every element.
    """
    seen = set()
    result = []
    for value in values:
        if value is None or value in seen:
            result.append("")
        else:
            seen.add(value)
            result.append(value)
    return result


# Per-row heading text used for heading-based retrieval: a title or subtitle
# is kept only on the first row where it appears and blanked on later rows.
titles = _blank_repeats(json_data['Title'].values())
subtitles = _blank_repeats(json_data['Subtitle'].values())

# Headlines are kept on every row; only missing values are blanked.
headlines = ["" if value is None else value for value in json_data['Headline'].values()]
contents = list(json_data['Contents'].values())

In [None]:
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Extractive question-answering checkpoint shared by the tokenizer and the model.
qa_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
tokenizer = BertTokenizer.from_pretrained(qa_checkpoint)
model = BertForQuestionAnswering.from_pretrained(qa_checkpoint)

# BM25 index over the section bodies, tokenized with the same tokenizer.
tokenized_contents = [tokenizer.tokenize(section_text) for section_text in contents]
bm25_contents = BM25Okapi(tokenized_contents)

# TF-IDF index over each row's heading text (title, subtitle and headline).
combined_texts = [
    f"{title} {subtitle} {headline}"
    for title, subtitle, headline in zip(titles, subtitles, headlines)
]
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(combined_texts)

In [None]:
def process_query(query):
    """Answer ``query`` with an extractive span from the best-matching section.

    Retrieval is attempted twice:

    1. TF-IDF cosine similarity between the query and each row's heading text
       selects a section; the QA model is run on that section's contents.
       This step is skipped when no heading shares any term with the query
       (all similarities are zero), because the arg-max would then pick the
       first row arbitrarily.
    2. If step 1 yields no answer, the section with the highest BM25 score
       against the query is used instead.

    Args:
        query: The user's question as a string.

    Returns:
        The decoded answer span, or the string
        ``"Sorry, I couldn't find an answer"`` when neither context produces
        a usable span.
    """
    no_answer = "Sorry, I couldn't find an answer"

    def get_answer_from_context(context):
        """Run the QA model on (query, context) and decode the predicted span."""
        encoded_text = tokenizer.encode_plus(
            text=query, text_pair=context, max_length=512, truncation=True,
            padding='max_length', return_tensors="pt"
        )
        inputs = encoded_text['input_ids']

        with torch.no_grad():
            outputs = model(
                input_ids=inputs,
                token_type_ids=encoded_text['token_type_ids'],
                attention_mask=encoded_text['attention_mask'],
            )

        start_index = int(torch.argmax(outputs.start_logits))
        end_index = int(torch.argmax(outputs.end_logits))

        # start == end is a valid one-token answer; only an inverted span
        # (start after end) is unusable.
        if start_index > end_index:
            return no_answer

        answer_tokens = inputs.squeeze().tolist()[start_index:end_index + 1]
        answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

        # Strip any leftover wordpiece continuation markers.
        corrected_ans = ' '.join(
            word[2:] if word.startswith('##') else word for word in answer.split()
        ).strip()

        # A span made only of special/padding tokens decodes to an empty
        # string; report that as "no answer" instead of returning "".
        return corrected_ans or no_answer

    # 1) Heading-based retrieval (TF-IDF over title/subtitle/headline text).
    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    best_index = int(np.argmax(cosine_similarities))
    if cosine_similarities[best_index] > 0:
        answer_from_tf_idf = get_answer_from_context(contents[best_index])
        if answer_from_tf_idf != no_answer:
            return answer_from_tf_idf

    # 2) Fallback: body-based retrieval (BM25 over section contents), computed
    #    only when it is actually needed. argmax returns the first maximum,
    #    matching what a stable descending sort would put first.
    contents_scores = bm25_contents.get_scores(tokenizer.tokenize(query))
    best_contents_idx = int(np.argmax(contents_scores))
    return get_answer_from_context(contents[best_contents_idx])

In [None]:
import pandas as pd

# Smoke-test questions run through the full retrieval + QA pipeline.
queries = [
    "What is boosting?",
    "What is gradient boosting?",
    "What are the types of Machine learning?",
    "What are the things typically used to detect tumors in brain scans?",
    "What happen in supervised learning?",
    "What are the types of learning?",
]
data = {"Question": queries, "Answer": [process_query(q) for q in queries]}
test = pd.DataFrame(data)

# Do not truncate long answers when the frame is rendered.
pd.set_option('display.max_colwidth', None)

# Show every row: head() defaults to 5 rows, which hid the last of the six questions.
test.head(len(test))

In [None]:
from telegram import Update
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext

import os
from getpass import getpass

# The bot token is a credential and must not live in the notebook. Read it
# from the TELEGRAM_BOT_TOKEN environment variable, or prompt for it.
# NOTE: any token that was ever saved inside this notebook should be treated
# as leaked and revoked via @BotFather.
TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN") or getpass("Telegram bot token: ")


def start(update: Update, context: CallbackContext):
    """Reply to the /start command with a short greeting."""
    # effective_message also covers edited messages and channel posts, for
    # which update.message is None.
    update.effective_message.reply_text('Welcome!\nAsk me any question you want to know from the Hands-on Machine Learning book')


def handle_message(update: Update, context: CallbackContext):
    """Answer a plain-text message with the result of ``process_query``."""
    message = update.effective_message
    # Telegram rejects empty message bodies, so never send an empty reply.
    answer = process_query(message.text) or "Sorry, I couldn't find an answer"
    message.reply_text(f'{answer}')


updater = Updater(token=TOKEN, use_context=True)
dispatcher = updater.dispatcher

start_handler = CommandHandler('start', start)
dispatcher.add_handler(start_handler)

# Every text message that is not a command is treated as a question.
message_handler = MessageHandler(Filters.text & ~Filters.command, handle_message)
dispatcher.add_handler(message_handler)

# Blocks this cell until the bot is stopped (e.g. by interrupting the kernel).
updater.start_polling()
updater.idle()