In [2]:
import os
import warnings

import pymongo
import requests
import torch
from pymongo import MongoClient
from transformers import BertModel, BertTokenizer
warnings.filterwarnings("ignore", category=UserWarning, module='tqdm', message='.*IProgress not found.*')

Connect to the MongoDB Atlas cluster

In [2]:
# Secrets are read from the environment so they never live in the notebook or
# its version history. Export TEXTGEARS_API_KEY and MONGODB_URI before starting
# the kernel.
# NOTE(review): the API key and database password that used to be written in
# this cell should be treated as leaked and rotated.
api_key = os.environ.get('TEXTGEARS_API_KEY', '')

mongodb_uri = os.environ.get('MONGODB_URI')
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the Atlas connection string."
    )

client = MongoClient(mongodb_uri)
db = client['Books']
book_collection = db['Book']
author_collection = db['Author']

Prepare BERT model and tokenizer

In [3]:
# One checkpoint name drives both loads so the tokenizer vocabulary always
# matches the encoder weights.
model_name = 'bert-base-uncased'

# Encoder used by encode_text to produce hidden states.
model = BertModel.from_pretrained(model_name)
# Tokenizer that turns raw text into the tensors the encoder consumes.
tokenizer = BertTokenizer.from_pretrained(model_name)

Encode text into BERT embeddings

In [4]:
def encode_text(text):
    """Embed text with the module-level BERT model using masked mean pooling.

    The token embeddings of the last hidden layer are averaged per input,
    counting only real tokens. Padding positions (attention mask 0) are
    excluded, so batching texts of different lengths does not dilute the
    shorter ones. For a single string there is no padding and the result is
    the plain mean over tokens.

    Args:
        text: A string, or a list of strings, to embed. Inputs longer than
            128 tokens are truncated.

    Returns:
        For a single string (or a one-element list), a flat list of floats.
        For a list of several strings, a list of such lists in input order.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the 0/1 token mask across the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    # clamp guards against a division by zero for an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = (hidden * mask).sum(dim=1) / token_counts
    return pooled.squeeze().numpy().tolist()

Call the TextGears API to check and fix spelling in the query

In [5]:
def _apply_corrections(text, errors):
    """Rewrite ``text`` according to the spell checker's error entries."""
    positioned = []    # (offset, bad, correction) whose offset matches the text
    unpositioned = []  # (bad, correction) that can only be applied by search

    for error in errors:
        if not isinstance(error, dict):
            continue
        bad = error.get("bad")
        if not isinstance(bad, str) or not bad:
            # An empty "bad" would make str.replace insert the correction
            # between every character of the text.
            continue
        suggestions = error.get("better")
        if not isinstance(suggestions, list) or not suggestions:
            continue  # nothing better on offer: keep the word as typed
        correction = suggestions[0]
        if not isinstance(correction, str) or correction == bad:
            continue

        offset = error.get("offset")
        if (
            isinstance(offset, int)
            and offset >= 0
            and text[offset:offset + len(bad)] == bad
        ):
            positioned.append((offset, bad, correction))
        else:
            unpositioned.append((bad, correction))

    # Apply from right to left so earlier offsets stay valid after each edit.
    boundary = len(text)
    for offset, bad, correction in sorted(positioned, key=lambda item: item[0], reverse=True):
        end = offset + len(bad)
        if end > boundary:
            continue  # overlaps a span that was already rewritten
        text = text[:offset] + correction + text[end:]
        boundary = offset

    # Entries without a usable offset fall back to plain substring replacement.
    for bad, correction in unpositioned:
        text = text.replace(bad, correction)
    return text


def correct_spelling(text, api_key):
    """Return ``text`` with spelling mistakes fixed via the TextGears API.

    The call is best-effort: if the text is empty, the request fails, or the
    response is not usable, the original text is returned unchanged (with a
    RuntimeWarning for transport or decoding failures) so a search can still
    proceed.

    Corrections are applied at the offset reported for each error when that
    offset matches the text, which avoids rewriting the same letters inside
    other, correctly spelled words.

    Args:
        text: The user-supplied query to check.
        api_key: TextGears API key.

    Returns:
        The corrected text, or ``text`` itself when nothing could be corrected.
    """
    if not text or not text.strip():
        return text  # nothing to check; skip the network round trip

    url = "https://api.textgears.com/spelling"
    params = {
        "key": api_key,
        "text": text,
        "language": "en-US"
    }
    try:
        # Without a timeout a stalled connection would hang the notebook cell.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as exc:
        warnings.warn(f"Spell check unavailable, using the text as typed: {exc}", RuntimeWarning)
        return text

    payload = result.get("response") if isinstance(result, dict) else None
    errors = payload.get("errors") if isinstance(payload, dict) else None
    if not isinstance(errors, list):
        return text
    return _apply_corrections(text, errors)

In [6]:
def create_indexes():
    """Create a background TEXT index on each collection's vector field.

    Builds one index on ``title_vector`` in the book collection and one on
    ``author_vector`` in the author collection, then prints a confirmation.

    NOTE(review): search_books / search_authors query ``$vectorSearch`` indexes
    named ``title_vector_index`` / ``author_vector_index``. The TEXT indexes
    built here are a different kind of index on the same fields. Confirm that
    the vector search indexes are provisioned elsewhere (e.g. in Atlas) and
    whether these TEXT indexes are still needed.
    """
    index_targets = (
        (book_collection, 'title_vector'),
        (author_collection, 'author_vector'),
    )
    for collection, vector_field in index_targets:
        collection.create_index([(vector_field, pymongo.TEXT)], background=True)
    print("Text indexes created.")

In [7]:
def prepare_book_vectors():
    """Compute and store a title embedding for every book that lacks one.

    Only books without a ``title_vector`` field are fetched, and only their
    ``_id`` and ``title`` are transferred, so the whole collection (including
    embeddings that are already stored) is no longer loaded into memory.
    Books whose title is missing or not a string are skipped and counted
    instead of aborting the run.
    """
    # The (small) projected result is materialised up front so the loop does
    # not hold a server cursor open while the slow encoding step runs.
    pending_books = list(book_collection.find(
        {'title_vector': {'$exists': False}},
        {'title': 1},
    ))

    skipped = 0
    for book in pending_books:
        title = book.get('title')
        if not isinstance(title, str):
            skipped += 1
            continue
        book_collection.update_one(
            {'_id': book['_id']},
            {'$set': {'title_vector': encode_text(title)}}
        )

    if skipped:
        print(f"Skipped {skipped} book(s) without a text title.")
    print("Book title vectors prepared and updated.")

In [8]:
def prepare_author_vectors():
    """Compute and store a name embedding for every author that lacks one.

    Only authors without an ``author_vector`` field are fetched, and only
    their ``_id`` and ``Name Author`` are transferred, so the whole collection
    is no longer loaded into memory. Authors whose name is missing or not a
    string are skipped and counted instead of aborting the run.
    """
    # The (small) projected result is materialised up front so the loop does
    # not hold a server cursor open while the slow encoding step runs.
    pending_authors = list(author_collection.find(
        {'author_vector': {'$exists': False}},
        {'Name Author': 1},
    ))

    skipped = 0
    for author in pending_authors:
        name_author = author.get('Name Author')
        if not isinstance(name_author, str):
            skipped += 1
            continue
        author_collection.update_one(
            {'_id': author['_id']},
            {'$set': {'author_vector': encode_text(name_author)}}
        )

    if skipped:
        print(f"Skipped {skipped} author(s) without a text name.")
    print("Author vectors prepared and updated.")

In [9]:
def search_books(query_text, limit=10, num_candidates=100):
    """Print the books whose title embedding is closest to ``query_text``.

    Runs an Atlas ``$vectorSearch`` against the ``title_vector_index`` index
    and prints the stored fields of every hit, or "No results found." when
    there are none.

    Args:
        query_text: Free-text query; it is embedded with encode_text.
        limit: Maximum number of books to print (default 10).
        num_candidates: Nearest-neighbour candidates considered by the search
            (default 100). Raised to ``limit`` when smaller, because the
            search cannot return more hits than it has candidates.
    """
    query_title_vector = encode_text(query_text)

    pipeline = [
        {
            "$vectorSearch": {
                "index": "title_vector_index",
                "path": "title_vector",
                "queryVector": query_title_vector,
                "numCandidates": max(num_candidates, limit),
                "limit": limit
            }
        },
        # The stored embedding is never printed; leave it on the server.
        {"$project": {"title_vector": 0}}
    ]

    # (printed label, document field) in display order.
    display_fields = (
        ("Title", "title"),
        ("Author", "author"),
        ("Url_book", "url"),
        ("TitleComplete", "titleComplete"),
        ("Description", "description"),
        ("ImageUrl", "imageUrl"),
        ("Asin", "asin"),
        ("Isbn13", "isbn13"),
        ("Isbn", "isbn"),
        ("Publisher", "publisher"),
        ("PublishDate", "publishDate"),
        ("RatingsCount_book", "ratingsCount_book"),
        ("ReviewsCount_book", "reviewsCount_book"),
        ("RatingHistogram", "ratingHistogram"),
        ("NumPages", "numPages"),
        ("Language", "language"),
        ("Places", "places"),
        ("Genres_book", "genres"),
        ("Series", "series"),
        ("Characters", "characters"),
    )

    results = list(book_collection.aggregate(pipeline))
    if not results:
        print("No results found.")
    for result in results:
        print("Book Information:")
        for label, field in display_fields:
            print(f"{label}: {result.get(field, '')}")
        print("")
        

In [10]:
def search_authors(query_text, limit=10, num_candidates=100):
    """Print the authors whose name embedding is closest to ``query_text``.

    Runs an Atlas ``$vectorSearch`` against the ``author_vector_index`` index
    and prints the stored fields of every hit, or "No results found." when
    there are none.

    Args:
        query_text: Free-text query; it is embedded with encode_text.
        limit: Maximum number of authors to print (default 10).
        num_candidates: Nearest-neighbour candidates considered by the search
            (default 100). Raised to ``limit`` when smaller, because the
            search cannot return more hits than it has candidates.
    """
    query_author_vector = encode_text(query_text)
    pipeline = [
        {
            "$vectorSearch": {
                "index": "author_vector_index",
                "path": "author_vector",
                "queryVector": query_author_vector,
                "numCandidates": max(num_candidates, limit),
                "limit": limit
            }
        },
        # The stored embedding is never printed; leave it on the server.
        {"$project": {"author_vector": 0}}
    ]

    # (printed label, document field) in display order.
    display_fields = (
        ("Name", "Name Author"),
        ("Url author", "Url author"),
        ("Images Author", "Images Url"),
        ("BirthDate", "BirthDate"),
        ("DeathDate", "DeathDate"),
        ("About", "About"),
        ("avgRating", "avgRating"),
        ("reviewsCount", "reviewsCount"),
        ("RatingsCount", "RatingsCount"),
    )

    results = list(author_collection.aggregate(pipeline))
    if not results:
        print("No results found.")
    for result in results:
        print("Author Information:")
        for label, field in display_fields:
            print(f"{label}: {result.get(field, '')}")
        print("")


In [11]:
# Build the indexes on the book and author collections.
create_indexes()

Text indexes created.


In [1]:
# Backfill title embeddings for books that do not have one stored yet.
prepare_book_vectors()

Book title vectors prepared and updated.


In [13]:
# Example: vector search over book titles.
search_books("Sanctuary")

Book Information:
Title: Sanctuary
Author: Meg Cabot, Jenny Carroll
Url_book: https://www.goodreads.com/book/show/199781.Sanctuary
TitleComplete: Sanctuary (1-800-Where-R-You, #4)
Description: JESS MASTRIANI Knew she wasn't going to be able to hide her psychic powers from the U.S. government forever. But she never thought that she and Dr. Krantz, the special agent brought in to convince Jess to join his elite team of "specially gifted" crime solvers, would have something in common.  When a local boy's disappearance is attributed to a backwoods militia group, it turns out that Jess and Dr. Krantz have the same goal. Suddenly Jess finds herself collaborating with one enemy in order to stop a far worse one. In an atmosphere of hate and fear, Jess and Dr. Krantz must work together to unite a community and save a life...without losing their own.
ImageUrl: https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1512656586i/199781.jpg
Asin: 1416927077
Isbn13: 978

In [None]:
# Backfill name embeddings for authors that do not have one stored yet.
prepare_author_vectors()

Author vectors prepared and updated.


In [14]:
# Example: vector search over author names.
search_authors("Michael Shaara")

Author Information:
Name: Michael Shaara
Url author: https://www.goodreads.com/author/show/16892.Michael_Shaara
Images Author: https://images.gr-assets.com/authors/1447009089p5/16892.jpg
BirthDate: 23/06/1928 0:00
DeathDate: 05/05/1988 0:00
About: Michael Shaara was an American writer of science fiction, sports fiction, and historical fiction. He was born to Italian immigrant parents (the family name was originally spelled Sciarra, which in Italian is pronounced the same way) in Jersey City, New Jersey, graduated from Rutgers University in 1951, and served as a sergeant in the 82nd Airborne division prior to the Korean War.Before Shaara began selling science fiction stories to fiction magazines in the 1950s, he was an amateur boxer and police officer. He later taught literature at Florida State University while continuing to write fiction. The stress of this and his smoking caused him to have a heart attack at the early age of 36; from which he fully recovered. His novel about the Ba M