In [2]:
import pandas as pd
import numpy as np
import os
from datasets import load_dataset
from datasets import Dataset
from tqdm.auto import tqdm
from datetime import datetime
import re
import glob
from transformers import AutoTokenizer, AutoModel
import torch
tqdm.pandas()

In [3]:
def load_parquet_file(file_dir):
    """Load one parquet file of books as a ``Dataset``.

    Rows whose ``book_id`` equals -1 are dropped before the conversion.

    Args:
        file_dir: Path of the parquet file to read.

    Returns:
        The remaining rows as a ``Dataset``; a ``__index_level_0__`` column
        is removed when the conversion produced one.
    """
    books_12 = pd.read_parquet(file_dir)
    books_12 = books_12.query("book_id !=-1")
    books_12 = Dataset.from_pandas(books_12)
    # Test for the column explicitly rather than wrapping the removal in a
    # bare ``except``: that also hid unrelated failures (and even
    # KeyboardInterrupt) behind a silent ``pass``.
    # NOTE(review): the column is presumably the pandas index left over from
    # the row filter above - confirm.
    if "__index_level_0__" in books_12.column_names:
        books_12 = books_12.remove_columns("__index_level_0__")
    return books_12

In [4]:
# Load the reference catalogue together with its pre-computed embeddings.
all_books = pd.read_parquet("3_clean/books_with_id_and_embedding.parquet")
# Assign the filled column back instead of calling ``fillna(..., inplace=True)``
# on the ``all_books["authors"]`` selection: with pandas Copy-on-Write that
# selection is a temporary object, so the in-place call would leave the frame
# unchanged (pandas 2.x already warns about this pattern).
all_books["authors"] = all_books["authors"].fillna("Unknown")
all_books = Dataset.from_pandas(all_books)
all_books

Dataset({
    features: ['isbn', 'title', 'authors', 'publication_year', 'text', 'text_embedding', 'ID'],
    num_rows: 271360
})

In [5]:
# Attach a FAISS index built on the "text_embedding" column; the
# nearest-neighbour lookups in match_book_batched_row query the index
# under this same column name.
all_books.add_faiss_index(column="text_embedding")

  0%|          | 0/272 [00:00<?, ?it/s]

Dataset({
    features: ['isbn', 'title', 'authors', 'publication_year', 'text', 'text_embedding', 'ID'],
    num_rows: 271360
})

In [6]:
# Checkpoint shared by the tokenizer and the encoder used for embeddings.
model_ckpt = "sentence-transformers/paraphrase-MiniLM-L3-v2"
device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Load the encoder, place it on the target device and switch to eval mode.
model = AutoModel.from_pretrained(model_ckpt).to(device).eval()

In [7]:
def get_embeddings(text_list):
    """Embed a list of texts with the module-level tokenizer and model.

    Args:
        text_list: Texts to encode; they are padded and truncated together
            as one batch.

    Returns:
        A tensor holding, for every text, the model's last hidden state at
        sequence position 0.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: without no_grad every call records an autograd graph
    # that is never used, which costs memory and time on each batch.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0]

In [8]:
def create_text(row):
    """Add a ``"text"`` column of ``"<title> by <author>"`` strings to a batch.

    ``row`` is a batch: ``row["title"]`` and ``row["authors"]`` are parallel
    sequences. Every ``":"`` in an author string is replaced by ``" and"``
    before the strings are combined. The batch is updated in place and
    returned.
    """
    book_titles, raw_authors = row["title"], row["authors"]

    # Normalise every author string first, then pair them with the titles.
    cleaned_authors = []
    for name in raw_authors:
        cleaned_authors.append(name.replace(":", " and"))

    combined = []
    for book_title, writer in zip(book_titles, cleaned_authors):
        combined.append(f"{book_title} by {writer}")

    row["text"] = combined
    return row

In [9]:
# books_12 = books_12.map(create_text, batched=True, batch_size=1000)

In [10]:
# all_books = all_books.map(create_text, batched=True, batch_size=1000)

In [11]:
class GetEmbeddings:
    """Compute a ``"text_embedding"`` column using a privately held model.

    The tokenizer and encoder are loaded in the constructor and kept on the
    instance, so the mapped method only touches ``self``.
    """

    def __init__(self, df):
        """Load the tokenizer/encoder and remember the dataset to embed.

        Args:
            df: Object exposing ``map(function, **kwargs)``; the batches it
                passes to the function must provide a ``"text"`` entry.
        """
        self.df = df
        model_ckpt = "sentence-transformers/paraphrase-MiniLM-L3-v2"
        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        self.model = AutoModel.from_pretrained(model_ckpt)

        self.device = torch.device("cpu")
        # Move to the instance's own device rather than a module-level
        # ``device`` variable, so the class works without that other cell.
        self.model = self.model.to(self.device)
        self.model = self.model.eval()
        # Inference only: with frozen parameters the forward pass records no
        # autograd graph, so the stored embeddings carry no gradient state.
        self.model.requires_grad_(False)

    def get_embeddings(self, row):
        """Embed ``row["text"]`` and store the result in ``row["text_embedding"]``.

        Args:
            row: Batch whose ``"text"`` entry is a list of strings.

        Returns:
            The same ``row`` with ``"text_embedding"`` set to the model's last
            hidden state at sequence position 0 for every text.
        """
        text_list = row["text"]
        encoded_input = self.tokenizer(
            text_list, padding=True, truncation=True, return_tensors="pt"
        )
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
        model_output = self.model(**encoded_input)
        embedding = model_output.last_hidden_state[:, 0]
        row["text_embedding"] = embedding
        return row

    def map(self, **kwargs):
        """Run ``get_embeddings`` over ``self.df`` and keep the result.

        Args:
            **kwargs: Forwarded unchanged to ``self.df.map``.

        Returns:
            The mapped dataset, which also replaces ``self.df``.
        """
        self.df = self.df.map(
            self.get_embeddings,
            **kwargs
        )
        return self.df

In [12]:
# embedding_getter = GetEmbeddings(all_books)
# all_books = embedding_getter.map(batch_size = 128,
#     num_proc = 6, batched = True)

In [13]:
# ids = np.arange(0,len(all_books),1)
# all_books = all_books.add_column("ID", ids)

In [14]:
# all_books.to_parquet("3_clean/books_with_id_and_embedding.parquet")

In [15]:
# Largest nearest-neighbour score still accepted as a match:
# match_book_batched_row keeps a candidate ID only when its score is
# <= this value and writes -1 otherwise.
threshold = 10

In [16]:
def match_book_batched_row(rows):
    """Use when batch is passed along with the map function.

    Embeds ``rows["text"]``, looks up the single nearest entry of
    ``all_books`` for every text and records the outcome on the batch.

    Args:
        rows: Batch whose ``"text"`` entry is a list of strings.

    Returns:
        The same ``rows`` with two added entries, one value per input text:
        ``"book_id_3"`` (the matched entry's ``ID``, or -1 when its score
        exceeds ``threshold``) and ``"score_3"`` (the nearest-neighbour score).
    """
    text = rows["text"]
    text_embeddings = get_embeddings(text).cpu().detach().numpy()
    scores, samples = all_books.get_nearest_examples_batch(
        "text_embedding", text_embeddings, k=1
    )
    # Flatten to exactly one score per row. ``np.squeeze`` collapsed a
    # one-row batch to a 0-d value, which broke the masked assignment below.
    scores = np.asarray(scores).reshape(-1)
    sample_ids = np.array([sample["ID"][0] for sample in samples])
    mask = scores <= threshold
    # Keep the matched ID where the score is within the threshold, else -1.
    book_ids = np.where(mask, sample_ids, -1)
    rows["book_id_3"] = book_ids
    rows["score_3"] = scores
    return rows


In [17]:
# books_12 = books_12.map(match_book_batched_row, batched=True, batch_size=512)

In [18]:
# len(books_12.filter(lambda x: x["book_id_3"] !=-1))

In [19]:
def merge_one(file_dir, output_dir="2_final"):
    """Merges one smaller book file with the larger book file.

    Loads ``file_dir``, builds the ``"text"`` column, matches every row
    against the large book index and writes the result as parquet.

    Args:
        file_dir: Path of the input parquet file.
        output_dir: Directory receiving the output file, which keeps the
            input's file name. Created if missing. Defaults to ``"2_final"``.

    Returns:
        The matched dataset that was written to disk.
    """
    print(f"Working on: {file_dir}")
    books_12 = load_parquet_file(file_dir)
    books_12 = books_12.map(create_text, batched=True, batch_size=800)
    books_12 = books_12.map(match_book_batched_row, batched=True, batch_size=512)
    # basename understands every separator the platform accepts, unlike a
    # manual split on os.path.sep (which misses "/" in paths on Windows).
    file_name = os.path.join(output_dir, os.path.basename(file_dir))
    # Make sure the target directory exists before writing into it.
    os.makedirs(output_dir, exist_ok=True)
    books_12.to_parquet(file_name)
    print(f"saved as: {file_name}")
    return books_12

In [20]:
class MatchBook:
    """Match the rows of ``books`` against an indexed ``all_books`` dataset.

    The tokenizer, encoder, threshold and the NumPy helpers are all kept on
    the instance, so the mapped method only touches ``self``.
    """

    def __init__(self, all_books, books):
        """Load the tokenizer/encoder and remember both datasets.

        Args:
            all_books: Object exposing ``get_nearest_examples_batch`` for the
                ``"text_embedding"`` index; its examples must carry ``"ID"``.
            books: Object exposing ``map(function, **kwargs)``; its batches
                must provide a ``"text"`` entry.
        """
        self.all_books = all_books
        self.books = books
        model_ckpt = "sentence-transformers/paraphrase-MiniLM-L3-v2"
        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        self.model = AutoModel.from_pretrained(model_ckpt)

        self.device = torch.device("cpu")
        # Move to the instance's own device rather than a module-level
        # ``device`` variable, so the class works without that other cell.
        self.model = self.model.to(self.device)
        self.model = self.model.eval()
        # Inference only: with frozen parameters the forward pass records no
        # autograd graph.
        self.model.requires_grad_(False)
        # NumPy helpers are reached through the instance.
        # NOTE(review): presumably so the mapped method carries them along
        # with the object - confirm before simplifying.
        self.array = np.array
        self.squeeze = np.squeeze  # kept for backward compatibility
        self.ones_like = np.ones_like
        self.threshold = 10

    def get_embeddings(self, text_list):
        """Return the last hidden state at sequence position 0 for each text.

        Args:
            text_list: Texts to encode as one padded/truncated batch.
        """
        encoded_input = self.tokenizer(
            text_list, padding=True, truncation=True, return_tensors="pt"
        )
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
        model_output = self.model(**encoded_input)
        return model_output.last_hidden_state[:, 0]

    def match_book_batched_row(self, rows):
        """Use when batch is passed along with the map function.

        Args:
            rows: Batch whose ``"text"`` entry is a list of strings.

        Returns:
            The same ``rows`` with ``"book_id"`` (matched ``ID``, or -1 when
            the score exceeds ``self.threshold``) and ``"score"`` added, one
            value per input text.
        """
        text = rows["text"]
        text_embeddings = self.get_embeddings(text).cpu().detach().numpy()
        scores, samples = self.all_books.get_nearest_examples_batch(
            "text_embedding", text_embeddings, k=1
        )
        # Flatten to exactly one score per row. ``squeeze`` collapsed a
        # one-row batch to a 0-d value, which broke the masked assignment.
        scores = self.array(scores).reshape(-1)
        sample_ids = self.array([sample["ID"][0] for sample in samples])
        mask = scores <= self.threshold
        # Start from -1 everywhere (same dtype as the IDs), then fill matches.
        book_ids = self.ones_like(sample_ids) * -1
        book_ids[mask] = sample_ids[mask]
        rows["book_id"] = book_ids
        rows["score"] = scores
        return rows

    def map(self, **kwargs):
        """Run ``match_book_batched_row`` over ``self.books`` and keep the result.

        Args:
            **kwargs: Forwarded unchanged to ``self.books.map``.

        Returns:
            The mapped dataset, which also replaces ``self.books``.
        """
        self.books = self.books.map(
            self.match_book_batched_row,
            **kwargs
        )
        return self.books


In [21]:
# Collect every per-range book file, ordered lexicographically by path.
all_books_dir = sorted(glob.glob("1_2/book*.parquet"))
len(all_books_dir)

22

In [22]:
# Raise start_id to resume a partially finished run from that file index.
start_id = 0
divider = "----" * 10
for current_id in range(start_id, len(all_books_dir)):
    print(f"CURRENTLY WORKING ON: {current_id}")
    merge_one(all_books_dir[current_id])
    # Two divider lines separate the log output of consecutive files.
    print(divider, divider, sep="\n")

CURRENTLY WORKING ON: 0
Working on: 1_2\book1-100k.parquet


Map:   0%|          | 0/16146 [00:00<?, ? examples/s]

Map:   0%|          | 0/16146 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

saved as: 2_final\book1-100k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 1
Working on: 1_2\book1000k-1100k.parquet


Map:   0%|          | 0/6331 [00:00<?, ? examples/s]

Map:   0%|          | 0/6331 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

saved as: 2_final\book1000k-1100k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 2
Working on: 1_2\book100k-200k.parquet


Map:   0%|          | 0/11662 [00:00<?, ? examples/s]

Map:   0%|          | 0/11662 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

saved as: 2_final\book100k-200k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 3
Working on: 1_2\book1100k-1200k.parquet


Map:   0%|          | 0/6467 [00:00<?, ? examples/s]

Map:   0%|          | 0/6467 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

saved as: 2_final\book1100k-1200k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 4
Working on: 1_2\book1200k-1300k.parquet


Map:   0%|          | 0/6136 [00:00<?, ? examples/s]

Map:   0%|          | 0/6136 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

saved as: 2_final\book1200k-1300k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 5
Working on: 1_2\book1300k-1400k.parquet


Map:   0%|          | 0/5125 [00:00<?, ? examples/s]

Map:   0%|          | 0/5125 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

saved as: 2_final\book1300k-1400k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 6
Working on: 1_2\book1400k-1500k.parquet


Map:   0%|          | 0/4700 [00:00<?, ? examples/s]

Map:   0%|          | 0/4700 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

saved as: 2_final\book1400k-1500k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 7
Working on: 1_2\book1500k-1600k.parquet


Map:   0%|          | 0/3869 [00:00<?, ? examples/s]

Map:   0%|          | 0/3869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

saved as: 2_final\book1500k-1600k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 8
Working on: 1_2\book1600k-1700k.parquet


Map:   0%|          | 0/3725 [00:00<?, ? examples/s]

Map:   0%|          | 0/3725 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

saved as: 2_final\book1600k-1700k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 9
Working on: 1_2\book1700k-1800k.parquet


Map:   0%|          | 0/3801 [00:00<?, ? examples/s]

Map:   0%|          | 0/3801 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

saved as: 2_final\book1700k-1800k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 10
Working on: 1_2\book1800k-1900k.parquet


Map:   0%|          | 0/4418 [00:00<?, ? examples/s]

Map:   0%|          | 0/4418 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

saved as: 2_final\book1800k-1900k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 11
Working on: 1_2\book1900k-2000k.parquet


Map:   0%|          | 0/4783 [00:00<?, ? examples/s]

Map:   0%|          | 0/4783 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

saved as: 2_final\book1900k-2000k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 12
Working on: 1_2\book2000k-3000k.parquet


Map:   0%|          | 0/38540 [00:00<?, ? examples/s]

Map:   0%|          | 0/38540 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/39 [00:00<?, ?ba/s]

saved as: 2_final\book2000k-3000k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 13
Working on: 1_2\book200k-300k.parquet


Map:   0%|          | 0/10157 [00:00<?, ? examples/s]

Map:   0%|          | 0/10157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

saved as: 2_final\book200k-300k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 14
Working on: 1_2\book3000k-4000k.parquet


Map:   0%|          | 0/21446 [00:00<?, ? examples/s]

Map:   0%|          | 0/21446 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

saved as: 2_final\book3000k-4000k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 15
Working on: 1_2\book4000k-5000k.parquet


Map:   0%|          | 0/18425 [00:00<?, ? examples/s]

Map:   0%|          | 0/18425 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

saved as: 2_final\book4000k-5000k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 16
Working on: 1_2\book400k-500k.parquet


Map:   0%|          | 0/8169 [00:00<?, ? examples/s]

Map:   0%|          | 0/8169 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

saved as: 2_final\book400k-500k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 17
Working on: 1_2\book500k-600k.parquet


Map:   0%|          | 0/8889 [00:00<?, ? examples/s]

Map:   0%|          | 0/8889 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

saved as: 2_final\book500k-600k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 18
Working on: 1_2\book600k-700k.parquet


Map:   0%|          | 0/7973 [00:00<?, ? examples/s]

Map:   0%|          | 0/7973 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

saved as: 2_final\book600k-700k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 19
Working on: 1_2\book700k-800k.parquet


Map:   0%|          | 0/7926 [00:00<?, ? examples/s]

Map:   0%|          | 0/7926 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

saved as: 2_final\book700k-800k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 20
Working on: 1_2\book800k-900k.parquet


Map:   0%|          | 0/9384 [00:00<?, ? examples/s]

Map:   0%|          | 0/9384 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

saved as: 2_final\book800k-900k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 21
Working on: 1_2\book900k-1000k.parquet


Map:   0%|          | 0/7325 [00:00<?, ? examples/s]

Map:   0%|          | 0/7325 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

saved as: 2_final\book900k-1000k.parquet
----------------------------------------
----------------------------------------
