In [1]:
!pip install transformers
!pip install faiss-cpu
!pip install torch
!pip install numpy

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [140]:
import sys
sys.path.append("/home/nicho206/.local/lib/python3.11/site-packages")
import json
import faiss
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

In [141]:
# Input corpus: the pipeline cell reads this file and parses every line as one JSON object.
ENTRYPATH = "/home/nicho206/ScholarSearch/ml-papers.txt"
# Hugging Face model id handed to AutoModel / AutoTokenizer.
MODELTYPE = "allenai/scibert_scivocab_uncased"
EMBEDDING_DIM = 768 # width of the pooled vector, i.e. pooler_output.shape[-1] (pooler_output is [1, 768])
# NOTE(review): the two paths below are not referenced by any cell visible in this
# notebook (the save cell writes to hard-coded relative names) — confirm whether they are still needed.
INDEX_PATH = "/home/nicho206/ScholarSearch/papers.index"
INDEX_METADATA_PATH = "/home/nicho206/ScholarSearch/papers.txt"

In [142]:
# Load the encoder and switch it to inference mode in one expression
# (nn.Module.eval() returns the module itself), then load the matching tokenizer.
model = AutoModel.from_pretrained(MODELTYPE).eval()
tokenizer = AutoTokenizer.from_pretrained(MODELTYPE)

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [143]:
# Flat L2 FAISS index over EMBEDDING_DIM-wide vectors.
# NOTE(review): index_embedding() adds vectors to title_index / abstract_index, not to
# this index, while the assert / save / search cells further down still use `index` — confirm which is intended.
index = faiss.IndexFlatL2(EMBEDDING_DIM)
# One dict per indexed paper ("id", "title", "categories"), appended by index_embedding().
index_metadata = []

In [None]:
# Separate flat L2 indexes for title vectors and abstract vectors; index_embedding()
# adds one row to each per paper, so row i of both refers to index_metadata[i].
title_index = faiss.IndexFlatL2(EMBEDDING_DIM)
abstract_index = faiss.IndexFlatL2(EMBEDDING_DIM)

# Helper Functions

In [145]:
def tokenize_entry(entry):
    """Run the module-level ``tokenizer`` on ``entry`` and return its PyTorch-tensor encoding.

    No truncation or padding options are passed, so the encoding covers the
    whole input text.
    """
    return tokenizer(entry, return_tensors="pt")

def generate_chunks(encoded, chunk_size=512):
    """Split one encoded text into fixed-size, individually wrapped chunks.

    Each chunk is laid out as ``[CLS] tokens... [SEP] [PAD]...`` and is exactly
    ``chunk_size`` long, so a chunk carries at most ``chunk_size - 2`` content
    tokens.

    The tokenizer call used in this notebook already wraps the whole sequence
    in [CLS]/[SEP]; those outer markers are stripped before slicing so that the
    first and last chunks do not end up with duplicated special tokens.

    Args:
        encoded: mapping with ``'input_ids'`` and ``'attention_mask'`` tensors
            (presumably shaped ``[1, seq_len]``, as produced with
            ``return_tensors="pt"`` for a single text — TODO confirm for other callers).
        chunk_size: total length of every chunk including the two special
            tokens. Defaults to 512, BERT's maximum input size.

    Returns:
        ``(input_ids_chunks, attention_mask_chunks)``, both shaped
        ``[num_chunks, chunk_size]``. At least one chunk is always returned,
        even when the text holds no content tokens.

    Raises:
        ValueError: if ``chunk_size`` leaves no room for content tokens.
    """
    content_size = chunk_size - 2  # room left once [CLS] and [SEP] are added
    if content_size < 1:
        raise ValueError("chunk_size must be at least 3 to hold [CLS], one token and [SEP]")

    input_ids = encoded['input_ids'].squeeze(0)
    attention_mask = encoded['attention_mask'].squeeze(0)

    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    # Fall back to 0 (the value previously hard-coded) if the tokenizer defines no pad token.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    # Drop the outer special tokens the tokenizer already added; every chunk
    # gets its own pair below.
    if input_ids.numel() > 0 and input_ids[0].item() == cls_id:
        input_ids, attention_mask = input_ids[1:], attention_mask[1:]
    if input_ids.numel() > 0 and input_ids[-1].item() == sep_id:
        input_ids, attention_mask = input_ids[:-1], attention_mask[:-1]

    # An empty text still yields a single "[CLS] [SEP]" chunk instead of
    # failing later when stacking an empty list.
    starts = list(range(0, len(input_ids), content_size)) or [0]

    input_ids_chunks = []
    attention_mask_chunks = []
    for start in starts:
        ids_body = input_ids[start:start + content_size]
        mask_body = attention_mask[start:start + content_size]

        # new_tensor / new_full keep dtype and device aligned with the input.
        ids_chunk = torch.cat([
            ids_body.new_tensor([cls_id]),
            ids_body,
            ids_body.new_tensor([sep_id]),
        ])
        mask_chunk = torch.cat([
            mask_body.new_tensor([1]),  # attend to [CLS]
            mask_body,
            mask_body.new_tensor([1]),  # attend to [SEP]
        ])

        padding_length = chunk_size - ids_chunk.size(0)
        if padding_length > 0:
            ids_chunk = torch.cat([ids_chunk, ids_chunk.new_full((padding_length,), pad_id)])
            mask_chunk = torch.cat([mask_chunk, mask_chunk.new_zeros(padding_length)])

        input_ids_chunks.append(ids_chunk)
        attention_mask_chunks.append(mask_chunk)

    return torch.stack(input_ids_chunks), torch.stack(attention_mask_chunks)


def generate_chunks_embedding(input_chunks, attention_chunks):
    """Embed every chunk with the module-level ``model`` and average the results.

    Args:
        input_chunks: iterable of per-chunk token-id tensors (the original
            comment gives shape ``[512]`` each).
        attention_chunks: iterable of matching attention-mask tensors.

    Returns:
        The pooled vector of the only chunk when there is exactly one,
        otherwise the element-wise mean of all chunk vectors.
    """
    with torch.no_grad():
        pooled = [
            # The model expects a leading batch dimension, hence unsqueeze(0);
            # [0] drops it again from pooler_output.
            model(
                input_ids=ids.unsqueeze(0),
                attention_mask=mask.unsqueeze(0),
            ).pooler_output[0]
            for ids, mask in zip(input_chunks, attention_chunks)
        ]

    if len(pooled) == 1:
        return pooled[0]

    return torch.stack(pooled).mean(dim=0)

def generate_embedding(encoded_input, max_length=512):
    """Run the module-level ``model`` on an encoding truncated to ``max_length`` tokens.

    The truncated tensors are collected into a new dict. Assigning to
    ``encoded_input.input_ids`` (as was done before) only sets a plain
    attribute on the encoding object and leaves the mapping that
    ``**encoded_input`` unpacks untouched, so over-long inputs reached the
    model at full length. Building the dict from ``items()`` also truncates
    every other tensor in the encoding (e.g. ``token_type_ids``) consistently.

    Args:
        encoded_input: mapping of model argument names to tensors, assumed to
            be shaped ``[batch, seq_len]`` (as returned by the tokenizer with
            ``return_tensors="pt"`` — TODO confirm for other callers). It is
            not modified.
        max_length: maximum number of token positions passed to the model.

    Returns:
        The model's output object; callers that need the pooled vector read
        ``.pooler_output`` from it.
    """
    truncated = {name: tensor[:, :max_length] for name, tensor in encoded_input.items()}
    with torch.no_grad():
        return model(**truncated)

def index_embedding(title_embedding, abstract_embedding, title, id, categories):
    """Add one paper to ``title_index``, ``abstract_index`` and ``index_metadata``.

    Both vectors are converted and checked before any index is touched, so a
    bad embedding can no longer leave ``title_index`` one row ahead of
    ``abstract_index`` and ``index_metadata``.

    Args:
        title_embedding: torch tensor holding the title vector.
        abstract_embedding: torch tensor holding the abstract vector.
        title: paper title stored in the metadata record.
        id: paper identifier stored in the metadata record.
        categories: paper categories stored in the metadata record.

    Raises:
        ValueError: if a vector's width differs from the index dimension.
    """
    def _as_row(embedding, dim):
        # FAISS takes C-contiguous float32 arrays of shape [n, d]; detach()
        # makes the conversion work for tensors that still require grad.
        row = np.ascontiguousarray(
            embedding.detach().cpu().numpy(), dtype=np.float32
        ).reshape(1, -1)
        if row.shape[1] != dim:
            raise ValueError(f"expected embedding of width {dim}, got {row.shape[1]}")
        return row

    title_row = _as_row(title_embedding, title_index.d)
    abstract_row = _as_row(abstract_embedding, abstract_index.d)

    # Mutate all three stores together only after both rows are known to be valid.
    title_index.add(title_row)
    abstract_index.add(abstract_row)
    index_metadata.append({
        "id": id,
        "title": title,
        "categories": categories
    })

def process_batch(batch):
    """Embed the title and abstract of every JSON line in ``batch``.

    ``generate_embedding`` returns the model's full output object, whereas
    ``index_embedding`` calls ``.numpy()`` on what it receives, so the pooled
    vector (``pooler_output[0]``, the same vector the query cell uses) is
    extracted here.

    Args:
        batch: iterable of strings, each a JSON object with ``"title"``,
            ``"abstract"``, ``"id"`` and ``"categories"`` keys. Blank lines
            are skipped.

    Returns:
        ``(title_embeddings, abstract_embeddings, metadata)``: three parallel
        lists; ``metadata`` holds ``(title, id, categories)`` tuples in the
        argument order ``index_embedding`` expects after the two embeddings.
    """
    title_embeddings = []
    abstract_embeddings = []
    metadata = []
    for line in batch:
        if not line.strip():
            # e.g. a trailing empty line in the input file
            continue
        entry = json.loads(line)
        title_output = generate_embedding(tokenize_entry(entry["title"]))
        abstract_output = generate_embedding(tokenize_entry(entry["abstract"]))
        # pooler_output is [1, dim]; [0] drops the batch dimension.
        title_embeddings.append(title_output.pooler_output[0])
        abstract_embeddings.append(abstract_output.pooler_output[0])
        metadata.append((entry["title"], entry["id"], entry["categories"]))
    return title_embeddings, abstract_embeddings, metadata

In [144]:
# Number of vectors currently stored in `index` (the recorded output is 0, i.e. empty at this point).
index.ntotal

0

# Pipeline

In [146]:
# Tunables for the indexing run.
MAX_ENTRIES = 15000  # index at most this many lines of ENTRYPATH
BATCH_SIZE = 25      # lines handed to one process_batch() call
NUM_WORKERS = 12     # threads embedding batches concurrently

print("Creating batches")
# Only hold the file open while reading it, not for the whole embedding run.
with open(ENTRYPATH, "r") as f:
    lines = f.readlines()[:MAX_ENTRIES]
# Range over the lines actually read so a short file does not yield empty batches.
batches = [lines[i:i + BATCH_SIZE] for i in range(0, len(lines), BATCH_SIZE)]

print("Starting batch processing")
failed_batches = 0
with ThreadPoolExecutor(NUM_WORKERS) as executor:
    print("Submitting batches")
    # Map each future to its batch number so failures can be reported precisely.
    futures = {
        executor.submit(process_batch, batch): batch_number
        for batch_number, batch in enumerate(batches)
    }
    print("Waiting for completion")
    for future in as_completed(futures):
        try:
            # result() re-raises any exception from the worker thread; keeping it
            # inside the try lets one bad batch be skipped instead of aborting the run.
            title_embeddings, abstract_embeddings, metadata = future.result()
            # Indexing happens here, in the main thread. Batches arrive in
            # completion order; row numbers stay aligned with index_metadata
            # because index_embedding() appends to all stores together.
            for t_em, ab_em, meta in zip(title_embeddings, abstract_embeddings, metadata):
                index_embedding(t_em, ab_em, *meta)
        except Exception as e:
            failed_batches += 1
            print(f"Batch {futures[future]} failed: {e!r}")
    print("Done")

if failed_batches:
    print(f"{failed_batches} of {len(batches)} batches failed")
            

Creating batches
Starting batch processing
Submitting batches
Waiting for completion
Done


In [149]:
# index_embedding() adds one row to title_index and abstract_index and one record
# to index_metadata per paper, so all three counts must agree. (`index` itself is
# never filled by the helpers, so comparing against it fails on a fresh run.)
assert title_index.ntotal == abstract_index.ntotal == len(index_metadata), (
    f"index/metadata mismatch: title={title_index.ntotal}, "
    f"abstract={abstract_index.ntotal}, metadata={len(index_metadata)}"
)

# Save the index and metadata

In [150]:
# Persist the two indexes that index_embedding() actually fills; the plain `index`
# object receives no vectors from the helpers, so writing it would store an empty index.
# Row i of either index corresponds to line i of the metadata file.
faiss.write_index(title_index, "sci_title.index")
faiss.write_index(abstract_index, "sci_abstract.index")
with open("sci_metadata.txt", "w") as f:
    # One JSON object per line, in the same order the vectors were added.
    f.writelines(json.dumps(data) + "\n" for data in index_metadata)

print("done")

done


In [151]:
# Number of vectors stored in `index` after the pipeline run.
# NOTE(review): the recorded 15000 presumably comes from an earlier version of the
# helpers; the current index_embedding() does not add to `index` — confirm.
index.ntotal

15000

In [158]:
# Free-text search query; it is tokenized and embedded in the cells below.
query = "Image recognition"

In [159]:
# Same tokenizer call as tokenize_entry(), so the query is encoded like the indexed texts.
encoded_query = tokenizer(query, return_tensors="pt")

In [160]:
# Run the forward pass without autograd: a tensor that requires grad cannot be
# converted with .numpy(), which the search cell does with this vector.
with torch.no_grad():
    embedding = model(**encoded_query).pooler_output[0]  # pooler_output is [1, dim]; [0] drops the batch axis

In [164]:
# torch.no_grad() has no effect on a FAISS search; what matters is that the query
# tensor is detached from autograd before .numpy(), so detach it explicitly.
# D holds the distances and I the row numbers of the 20 nearest stored vectors.
# NOTE(review): `index` is not the index that index_embedding() fills — decide
# whether this should search title_index or abstract_index instead.
D, I = index.search(embedding.detach().numpy().reshape(1, -1), 20)

In [165]:
# Row numbers of the nearest neighbours returned by the search, one row per query.
print(I)

[[ 7189 14411 10773  7980  2993  4423 11237  6952  7846   968  9158  2006
   7119  5439 10837  6301 14261  8600  1740  3018]]


In [166]:
# Print the titles of the hits for the (single) query.
for i in I[0]:
    if i < 0:
        # FAISS pads the result with -1 when fewer than k neighbours exist;
        # index_metadata[-1] would silently print the last paper instead.
        continue
    print(index_metadata[i]["title"])

A Review: Expert System for Diagnosis of Myocardial Infarction
Modified Weibull distribution for Biomedical signals denoising
Facial Expressions recognition Based on Principal Component Analysis
  (PCA)
A Review of Image Mosaicing Techniques
Cognitive Memory Network
Fingertip Detection: A Fast Method with Natural Hand
Thinning Algorithm Using Hypergraph Based Morphological Operators
Feature Extraction of Human Lip Prints
A Comparative study Between Fuzzy Clustering Algorithm and Hard
  Clustering Algorithm
Swarm Intelligence
Similarity- based approach for outlier detection
Automatic Extraction of Open Space Area from High Resolution Urban
  Satellite Imagery
Content Based Image Indexing and Retrieval
A Robust Rapid Approach to Image Segmentation with Optimal Thresholding
  and Watershed Transform
A Review of Feature and Data Fusion with Medical Images
Syntactic sensitive complexity for symbol-free sequence
Obstacle evasion using fuzzy logic in a sliding blades problem
  environment
Int