# ArXiv RAG search system

## Install Required Libraries

In [None]:
!pip install -U sentence-transformers
!pip install faiss-gpu
!pip install -q transformers einops accelerate langchain bitsandbytes

## Import Required Libraries

In [3]:
import pandas as pd
import numpy as np
import os

import json
import re

from google.colab import drive

import faiss
import transformers
import torch

from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer

import warnings
warnings.filterwarnings('ignore')

## Import Dataset

In [None]:
# Mount Google Drive at /content/drive (remounting if it is already mounted);
# the arXiv metadata file read later lives under this mount point.
drive.mount('/content/drive', force_remount=True)

## Update ArXiv categories

In [5]:
# Lookup table from arXiv category code (as found in the space-separated
# `categories` metadata field) to a human-readable category name.
# Codes mapped to 'Not available' have no display name here; get_cat_text
# leaves them out of the text it builds.
category_map = {
'acc-phys': 'Accelerator Physics',
'adap-org': 'Not available',
'q-bio': 'Not available',
'cond-mat': 'Not available',
'chao-dyn': 'Not available',
'patt-sol': 'Not available',
'dg-ga': 'Not available',
'solv-int': 'Not available',
'bayes-an': 'Not available',
'comp-gas': 'Not available',
'alg-geom': 'Not available',
'funct-an': 'Not available',
'q-alg': 'Not available',
'ao-sci': 'Not available',
'atom-ph': 'Atomic Physics',
'chem-ph': 'Chemical Physics',
'plasm-ph': 'Plasma Physics',
'mtrl-th': 'Not available',
'cmp-lg': 'Not available',
'supr-con': 'Not available',
'econ.GN': 'General Economics',
'econ.TH': 'Theoretical Economics',
'eess.SY': 'Systems and Control',
'astro-ph': 'Astrophysics',
'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
'astro-ph.EP': 'Earth and Planetary Astrophysics',
'astro-ph.GA': 'Astrophysics of Galaxies',
'astro-ph.HE': 'High Energy Astrophysical Phenomena',
'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
'astro-ph.SR': 'Solar and Stellar Astrophysics',
'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
'cond-mat.mtrl-sci': 'Materials Science',
'cond-mat.other': 'Other Condensed Matter',
'cond-mat.quant-gas': 'Quantum Gases',
'cond-mat.soft': 'Soft Condensed Matter',
'cond-mat.stat-mech': 'Statistical Mechanics',
'cond-mat.str-el': 'Strongly Correlated Electrons',
'cond-mat.supr-con': 'Superconductivity',
'cs.AI': 'Artificial Intelligence',
'cs.AR': 'Hardware Architecture',
'cs.CC': 'Computational Complexity',
'cs.CE': 'Computational Engineering, Finance, and Science',
'cs.CG': 'Computational Geometry',
'cs.CL': 'Computation and Language',
'cs.CR': 'Cryptography and Security',
'cs.CV': 'Computer Vision and Pattern Recognition',
'cs.CY': 'Computers and Society',
'cs.DB': 'Databases',
'cs.DC': 'Distributed, Parallel, and Cluster Computing',
'cs.DL': 'Digital Libraries',
'cs.DM': 'Discrete Mathematics',
'cs.DS': 'Data Structures and Algorithms',
'cs.ET': 'Emerging Technologies',
'cs.FL': 'Formal Languages and Automata Theory',
'cs.GL': 'General Literature',
'cs.GR': 'Graphics',
'cs.GT': 'Computer Science and Game Theory',
'cs.HC': 'Human-Computer Interaction',
'cs.IR': 'Information Retrieval',
'cs.IT': 'Information Theory',
'cs.LG': 'Machine Learning',
'cs.LO': 'Logic in Computer Science',
'cs.MA': 'Multiagent Systems',
'cs.MM': 'Multimedia',
'cs.MS': 'Mathematical Software',
'cs.NA': 'Numerical Analysis',
'cs.NE': 'Neural and Evolutionary Computing',
'cs.NI': 'Networking and Internet Architecture',
'cs.OH': 'Other Computer Science',
'cs.OS': 'Operating Systems',
'cs.PF': 'Performance',
'cs.PL': 'Programming Languages',
'cs.RO': 'Robotics',
'cs.SC': 'Symbolic Computation',
'cs.SD': 'Sound',
'cs.SE': 'Software Engineering',
'cs.SI': 'Social and Information Networks',
'cs.SY': 'Systems and Control',
'econ.EM': 'Econometrics',
'eess.AS': 'Audio and Speech Processing',
'eess.IV': 'Image and Video Processing',
'eess.SP': 'Signal Processing',
'gr-qc': 'General Relativity and Quantum Cosmology',
'hep-ex': 'High Energy Physics - Experiment',
'hep-lat': 'High Energy Physics - Lattice',
'hep-ph': 'High Energy Physics - Phenomenology',
'hep-th': 'High Energy Physics - Theory',
'math.AC': 'Commutative Algebra',
'math.AG': 'Algebraic Geometry',
'math.AP': 'Analysis of PDEs',
'math.AT': 'Algebraic Topology',
'math.CA': 'Classical Analysis and ODEs',
'math.CO': 'Combinatorics',
'math.CT': 'Category Theory',
'math.CV': 'Complex Variables',
'math.DG': 'Differential Geometry',
'math.DS': 'Dynamical Systems',
'math.FA': 'Functional Analysis',
'math.GM': 'General Mathematics',
'math.GN': 'General Topology',
'math.GR': 'Group Theory',
'math.GT': 'Geometric Topology',
'math.HO': 'History and Overview',
'math.IT': 'Information Theory',
'math.KT': 'K-Theory and Homology',
'math.LO': 'Logic',
'math.MG': 'Metric Geometry',
'math.MP': 'Mathematical Physics',
'math.NA': 'Numerical Analysis',
'math.NT': 'Number Theory',
'math.OA': 'Operator Algebras',
'math.OC': 'Optimization and Control',
'math.PR': 'Probability',
'math.QA': 'Quantum Algebra',
'math.RA': 'Rings and Algebras',
'math.RT': 'Representation Theory',
'math.SG': 'Symplectic Geometry',
'math.SP': 'Spectral Theory',
'math.ST': 'Statistics Theory',
'math-ph': 'Mathematical Physics',
'nlin.AO': 'Adaptation and Self-Organizing Systems',
'nlin.CD': 'Chaotic Dynamics',
'nlin.CG': 'Cellular Automata and Lattice Gases',
'nlin.PS': 'Pattern Formation and Solitons',
'nlin.SI': 'Exactly Solvable and Integrable Systems',
'nucl-ex': 'Nuclear Experiment',
'nucl-th': 'Nuclear Theory',
'physics.acc-ph': 'Accelerator Physics',
'physics.ao-ph': 'Atmospheric and Oceanic Physics',
'physics.app-ph': 'Applied Physics',
'physics.atm-clus': 'Atomic and Molecular Clusters',
'physics.atom-ph': 'Atomic Physics',
'physics.bio-ph': 'Biological Physics',
'physics.chem-ph': 'Chemical Physics',
'physics.class-ph': 'Classical Physics',
'physics.comp-ph': 'Computational Physics',
'physics.data-an': 'Data Analysis, Statistics and Probability',
'physics.ed-ph': 'Physics Education',
'physics.flu-dyn': 'Fluid Dynamics',
'physics.gen-ph': 'General Physics',
'physics.geo-ph': 'Geophysics',
'physics.hist-ph': 'History and Philosophy of Physics',
'physics.ins-det': 'Instrumentation and Detectors',
'physics.med-ph': 'Medical Physics',
'physics.optics': 'Optics',
'physics.plasm-ph': 'Plasma Physics',
'physics.pop-ph': 'Popular Physics',
'physics.soc-ph': 'Physics and Society',
'physics.space-ph': 'Space Physics',
'q-bio.BM': 'Biomolecules',
'q-bio.CB': 'Cell Behavior',
'q-bio.GN': 'Genomics',
'q-bio.MN': 'Molecular Networks',
'q-bio.NC': 'Neurons and Cognition',
'q-bio.OT': 'Other Quantitative Biology',
'q-bio.PE': 'Populations and Evolution',
'q-bio.QM': 'Quantitative Methods',
'q-bio.SC': 'Subcellular Processes',
'q-bio.TO': 'Tissues and Organs',
'q-fin.CP': 'Computational Finance',
'q-fin.EC': 'Economics',
'q-fin.GN': 'General Finance',
'q-fin.MF': 'Mathematical Finance',
'q-fin.PM': 'Portfolio Management',
'q-fin.PR': 'Pricing of Securities',
'q-fin.RM': 'Risk Management',
'q-fin.ST': 'Statistical Finance',
'q-fin.TR': 'Trading and Market Microstructure',
'quant-ph': 'Quantum Physics',
'stat.AP': 'Applications',
'stat.CO': 'Computation',
'stat.ME': 'Methodology',
'stat.ML': 'Machine Learning',
'stat.OT': 'Other Statistics',
'stat.TH': 'Statistics Theory'
}

## Load the Arxiv metadata

In [None]:
# Columns kept from each metadata record; every other field is discarded.
cols = ['id', 'title', 'abstract', 'categories']
data = []
file_name = '/content/drive/MyDrive/RAG/arxiv-metadata-oai-snapshot.json'

# The file is read as JSON Lines: one JSON document per line.
# JSON text is UTF-8 (RFC 8259). Decoding it as latin-1 never raises, but it
# silently turns every non-ASCII character in titles/abstracts into mojibake.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing inside json.loads.
            continue
        doc = json.loads(line)
        data.append([doc[col] for col in cols])

df_data = pd.DataFrame(data=data, columns=cols)

print(df_data.shape)

df_data.head()

## Convert the category codes into text

In [None]:
def get_cat_text(x, mapping=None):
    """Translate a string of arXiv category codes into readable names.

    Args:
        x: Category codes separated by whitespace, e.g. ``'cs.LG stat.ML'``.
        mapping: Optional dict from category code to display name. When
            omitted, the module-level ``category_map`` is used.

    Returns:
        The display names joined with ``', '`` in their original order.
        Codes whose name is ``'Not available'``, and codes missing from the
        mapping, are left out. An empty string is returned when no code has
        a usable name.
    """
    if mapping is None:
        mapping = category_map

    names = []
    # split() without an argument ignores repeated/leading/trailing spaces,
    # so no empty "code" is ever looked up.
    for code in x.split():
        name = mapping.get(code, 'Not available')
        if name != 'Not available':
            names.append(name)

    # Joining the kept names (rather than keying the separator on the loop
    # position) avoids a leading ', ' when the first code has no name.
    return ', '.join(names)


# Derive a readable category string for every paper from its raw codes.
df_data['cat_text'] = [get_cat_text(codes) for codes in df_data['categories']]

df_data.head()

In [None]:
# Row label of the sample record to inspect.
i = 1

# (label, column) pairs in display order; a blank line separates the fields.
sample_fields = [
    ('Id:', 'id'),
    ('Title:', 'title'),
    ('Categories:', 'cat_text'),
    ('Abstract:', 'abstract'),
]

for position, (label, column) in enumerate(sample_fields):
    if position > 0:
        print()
    print(label, df_data.loc[i, column])

## Text Preprocessing

### Text Cleaning

In [9]:
def clean_text(x):
    """Return *x* with each newline turned into a space and the ends trimmed.

    Only newline characters are replaced; runs of spaces inside the text are
    left as they are.
    """
    # Splitting on "\n" and re-joining with " " swaps every newline for
    # exactly one space.
    single_line = " ".join(x.split("\n"))
    return single_line.strip()

# Flatten newlines and trim whitespace in both free-text columns.
for text_column in ('title', 'abstract'):
    df_data[text_column] = df_data[text_column].apply(clean_text)

### Create the text string that will be vectorized

In [10]:
# Text that gets embedded: the title, the literal ' {title} ' separator, then
# the abstract.
title_with_separator = df_data['title'] + ' {title} '
df_data['prepared_text'] = title_with_separator + df_data['abstract']

### Get the data ready for vectorizing

In [None]:
# Plain Python lists, all in dataframe row order, so a position in one list
# refers to the same paper in the others.
chunk_list = list(df_data['prepared_text'])
arxiv_id_list = list(df_data['id'])
cat_list = list(df_data['cat_text'])

# Print the three lengths, one per line.
for prepared_list in (chunk_list, arxiv_id_list, cat_list):
    print(len(prepared_list))

### Create the embedding vectors

In [None]:
# Sentence-embedding model; the same object encodes the search query later on.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every prepared text.
embeddings = model.encode(chunk_list)

embedding_shape = embeddings.shape
print(embedding_shape)
print('Embedding length', embedding_shape[1])

In [15]:
# Persist the embedding matrix to a compressed .npz archive in the working
# directory, stored under the key 'array_data'.
np.savez_compressed('compressed_array.npz', array_data=embeddings)

### Save the embedding vectors and the dataframe

In [None]:
# Report the on-disk size of the saved embeddings archive in mebibytes.
bytes_per_mb = 1024 * 1024
file_size_bytes = os.path.getsize('compressed_array.npz')
file_size_mb = file_size_bytes / bytes_per_mb

print("File size:", file_size_mb, "MB")

In [None]:
# np.load on an .npz archive returns an object that keeps the file open and
# reads each array on access. Reading inside a `with` block closes the file
# as soon as the array has been pulled into memory.
with np.load('compressed_array.npz') as archive:
    loaded_embeddings = archive['array_data']

loaded_embeddings.shape

In [18]:
# Write the prepared dataframe to a gzip-compressed CSV in the working
# directory, without the index column.
df_data.to_csv('compressed_dataframe.csv.gz', compression='gzip', index=False)

In [None]:
# Every saved column holds text, so read everything back as strings and turn
# off NaN inference. With the defaults, pandas would parse numeric-looking
# arXiv ids (e.g. '0704.0001') as floats, losing the leading zero, and would
# turn empty `cat_text` cells into NaN.
df = pd.read_csv(
    'compressed_dataframe.csv.gz',
    compression='gzip',
    dtype=str,
    keep_default_na=False,
)

print(df.shape)

df.head(2)

## Set up FAISS with Nearest Neighbor Search

In [None]:
# Dimensionality of the vectors and number of clusters for the IVF index.
embed_length = embeddings.shape[1]
num_centroids = 5

# A flat L2 index serves as the quantizer of the IVF index.
quantizer = faiss.IndexFlatL2(embed_length)
index = faiss.IndexIVFFlat(quantizer, embed_length, num_centroids)

# The index is trained on the embeddings before any vectors are added.
index.train(embeddings)

if index.is_trained:
    print("training done")
else:
    raise ValueError("error happenned in the training")

index.add(embeddings)

# nprobe is set to 5, the same value as num_centroids.
# NOTE(review): presumably so that every cluster is scanned per query - confirm.
index.nprobe = 5

## Import CrossEncoder for Re-ranking the predicted results

In [None]:
# Cross-encoder used further down to score (query, candidate text) pairs when
# re-ranking the FAISS hits.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

## Import Llama 2 for the Text Summarization Task

In [None]:
# Keep the language model under its own name. `model` already refers to the
# SentenceTransformer that encodes search queries; rebinding it here would
# make the later `model.encode(query)` call run against the causal LM.
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
llm_model = AutoModelForCausalLM.from_pretrained("togethercomputer/LLaMA-2-7B-32K")

# Text-generation pipeline used by summary_generator.
pipeline = transformers.pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=tokenizer,
    # NOTE(review): the model above is already instantiated, so the dtype /
    # device_map / trust_remote_code options below may not take effect here -
    # confirm, and move them to from_pretrained if needed.
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    # Budget 100 *generated* tokens. `max_length` also counts the prompt, and
    # a title + abstract prompt can exceed 100 tokens on its own, leaving no
    # room for the summary.
    max_new_tokens=100,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
def summary_generator(text):
  """Ask the text-generation pipeline to summarize *text*.

  The prompt is the fixed instruction "summarize this text: " followed by
  *text*. Returns whatever the LangChain pipeline wrapper returns for it.
  """
  prompt = "".join(("summarize this text: ", text))
  # A new wrapper around the module-level `pipeline` is built on every call.
  wrapped_pipeline = HuggingFacePipeline(pipeline=pipeline, model_kwargs={'temperature':0})
  return wrapped_pipeline(prompt)

## Test the RAG Search System

In [None]:
# --- Retrieval: embed the query and fetch the nearest chunks from FAISS ---
query_text = "How to use the Simplex Method for minimizing cost in a business?"
query = [query_text]
query_embedding = model.encode(query)

top_k = 5
scores, index_vals = index.search(query_embedding, top_k)

# Only one query was searched, so its hits are in the first row.
pred_list = list(index_vals[0])
pred_strings_list = [chunk_list[item] for item in pred_list]

# --- Re-ranking: score every (query, candidate) pair with the cross-encoder ---
cross_input_list = [[query[0], candidate] for candidate in pred_strings_list]
cross_scores = cross_encoder.predict(cross_input_list)

df = pd.DataFrame(cross_input_list, columns=['query_text', 'pred_text'])
df['original_index'] = index_vals[0]
df['cross_scores'] = cross_scores

# Best cross-encoder score first, with row labels renumbered from 0.
df_sorted = df.sort_values(by='cross_scores', ascending=False).reset_index(drop=True)

### Display Results

In [None]:
# Number of top re-ranked hits to report.
num_results = 3

for rank in range(num_results):

    hit_text = df_sorted.loc[rank, 'pred_text']

    # Map the hit back to its row in the source dataframe.
    source_row = df_sorted.loc[rank, 'original_index']
    paper_id = df_data.loc[source_row, 'id']
    paper_categories = df_data.loc[source_row, 'cat_text']

    print('Link to pdf:', f'https://arxiv.org/pdf/{paper_id}')
    print('Categories:', paper_categories)
    # The text printed under 'Abstract:' is the generated summary of the hit.
    print('Abstract:', summary_generator(hit_text))
    print()