In [4]:
import warnings
warnings.filterwarnings("ignore")
import arxiv
import pymupdf
#import fitz  # PyMuPDF for extracting text from PDFs

from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, PointStruct
import os
import requests



In [5]:
import ollama

COLLECTION_NAME = "arvix_papers"

# Initialize Ollama client
oclient = ollama.Client(host="localhost")

# Initialize Qdrant client
qclient = QdrantClient(host="localhost", port=6333)



# Function to fetch papers from arXiv
def fetch_arxiv_papers(query="quantum computing", max_results=5):
    search = arxiv.Search(query=query, max_results=max_results)
    papers = search.results()
    return papers

# Function to parse and extract the title and abstract from the arXiv paper
def parse_arxiv_paper(paper):
    title = paper.title
    abstract = paper.summary
    pdf_url = paper.pdf_url  # PDF URL for extracting content from the PDF
    return title, abstract, pdf_url

# Function to extract text from a PDF using PyMuPDF
def extract_text_from_pdf(pdf_url):
    # Open the PDF using PyMuPDF (fitz)
    print(f"Extracting text from PDF: {pdf_url}")
    r = requests.get(pdf_url)
    data = r.content
    pdf_doc = pymupdf.Document(stream=data)
    text = ""
    for page in pdf_doc:
        text += page.get_text()  # Extract text from each page
    return text

def chunk_text_by_length(text, chunk_size):
    """
    Splits the given text into smaller chunks of a specified character length.

    Args:
    - text (str): The input text to chunk.
    - chunk_size (int): The maximum number of characters per chunk.

    Returns:
    - list of str: A list of text chunks.
    """
    # Split text into chunks of the specified size
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    
    return chunks

def get_embeddings(model, chunks):

    embed = []
    text = []
    for ch in chunks:
        embed.append(oclient.embeddings(model=model, prompt=ch)['embedding'])
        text.append(ch)

    return embed, text


def create_qdrant_index(collection_name=COLLECTION_NAME):
    collections = qclient.get_collections()
    existing_coll = [collection.name for collection in collections.collections]
    print(f"Existing collections {existing_coll}")
    if collection_name not in existing_coll:
        qclient.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=2048, distance="Cosine")  # tinyllama embed dimension
        )


# Function to index the chunks and their embeddings into Qdrant
def index_chunks_in_qdrant(chunks, embeddings, collection_name=COLLECTION_NAME):
    points = []
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        point = PointStruct(
            id=i,
            vector=embedding,
            payload={"text": chunk}  # The chunk of text as metadata
        )
        points.append(point)
    
    # Upsert the points in Qdrant
    qclient.upsert(collection_name=collection_name, points=points)

In [37]:
papers = fetch_arxiv_papers(query="quantum computing", max_results=1)
for paper in papers:
    print(f"Processing paper {paper.title}")

    final_embed = []
    final_text = []

    title, abstract, url = parse_arxiv_paper(paper)
    print(f"Extracting {title}")

    pdf_text = extract_text_from_pdf(url)
    
    full_text = abstract + "\n\n" + pdf_text

    print(f"Chunking text into smaller pieces.")
    chunks = chunk_text_by_length(full_text, 500)

    embeds, text = get_embeddings(model="tinyllama", chunks=chunks)
    print(embeds[0], text[0])

    # list concatenation
    final_embed += embeds
    final_text += text

    # create collection
create_qdrant_index()

index_chunks_in_qdrant(final_text[0:2], final_embed[0:2])


Processing paper The Rise of Quantum Internet Computing
Extracting The Rise of Quantum Internet Computing
Extracting text from PDF: http://arxiv.org/pdf/2208.00733v1
Chunking text into smaller pieces.
>>>>>>> localhost 11434
[1.0391780138015747, 0.5044060945510864, -1.2625844478607178, 1.7051310539245605, 0.43921470642089844, 0.794536828994751, 1.0942450761795044, 2.873713493347168, -2.12315034866333, -0.6853867173194885, 0.46445050835609436, -1.7939075231552124, -2.0619542598724365, -1.2293833494186401, 2.081066846847534, -3.161661148071289, 3.252192974090576, 0.3250139355659485, -1.9646117687225342, -2.192922592163086, -1.3241469860076904, -0.14608533680438995, 2.866281270980835, -0.06599985808134079, 2.4832851886749268, 0.6131629347801208, 1.1946098804473877, 3.487541675567627, 0.1328343003988266, 1.799081802368164, -0.5285810828208923, 1.4253334999084473, 0.3634167015552521, -3.608160972595215, 0.963557779788971, 1.7489545345306396, 1.351354718208313, -0.0648336410522461, 1.2288811

In [21]:
from qdrant_client import QdrantClient, models
import ollama

COLLECTION_NAME = "NicheApplications"

# Initialize Ollama client
oclient = ollama.Client(host="localhost")

# Initialize Qdrant client
qclient = QdrantClient(host="localhost", port=6333)

# Text to embed
text = "Ollama excels in niche applications with specific embeddings"

# Generate embeddings
response = oclient.embeddings(model="tinyllama", prompt=text)
embeddings = response["embedding"]
print(len(embeddings), isinstance(embeddings, list))
# Create a collection if it doesn't already exist
if not qclient.collection_exists(COLLECTION_NAME):
    qclient.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=len(embeddings), distance=models.Distance.COSINE
        ),
    )

# Upload the vectors to the collection along with the original text as payload
qclient.upsert(
    collection_name=COLLECTION_NAME,
    points=[models.PointStruct(id=1, vector=embeddings, payload={"text": text})],
)

>>>>>>> localhost 11434
2048 True
>>>>>>> localhost 6333
>>>>>>> localhost 6333


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)