In [28]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import requests
from urllib.parse import quote
import requests
import fitz
from io import BytesIO
import getpass
import os
import time
from uuid import uuid4
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from langchain.llms import HuggingFaceEndpoint
from langchain.chains import RetrievalQA
from dotenv import load_dotenv

In [29]:
load_dotenv()

True

In [30]:
# Extract text from url
def extract_text_from_pdf_url(pdf_url):
    response = requests.get(pdf_url)
    with fitz.open("pdf", BytesIO(response.content)) as doc:
        return "\n".join(page.get_text() for page in doc)

# Connecting with Pinecone
def connect_to_pinecone():
    if not os.getenv("PINECONE_API_KEY"):
        os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

    pinecone_api_key = os.environ.get("PINECONE_API_KEY")

    pc = Pinecone(api_key=pinecone_api_key)

    index_name = "rpapers" 

    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    print(existing_indexes)

    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=384,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region=os.environ["PINECONE_ENV"]),
        )
        while not pc.describe_index(index_name).status["ready"]:
            time.sleep(1)

    index = pc.Index(index_name)
    return index

# Set up embeddings
def set_embddings(url):
    embeddings = HuggingFaceBgeEmbeddings(model_name="all-MiniLM-L6-v2")
    text = extract_text_from_pdf_url(url)
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=240)
    chunks = splitter.create_documents([text])
    return embeddings, chunks

# add element to vector store
def vector_store_and_chunking(index, embeddings, chunks):
    vector_store = PineconeVectorStore(index=index, embedding=embeddings)
    documents = [Document(page_content=chunk.page_content, metadata={"source": "arxiv_2301.00001"}) for chunk in chunks]
    uuids = [str(uuid4()) for _ in documents]
    vector_store.add_documents(documents=documents, ids=uuids)
    return vector_store

def query_llm(vector_store, question, llm):
    retriever = vector_store.as_retriever(search_type="similarity", k=4)

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True
    )

    response = qa(question)
    return response


In [40]:
import feedparser
import requests
import fitz  # PyMuPDF
import os

def search_arxiv(query, max_results=3):
    query = quote(query)
    base_url = "http://export.arxiv.org/api/query?"
    search_url = f"{base_url}search_query=all:{query}&start=0&max_results={max_results}"
    return feedparser.parse(search_url)

results = search_arxiv("natural language processing", max_results=3)

all_urls = []
for i, entry in enumerate(results.entries):
    # print(f"\n📌 Title: {entry.title}")
    # print(f"📝 Abstract: {entry.summary[:300]}...\n")
    pdf_url = next(link.href for link in entry.links if link.type == "application/pdf")
    all_urls.append(pdf_url)

print(all_urls)
index = connect_to_pinecone()
embeddings, chunks = set_embddings(all_urls[0])
vector_store = vector_store_and_chunking(index, embeddings, chunks)

llm = HuggingFaceEndpoint(
    repo_id = "mistralai/Mistral-7B-Instruct-v0.3",
    temperature=0.6,
    task="text-generation",
    huggingfacehub_api_token=os.getenv("HF_TOKEN"), 
    max_new_tokens=512
)

question = "What is the paper about?"
response = query_llm(vector_store, question, llm)

print("🧠 Answer:\n", response["result"])
print("\n🔍 Source Chunks:")
for doc in response["source_documents"]:
    print("-", doc.page_content[:300].strip(), "\n")

['http://arxiv.org/pdf/1608.04434v1', 'http://arxiv.org/pdf/2503.02435v1', 'http://arxiv.org/pdf/2202.07138v2']
['rpapers']


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


HfHubHTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.3 (Request ID: Root=1-680947ec-2c4ff9922ea1ecec1ecc5715;8d1384f5-f889-4dc1-a406-6fe0147e1411)

You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.

In [75]:
def extract_abstract(data):
    if ".pdf" in data:
        doc = fitz.open("1706.03762v7.pdf")

        text = ""

        for page in doc:
            text+=page.get_text().lower()
    else:
        text=data
    abstract_index = 0
    try:
        abstract_index = text.index("abstract")
        end_index = text.index("introduction")
    except Exception as e:
        end_index = abstract_index+1000
        print(e)
    
    return text[abstract_index:end_index]

In [81]:
results = search_arxiv("gen ai", max_results=3)
# print(results.entries)
all_urls = []
for i, entry in enumerate(results.entries):
    pdf_url = next(link.href for link in entry.links if link.type == "application/pdf")
    all_urls.append(pdf_url)

print(all_urls)
text = extract_text_from_pdf_url(all_urls[2])
abstract = extract_abstract(text)
print(abstract)

['http://arxiv.org/pdf/2410.11977v4', 'http://arxiv.org/pdf/2502.08056v1', 'http://arxiv.org/pdf/2305.02878v1']
substring not found
The AI generation gap: Are Gen Z students more interested in adopting generative AI 
such as ChatGPT in teaching and learning than their Gen X and Millennial Generation 
teachers? 
Authors: Cecilia Ka Yuk Chan1* & Katherine K. W. Lee1 
 
Affiliation1: The University of Hong Kong 
Address: Centre for the Enhancement of Teaching and Learning (CETL), Room CPD-1.81, 
Centennial Campus, The University of Hong Kong, Pokfulam, Hong Kong 
 
* Corresponding author. Email: cecilia.chan@cetl.hku.hk 
Email: kathkw@connect.hku.hk 
 
Abstract  
This study aimed to explore the experiences, perceptions, knowledge, concerns, and 
intentions of Gen Z students with Gen X and Gen Y teachers regarding the use of generative 
AI (GenAI) in higher education. A sample of students and teachers were recruited to 
investigate the above using a survey consisting of both open and close

In [80]:
from keybert import KeyBERT

kw_model = KeyBERT(model='all-MiniLM-L6-v2')

def extract_topics(text, top_n=5):
    keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
    return [kw for kw, _ in keywords]

extract_topics(abstract)

['minimax', 'recognition', 'learning', 'min', 'samples']

NEXT TASK - Extract DOI, search the paper for title and abstract

In [1]:
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

images = convert_from_path("1706.03762v7.pdf", first_page=1, last_page=1)
image = images[0]

text = pytesseract.image_to_string(image)
print("🔍 OCR Text Preview:\n", text[:1000])

🔍 OCR Text Preview:
 1706.03762v7 [cs.CL] 2 Aug 2023

arXiv

Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.

Attention Is All You Need

Ashish Vaswani* Noam Shazeer* Niki Parmar* Jakob Uszkoreit*
Google Brain Google Brain Google Research Google Research
avaswani@google.com noam@google.com nikip@google.com usz@google.com

Llion Jones* Aidan N. Gomez* ¢ Lukasz Kaiser*
Google Research University of Toronto Google Brain
llion@google.com aidan@cs.toronto.edu lukaszkaiser@google.com

Illia Polosukhin* +
illia.polosukhin@gmail.com

Abstract

The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention me

In [2]:
import re

def extract_arxiv_id(text):
    match = re.search(r'(?:arXiv:)?(\d{4}\.\d{4,5})(v\d+)?', text)
    return match.group(1) if match else None

arxiv_id = extract_arxiv_id(text)
print("✅ arXiv ID:", arxiv_id)

✅ arXiv ID: 1706.03762


In [3]:
import feedparser

def fetch_arxiv_metadata(arxiv_id):
    url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
    feed = feedparser.parse(url)
    if feed.entries:
        entry = feed.entries[0]
        return {
            "title": entry.title.strip(),
            "abstract": entry.summary.strip(),
            "authors": [author.name for author in entry.authors],
            "published": entry.published,
            "url": entry.link
        }
    return None

metadata = fetch_arxiv_metadata(arxiv_id)
print("📄 Title:", metadata["title"])
print("🧠 Abstract:", metadata["abstract"])


📄 Title: Attention Is All You Need
🧠 Abstract: The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks in an encoder-decoder configuration. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer, based
solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to be
superior in quality while being more parallelizable and requiring significantly
less time to train. Our model achieves 28.4 BLEU on the WMT 2014
English-to-German translation task, improving over the existing best results,
including ensembles by over 2 BLEU. On the WMT 2014 English-to-French
translation task, our model establishes a new single-model state-of-the-art
BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction
of the training costs of the best models f

In [4]:
from keybert import KeyBERT

kw_model = KeyBERT(model='all-MiniLM-L6-v2')

def extract_topics(text, top_n=5):
    keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
    return [kw for kw, _ in keywords]

extract_topics(metadata["title"] + " " + metadata["abstract"])

  from .autonotebook import tqdm as notebook_tqdm


['attention', 'decoder', 'encoder', 'parsing', 'neural']

In [7]:
arxiv_tag_to_fullname = {
    # 🧠 Computer Science
    "cs.AI": "Artificial Intelligence",
    "cs.CL": "Computation and Language (NLP)",
    "cs.CV": "Computer Vision and Pattern Recognition",
    "cs.LG": "Machine Learning",
    "cs.IR": "Information Retrieval",
    "cs.SE": "Software Engineering",
    "cs.DB": "Databases",
    "cs.DS": "Data Structures and Algorithms",
    "cs.RO": "Robotics",
    "cs.CR": "Cryptography and Security",
    "cs.SI": "Social and Information Networks",

    # 🔬 Other Scientific Domains
    "math.PR": "Probability",
    "math.OC": "Optimization and Control",
    "stat.ML": "Machine Learning (Statistics side)",
    "physics.comp-ph": "Computational Physics",
    "econ.EM": "Econometrics",
    "q-fin.ML": "Quantitative Finance – ML"
}

In [54]:
topics = list(arxiv_tag_to_fullname.values())
topics 

['Artificial Intelligence',
 'Computation and Language (NLP)',
 'Computer Vision and Pattern Recognition',
 'Machine Learning',
 'Information Retrieval',
 'Software Engineering',
 'Databases',
 'Data Structures and Algorithms',
 'Robotics',
 'Cryptography and Security',
 'Social and Information Networks',
 'Probability',
 'Optimization and Control',
 'Machine Learning (Statistics side)',
 'Computational Physics',
 'Econometrics',
 'Quantitative Finance – ML']

In [60]:
import feedparser
from datetime import datetime

def fetch_data(query, max_results, start_date, end_date):
    base_url = "http://export.arxiv.org/api/query?"
    query = query.replace(" ", "+")
    search_url = f"{base_url}search_query=all:{query}&start=0&max_results={max_results}&sortBy=relevance&sortOrder=descending"
    
    feed = feedparser.parse(search_url)
    papers = []

    for entry in feed.entries:
        published_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")

        if start_date and published_date < start_date:
            continue
        if end_date and published_date > end_date:
            continue

    
        papers.append({
                "title": entry.title.strip(),
                "abstract": entry.summary.strip(),
                "published": published_date.strftime("%Y-%m-%d"),
                "tags": entry.tags[0]["term"] if entry.tags else "unknown"
            })
    
    return papers

from datetime import datetime

start = datetime(2016, 1, 1)
end = datetime(2020, 12, 31)

data = []

# for topic in topics:
# print(topic)
papers = fetch_data("Artificial Intelligence", 200, start, end)
len(papers)
# print("===============================================")

62

In [58]:
(data)

[[{'title': 'The Governance of Physical Artificial Intelligence',
   'abstract': 'Physical artificial intelligence can prove to be one of the most important\nchallenges of the artificial intelligence. The governance of physical\nartificial intelligence would define its responsible intelligent application in\nthe society.',
   'published': '2023-04-06',
   'tags': 'cs.AI'}],
 [{'title': 'NLP for The Greek Language: A Longer Survey',
   'abstract': 'English language is in the spotlight of the Natural Language Processing (NLP)\ncommunity with other languages, like Greek, lagging behind in terms of offered\nmethods, tools and resources. Due to the increasing interest in NLP, in this\npaper we try to condense research efforts for the automatic processing of Greek\nlanguage covering the last three decades. In particular, we list and briefly\ndiscuss related works, resources and tools, categorized according to various\nprocessing layers and contexts. We are not restricted to the modern form o

In [37]:
from datetime import datetime

start = datetime(2019, 1, 1)
end = datetime(2024, 1, 1)

data = []

for topic in topics:
    print(topic)
    papers = fetch_arxiv_data(topic, 200)
    data.append(papers)
    print("===============================================")

Artificial Intelligence
Computation and Language (NLP)
Computer Vision and Pattern Recognition
Machine Learning
Information Retrieval
Software Engineering
Databases
Data Structures and Algorithms
Robotics
Cryptography and Security
Social and Information Networks
Probability
Optimization and Control
Machine Learning (Statistics side)
Computational Physics
Econometrics
Quantitative Finance – ML


In [46]:
flattened_papers = [paper for batch in data for paper in batch]

In [53]:
import pandas as pd

pd.DataFrame(flattened_papers)['tags'].unique()

array(['cs.AI', 'q-fin.TR', 'q-fin.GN', 'cs.CY', 'cs.IR', 'cs.HC',
       'astro-ph.IM', 'math.GM', 'cs.LG', 'cs.RO', 'cs.CR', 'cs.NE',
       'physics.pop-ph', 'q-bio.NC', 'cs.CL', 'q-bio.QM', 'cs.CV',
       'cs.IT', 'cs.ET', 'physics.data-an', 'cs.SE', 'cs.DL', 'quant-ph',
       'cs.NI', 'q-fin.RM', 'cs.DC', 'cmp-lg', 'q-bio.BM', 'eess.IV',
       'nlin.AO', 'cs.CG', 'cs.CC', 'eess.SP', 'math.AG', 'cs.GR',
       'physics.med-ph', 'physics.ao-ph', 'stat.ML', 'physics.comp-ph',
       'stat.CO', 'cond-mat.mtrl-sci', 'cs.CE', 'cs.FL', 'math.NA',
       'q-fin.ST', 'stat.ME', 'q-bio.MN', 'econ.EM', 'q-bio.PE',
       'physics.chem-ph', 'stat.AP', 'cs.GT', 'cs.MM', 'math.FA', 'cs.MS',
       'cs.MA', 'cs.DB', 'physics.acc-ph', 'math.GR', 'cs.DS', 'astro-ph',
       'cs.SD', 'cs.LO', 'math.CO', 'q-bio.GN', 'astro-ph.SR',
       'astro-ph.HE', 'math.OC', 'eess.SY', 'cs.PL', 'cs.SI',
       'physics.soc-ph', 'econ.TH', 'math.PR', 'math.SP', 'math-ph',
       'stat.OT', 'math.LO', 'math.ST

In [20]:
topics[:2]

['Artificial Intelligence', 'Computation and Language (NLP)']