In [3]:
from google.cloud import storage
import fitz  # PyMuPDF

# Initialize Cloud Storage client
storage_client = storage.Client()
bucket_name = "research_gcp"
prefix = "documents/"  # Change to the uploaded folder name

def list_pdfs_in_gcs(bucket_name, prefix):
    """List all PDFs in a GCS folder."""
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=prefix)  # Get all files in the folder
    return [blob.name for blob in blobs if blob.name.endswith(".pdf")]

def download_and_extract_text(bucket_name, pdf_blob_name):
    """Download a PDF from GCS and extract text."""
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(pdf_blob_name)

    # Download the PDF to memory
    pdf_bytes = blob.download_as_bytes()

    # Extract text using PyMuPDF
    doc = fitz.open("pdf", pdf_bytes)
    text = ""
    for page in doc:
        text += page.get_text()

    return text

# List PDFs in the folder
pdf_files = list_pdfs_in_gcs(bucket_name, prefix)

# Extract text from each PDF
for pdf_file in pdf_files:
    text = download_and_extract_text(bucket_name, pdf_file)
    print(f"Extracted text from {pdf_file}:\n", text[:500])  # Preview first 500 chars


Extracted text from documents/He_Deep_Residual_Learning_CVPR_2016_paper.pdf:
 Deep Residual Learning for Image Recognition
Kaiming He
Xiangyu Zhang
Shaoqing Ren
Jian Sun
Microsoft Research
{kahe, v-xiangz, v-shren, jiansun}@microsoft.com
Abstract
Deeper neural networks are more difﬁcult to train. We
present a residual learning framework to ease the training
of networks that are substantially deeper than those used
previously. We explicitly reformulate the layers as learn-
ing residual functions with reference to the layer inputs, in-
stead of learning unreferenced functio
Extracted text from documents/NIPS-2017-attention-is-all-you-need-Paper.pdf:
 Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser∗
Google Bra

In [6]:
from vertexai.language_models import TextEmbeddingModel

model = TextEmbeddingModel.from_pretrained("text-embedding-005")

def generate_embedding(text):
    response = model.get_embeddings([text])
    return response[0].values  # Returns a vector

embedding = generate_embedding("This is a sample text.")
print(embedding)


[-0.06342040002346039, -0.0075930203311145306, 0.007861461490392685, -0.021867837756872177, 0.0505375862121582, -0.036087390035390854, -0.009791307151317596, 0.015289794653654099, 0.05745871365070343, 0.029468629509210587, -0.03695201501250267, -0.060899171978235245, -0.005590120796114206, -0.06503001600503922, 0.00953863374888897, 0.03386051952838898, 0.060100361704826355, -0.07576009631156921, -0.029078669846057892, -0.004188651219010353, -2.6081756004714407e-05, -0.031244754791259766, -0.09096626937389374, -0.029172629117965698, 0.048780228942632675, -0.02397044189274311, -0.01663057692348957, -0.024809688329696655, -0.043611232191324234, -0.04390714690089226, 0.0017592888325452805, -0.02581915818154812, 0.0280209518969059, 0.016490239650011063, 0.005416413769125938, -0.0012446145992726088, 0.06263220310211182, 0.013571856543421745, 0.017047639936208725, -0.039034124463796616, -0.028913697227835655, -0.04759407415986061, -0.021188586950302124, -0.015150039456784725, -0.0299077518284

In [14]:
from google.cloud import storage
import fitz  # PyMuPDF

# Initialize Cloud Storage client
storage_client = storage.Client()
bucket_name = "research_gcp"
prefix = "documents/"  # Change to the uploaded folder name

def list_pdfs_in_gcs(bucket_name, prefix):
    """List all PDFs in a GCS folder."""
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=prefix)  # Get all files in the folder
    return [blob.name for blob in blobs if blob.name.endswith(".pdf")]

def download_and_extract_text(bucket_name, pdf_blob_name):
    """Download a PDF from GCS and extract text."""
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(pdf_blob_name)

    # Download the PDF to memory
    pdf_bytes = blob.download_as_bytes()

    # Extract text using PyMuPDF
    doc = fitz.open("pdf", pdf_bytes)
    text = ""
    for page in doc:
        text += page.get_text()

    return text

# List PDFs in the folder
pdf_files = list_pdfs_in_gcs(bucket_name, prefix)

# Extract text from each PDF
extracted_texts = []
for pdf_file in pdf_files:
    text = download_and_extract_text(bucket_name, pdf_file)
    extracted_texts.append(text)

print("Extracted text from PDFs successfully!")


Extracted text from PDFs successfully!


In [12]:
!pip install google-cloud-aiplatform numpy




In [15]:
import vertexai
from vertexai.language_models import TextEmbeddingModel
import numpy as np

# Initialize Vertex AI
vertexai.init(project="resolute-winter-447814-t5", location="us-central1")

# Load the Text Embedding Model
model = TextEmbeddingModel.from_pretrained("text-embedding-005")

# Convert extracted text into embeddings
embeddings = model.get_embeddings(extracted_texts)

# Convert embeddings into a NumPy array
np_embeddings = np.array([embedding.values for embedding in embeddings])

print("Generated embeddings successfully!")


Generated embeddings successfully!


In [17]:
import faiss
import numpy as np

# Ensure embeddings are float32
np_embeddings = np.array(np_embeddings, dtype=np.float32)

# Define FAISS index (L2 norm for similarity search)
embedding_dimension = np_embeddings.shape[1]  # Get embedding size
index = faiss.IndexFlatL2(embedding_dimension)

# Add embeddings to FAISS index
index.add(np_embeddings)

print("Stored embeddings in FAISS successfully!")


Stored embeddings in FAISS successfully!


In [19]:
def search_faiss(query_text, top_k=10):
    """Retrieve the top-k most relevant documents for a query."""
    # Convert query to embedding
    query_embedding = model.get_embeddings([query_text])
    query_vector = np.array([query_embedding[0].values], dtype=np.float32)  # Ensure float32 format

    # Search FAISS for the top-k nearest neighbors
    distances, indices = index.search(query_vector, top_k)

    print(f"Top-{top_k} relevant document indices:", indices[0])
    print(f"Distances:", distances[0])

    # Retrieve the corresponding text documents
    top_documents = [extracted_texts[i] for i in indices[0]]

    return top_documents

# Example Query
query = "What is deep residual?"
top_results = search_faiss(query, top_k=10)

# Print the first result
print("Most relevant document:", top_results[0])


Top-10 relevant document indices: [ 0  1  2 -1 -1 -1 -1 -1 -1 -1]
Distances: [7.2774649e-01 1.0805527e+00 1.1320330e+00 3.4028235e+38 3.4028235e+38
 3.4028235e+38 3.4028235e+38 3.4028235e+38 3.4028235e+38 3.4028235e+38]
Most relevant document: Deep Residual Learning for Image Recognition
Kaiming He
Xiangyu Zhang
Shaoqing Ren
Jian Sun
Microsoft Research
{kahe, v-xiangz, v-shren, jiansun}@microsoft.com
Abstract
Deeper neural networks are more difﬁcult to train. We
present a residual learning framework to ease the training
of networks that are substantially deeper than those used
previously. We explicitly reformulate the layers as learn-
ing residual functions with reference to the layer inputs, in-
stead of learning unreferenced functions. We provide com-
prehensive empirical evidence showing that these residual
networks are easier to optimize, and can gain accuracy from
considerably increased depth. On the ImageNet dataset we
evaluate residual nets with a depth of up to 152 layers—8×
de

In [9]:
!pip install faiss-gpu


Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [2]:
!pip install google-cloud-storage pymupdf


Collecting pymupdf
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.2


In [29]:
from vertexai.language_models import TextEmbeddingModel
from google.cloud import storage
import fitz  # PyMuPDF
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Initialize models
embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-005")
storage_client = storage.Client()

# GCS bucket details
bucket_name = "research_gcp"
prefix = "documents/"

# Function to generate embeddings
def generate_embedding(text):
    response = embedding_model.get_embeddings([text])
    return np.array(response[0].values)  # Convert to numpy array

# Function to list PDFs in GCS
def list_pdfs_in_gcs(bucket_name, prefix):
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=prefix)
    return [blob.name for blob in blobs if blob.name.endswith(".pdf")]

# Function to extract text from PDF
def download_and_extract_text(bucket_name, pdf_blob_name):
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(pdf_blob_name)
    pdf_bytes = blob.download_as_bytes()
    doc = fitz.open("pdf", pdf_bytes)
    text = "".join([page.get_text() for page in doc])
    return text

# Extract and embed text from all PDFs
pdf_files = list_pdfs_in_gcs(bucket_name, prefix)
text_chunks = []
embeddings = []

for pdf_file in pdf_files:
    text = download_and_extract_text(bucket_name, pdf_file)
    text_chunks.append(text)
    embeddings.append(generate_embedding(text))

# Convert to numpy array
embeddings = np.array(embeddings)

# Function to answer questions
def answer_question(query):
    query_embedding = generate_embedding(query)
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    best_match_idx = np.argmax(similarities)
    return text_chunks[best_match_idx]  # Return the best-matching text

# Example usage
user_question = "what is bert pretraining"
response = answer_question(user_question)
print("Best matching passage:", response[:500])  # Print first 500 characters


Best matching passage: Proceedings of NAACL-HLT 2019, pages 4171–4186
Minneapolis, Minnesota, June 2 - June 7, 2019. c⃝2019 Association for Computational Linguistics
4171
BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding
Jacob Devlin
Ming-Wei Chang
Kenton Lee
Kristina Toutanova
Google AI Language
{jacobdevlin,mingweichang,kentonl,kristout}@google.com
Abstract
We introduce a new language representa-
tion model called BERT, which stands for
Bidirectional Encoder Representations from
Transf
