In [None]:
!pip install torch transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import os

# Load the LaBSE tokenizer that matches the encoder below.
model_name = "sentence-transformers/LaBSE"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run on the GPU when CUDA is available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the encoder and place it on the selected device in one step.
model = AutoModel.from_pretrained(model_name).to(device)

# --- Step 1: Load .txt File ---
def load_text_file(file_path):
    """Return the entire contents of a UTF-8 encoded text file as one string.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded contents of the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# --- Step 2: Create chunks ---
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. The final
    chunk may be shorter than ``chunk_size``.

    Args:
        text: The text to split. Words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks, each a single-space-joined string of words. Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A step of zero or less would never advance through the words (endless
    # loop); a negative overlap would silently skip words between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]

# --- Step 3: Generate LaBSE Embeddings ---
def generate_embedding(text_chunk):
    """Embed text with the module-level LaBSE ``tokenizer`` and ``model``.

    The input is tokenized with truncation at 512 tokens, so anything beyond
    that limit does not contribute to the embedding. The hidden state at
    sequence position 0 (the CLS position) is used as the embedding.

    Args:
        text_chunk: Text to embed.

    Returns:
        A CPU tensor holding the position-0 hidden state with size-1
        dimensions squeezed out.
    """
    encoded = tokenizer(
        text_chunk,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        cls_vectors = hidden_states[:, 0, :]

    return cls_vectors.squeeze().cpu()

# --- Step 4: Process File ---
def process_file(file_path):
    """Chunk a text file, embed every chunk, and print a short preview.

    Prints the total number of chunks, then for each chunk its 1-based
    number and the first five values of its embedding.

    Args:
        file_path: Path of the UTF-8 text file to process.
    """
    chunks = chunk_text(load_text_file(file_path))
    print(f"Total Chunks: {len(chunks)}\n")

    # map() is lazy, so each chunk is embedded right before it is printed.
    for idx, embedding in enumerate(map(generate_embedding, chunks)):
        print(f"Chunk {idx+1}:")
        print(f"Embedding (first 5 dims): {embedding[:5].numpy()}\n")  # preview only

# === Example Usage ===
# Replace with your actual file path
if __name__ == "__main__":
    # Point this at the text file to embed.
    file_path = "/content/extracted_text.txt"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
    else:
        process_file(file_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Total Chunks: 30

Chunk 1:
Embedding (first 5 dims): [ 0.20635833 -1.7693748  -0.5702164  -0.13670465 -0.27200317]

Chunk 2:
Embedding (first 5 dims): [ 1.5799331e-03 -2.3937399e+00  7.2274506e-01  1.0395436e+00
  6.9671381e-01]

Chunk 3:
Embedding (first 5 dims): [ 0.24166483 -2.2793846   0.08347116  0.20987248  0.16683964]

Chunk 4:
Embedding (first 5 dims): [ 0.6131148  -1.841223    0.96683174 -0.2893359   0.3893452 ]

Chunk 5:
Embedding (first 5 dims): [-0.11776094 -0.5233538   1.6014001  -0.559241    1.0050087 ]

Chunk 6:
Embedding (first 5 dims): [-0.4282556 -0.9141731  0.8412945 -0.2984397  1.1481879]

Chunk 7:
Embedding (first 5 dims): [-0.05406962 -0.9355334   0.98325586 -0.35334522  0.15905167]

Chunk 8:
Embedding (first 5 dims): [ 1.0195798  -1.193814    1.7520727  -0.04919074  0.02007863]

Chunk 9:
Embedding (first 5 dims): [ 0.19371504 -1.0069826   0.7257872   0.33956596  0.6176164 ]

Chunk 10:
Embedding (first 5 dims): [-0.54686356 -1.0244311   0.5623857   0.24564438  0.3

In [None]:
# prompt: apply similarity search on vectors by asking a query about growth of wheat

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- Step 5: Similarity Search ---
def find_most_similar_chunk(query, chunks, chunk_embeddings):
    """Return the chunk whose embedding is most cosine-similar to the query.

    Args:
        query: Query text; embedded with ``generate_embedding``.
        chunks: Sequence of chunk texts, indexed in parallel with the rows of
            ``chunk_embeddings``.
        chunk_embeddings: 2-D collection of chunk embeddings, one row per chunk.

    Returns:
        A ``(chunk, similarity)`` pair for the highest-scoring chunk.
    """
    # cosine_similarity expects 2-D inputs, so give the query a leading row axis.
    query_row = generate_embedding(query).unsqueeze(0)
    scores = cosine_similarity(query_row, chunk_embeddings)[0]
    best = np.argmax(scores)
    return chunks[best], scores[best]

# --- Step 4 (Modified): Process File and Store Embeddings ---
def process_file_and_store_embeddings(file_path):
    """Chunk a text file and embed every chunk.

    Prints the total number of chunks as a progress message.

    Args:
        file_path: Path of the UTF-8 text file to process.

    Returns:
        A ``(chunks, chunk_embeddings)`` pair: the list of chunk strings and
        a tensor made by stacking the per-chunk embeddings, one row per chunk.

    Raises:
        ValueError: If the file contains no words, so there is nothing to embed.
    """
    chunks = chunk_text(load_text_file(file_path))
    print(f"Total Chunks: {len(chunks)}\n")

    # torch.stack rejects an empty list with an opaque RuntimeError; fail
    # early with a message that names the actual problem.
    if not chunks:
        raise ValueError(f"No text to embed in file: {file_path}")

    chunk_embeddings = [generate_embedding(chunk) for chunk in chunks]
    return chunks, torch.stack(chunk_embeddings)  # one row per chunk

# === Example Usage ===
# Replace with your actual file path
if __name__ == "__main__":
    # Point this at the text file to search.
    file_path = "/content/extracted_text.txt"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
    else:
        # Embed every chunk of the file once, up front.
        chunks, chunk_embeddings = process_file_and_store_embeddings(file_path)

        # Query to look up among the chunks.
        query = "growth of wheat"

        # Pick the chunk with the highest cosine similarity to the query.
        most_similar_chunk, similarity_score = find_most_similar_chunk(
            query, chunks, chunk_embeddings
        )

        print(f"Query: '{query}'")
        print(f"Most Similar Chunk (Similarity: {similarity_score:.4f}):")
        print(most_similar_chunk)


Total Chunks: 1

Query: 'growth of wheat'
Most Similar Chunk (Similarity: 0.3709):
This is the first sentence. This is the second sentence. This is the third sentence. This is a longer text to test chunking capabilities. It needs to be long enough to create multiple chunks based on the chunk size. Let's add more words to ensure we reach the desired length for the demonstration. This helps to illustrate how the text is split and processed. More content to make it longer. Even more content to be sure. This will definitely be long enough for chunking. Adding more text here. And some more text here. This should do the trick for testing chunking and embedding generation.
