In [None]:
!pip install laserembeddings[torch]



In [None]:
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.11/dist-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [None]:
import os
from laserembeddings import Laser
import numpy as np

# --- Step 1: Initialize LASER model ---
# A single module-level Laser instance, constructed with no arguments and
# shared by every generate_embedding() call below.
# NOTE(review): presumably this loads the model files fetched by the
# `download-models` cell above -- confirm against the laserembeddings docs.
laser = Laser()

# --- Step 2: Load .txt File ---
def load_text_file(file_path):
    """Read a UTF-8 encoded text file and return its whole content.

    Args:
        file_path: Path of the file to open for reading.

    Returns:
        The complete file content as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

# --- Step 3: Create chunks (500 words with 50 overlap) ---
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between neighbouring chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). An
        empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size gives a zero/negative step (the window would never
    # advance); a negative overlap would silently skip words between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This window already reaches the last word; one more step would only
        # emit a chunk made entirely of words contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

# --- Step 4: Generate LASER Embeddings ---
def generate_embedding(text_chunk, lang='ur'):
    """Embed one piece of text with the module-level ``laser`` model.

    Args:
        text_chunk: The text to embed.
        lang: Language code forwarded to ``laser.embed_sentences``. Defaults
            to ``'ur'`` (the value this notebook previously hard-coded), so
            existing callers behave as before; pass another code when the
            text is not Urdu (e.g. an English search query).

    Returns:
        The first element of the result of ``laser.embed_sentences``, i.e.
        the embedding for ``text_chunk`` (a numpy array).
    """
    return laser.embed_sentences(text_chunk, lang=lang)[0]

# --- Step 5: Process File ---
def process_file(file_path):
    """Chunk a text file, embed each chunk, and print a short preview.

    Prints the total number of chunks, then for every chunk (numbered from 1)
    the first five components of its embedding.

    Args:
        file_path: Path of the text file to process.
    """
    chunks = chunk_text(load_text_file(file_path))
    print(f"Total Chunks: {len(chunks)}\n")

    for number, piece in enumerate(chunks, start=1):
        vector = generate_embedding(piece)
        print(f"Chunk {number}: Embedding (first 5 dims): {vector[:5]}\n")

# === Example Usage ===
if __name__ == "__main__":
    # Input document to embed; point this at your own file.
    file_path = "/content/extracted_text1.txt"  # <-- Replace with your actual file
    if not os.path.exists(file_path):
        # Report a missing input instead of failing inside open().
        print(f"File not found: {file_path}")
    else:
        process_file(file_path)


Total Chunks: 155

Chunk 1: Embedding (first 5 dims): [ 0.0506123  -0.00010796  0.02291874  0.06528897 -0.00026663]

Chunk 2: Embedding (first 5 dims): [ 0.02469403  0.00201142  0.03704242  0.02620159 -0.01220574]

Chunk 3: Embedding (first 5 dims): [ 0.02908794  0.00523216  0.03650692  0.0248464  -0.00941808]

Chunk 4: Embedding (first 5 dims): [ 6.1991587e-02  3.6680706e-02  4.2167511e-02  1.4587443e-01
 -3.1575317e-07]

Chunk 5: Embedding (first 5 dims): [0.00521287 0.00124115 0.01672483 0.03176388 0.00118154]

Chunk 6: Embedding (first 5 dims): [0.01906445 0.00146967 0.03441738 0.02913912 0.00443868]

Chunk 7: Embedding (first 5 dims): [ 0.01855467  0.0034041   0.03789367  0.07192599 -0.00017382]

Chunk 8: Embedding (first 5 dims): [ 8.7888557e-03  9.0090916e-06 -4.6455378e-05  3.7670892e-02
  1.9004548e-04]

Chunk 9: Embedding (first 5 dims): [0.01221285 0.00017919 0.00177065 0.04409258 0.00104841]

Chunk 10: Embedding (first 5 dims): [ 0.01644565  0.00069443  0.04747762  0.040502

In [None]:
# prompt: write code to perform similarity search on vectors using a query about cotton growth areas

# --- Step 6: Store Embeddings ---
# Module-level store filled by process_file_with_storage(): a list of
# (chunk, embedding) tuples, one per chunk, in the order the chunks were
# produced. It starts empty and is reset on every call to that function.
chunk_embeddings = []

# --- Step 6a: Process file and store each chunk with its embedding (variant of Step 5) ---
def process_file_with_storage(file_path):
    """Chunk and embed a text file, keeping the results in ``chunk_embeddings``.

    The module-level ``chunk_embeddings`` list is rebound to a fresh empty
    list before the file is read, then receives one ``(chunk, embedding)``
    tuple per chunk. The total number of chunks is printed.

    Args:
        file_path: Path of the text file to process.
    """
    global chunk_embeddings
    # Start from a clean store so results from an earlier run never leak in.
    chunk_embeddings = []

    pieces = chunk_text(load_text_file(file_path))
    print(f"Total Chunks: {len(pieces)}\n")

    for piece in pieces:
        vector = generate_embedding(piece)
        chunk_embeddings.append((piece, vector))

# --- Step 7: Perform Similarity Search ---
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``.

    Args:
        vec1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine similarity. If either vector has zero norm the cosine is
        undefined; ``0.0`` is returned in that case instead of dividing by
        zero, which would produce NaN and corrupt any later ranking.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

def find_similar_chunks(query, embeddings_data, top_n=3):
    """Rank stored chunks by cosine similarity to a query.

    Args:
        query: Query text; embedded with ``generate_embedding``.
        embeddings_data: Iterable of ``(chunk, embedding)`` pairs.
        top_n: Number of best matches to return.

    Returns:
        A list of at most ``top_n`` ``(chunk, similarity)`` tuples, ordered
        from most to least similar.
    """
    query_vector = generate_embedding(query)
    scored = [
        (text, cosine_similarity(query_vector, vector))
        for text, vector in embeddings_data
    ]
    # Highest score first; ties keep their original relative order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# === Example Usage ===
if __name__ == "__main__":
    # Input document to index; point this at your own file.
    file_path = "/content/extracted_text1.txt"  # <-- Replace with your actual file
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
    else:
        # Build the (chunk, embedding) store for the document.
        process_file_with_storage(file_path)

        # Search text: a question about cotton growth areas.
        query = "cotton growth areas"

        # Retrieve the five best-matching chunks.
        top_similar_chunks = find_similar_chunks(query, chunk_embeddings, top_n=5)

        # Show each score with a 200-character preview of its chunk.
        print(f"\nTop chunks similar to '{query}':")
        for chunk, similarity in top_similar_chunks:
            print(f"Similarity: {similarity:.4f}")
            print(f"Chunk:\n{chunk[:200]}...\n")



Total Chunks: 155


Top chunks similar to 'cotton growth areas':
Similarity: 0.2854
Chunk:
روڈ فیصل آباد 10 تحقیقاتی ادار و برائے زرخیزی زمین ٹھوکر نیاز بیگ لاہور 11 اداره کراپ رپورٹنگ سروس پنجاب لاہور 12 ڈائر یکٹر نیشنل کپاس بریڈنگ انسٹیٹیوٹ (IUB)، بہاولپور ڈائریکٹوریٹ آف ایگریکلچر زونل دف...

Similarity: 0.2591
Chunk:
پہلے ظاہر ہو جاتی ہیں۔ کپاس کی موسمی کاشت کے لئے کھادوں کا استعمال کیمیائی کھادوں کا استعمال زمین کے لیبارٹری تجزیہ ، کپاس کی قسم ، طریقہ کاشت، وقت کاشت اور سابقہ فصل کو مد نظر رکھ کر کریں۔ کا شتکاروں...

Similarity: 0.2570
Chunk:
کاشت کو نشو و نماء اور دیگر مراحل کے لئے زیادہ وقت ملتا ہے اور اس طرح پیداوار اور کوالٹی بہتر ہوتی ہے۔ ان حوصلہ افزاء نتائج سے یہ بات سامنے آئی ہے کہ کپاس کی موسمی کاشت کے ساتھ گیتی کاشت کو بھی ترویج ...

Similarity: 0.2546
Chunk:
لائن اور ویب سائٹ کاشتکار کپاس کے کاشتی امور کے بارے میں زرعی ہیلپ لائن 17000-0800 پر پھیرتا ہفتہ صبح 8 بجے سے شام 8 بجے تک فون کر کے معلومات حاصل کر سکتے ہیں۔ اس کے علاوہ محکمہ زراعت کی ویب سائٹ www....

Similarity: