<a href="https://colab.research.google.com/github/Karn2898/MultiLang_NLP_Sysytem/blob/main/MultilangSearch_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
from datasets import load_dataset
import json
from pathlib import Path
import os

def download_xnli(languages=None, max_per_lang=2500,
                  data_dir="/workspaces/MultiLang_NLP_Sysytem/data"):
    """Download XNLI validation pairs and store them as a single JSON corpus.

    For every requested language the ``validation`` split of the ``xnli``
    dataset is loaded and up to ``max_per_lang`` rows are flattened into
    documents. The combined list is written to ``<data_dir>/xnli.json`` and
    also returned.

    Args:
        languages: Iterable of XNLI language codes. ``None`` selects the 15
            codes listed in ``default_languages`` below.
        max_per_lang: Maximum number of rows kept per language. ``None``
            disables the cap.
        data_dir: Directory that receives ``xnli.json``; created if missing.

    Returns:
        A list of dicts with the keys ``text`` (premise, hypothesis and label
        joined into one string, truncated to 800 characters), ``lang``,
        ``premise`` (truncated to 100 characters) and ``id``
        (``"<lang>_<row index>"``).
    """
    default_languages = (
        'en', 'fr', 'es', 'de', 'el', 'bg', 'ru', 'tr', 'hi', 'th',
        'vi', 'zh', 'ar', 'ur', 'sw',
    )
    if languages is None:
        languages = default_languages

    out_dir = Path(data_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    all_docs = []
    failed = []
    for lang in languages:
        print(f"Loading XNLI {lang}...")
        try:
            dataset = load_dataset("xnli", lang, split="validation")

            for i, item in enumerate(dataset):
                if max_per_lang is not None and i >= max_per_lang:
                    break

                doc_text = f"Premise: {item['premise']} Hypothesis: {item['hypothesis']} Label: {item['label']}"
                all_docs.append({
                    "text": doc_text[:800],
                    "lang": lang,
                    "premise": item['premise'][:100],
                    "id": f"{lang}_{i}",
                })

        except Exception as e:
            # Best effort: one unavailable language must not abort the whole
            # download, but remember it so the gap is visible in the summary.
            print(f" {lang}: {e}")
            failed.append(lang)
            continue

    # json.dump escapes non-ASCII characters by default, so the file content is
    # plain ASCII; the explicit encoding just removes any locale dependence.
    with open(out_dir / "xnli.json", "w", encoding="utf-8") as f:
        json.dump(all_docs, f)

    if failed:
        print(f" Skipped languages: {', '.join(failed)}")
    print(f" XNLI ready: {len(all_docs)} NLI pairs, {len(set(d['lang'] for d in all_docs))} languages")
    return all_docs

# Keep the returned corpus in `all_docs`: the cell that re-saves the dataset
# reads that name, and previously it was only assigned by a later cell, so a
# fresh top-to-bottom run failed with NameError.
if __name__ == "__main__":
    all_docs = download_xnli()


Loading XNLI en...
Loading XNLI fr...
Loading XNLI es...
Loading XNLI de...
Loading XNLI el...
Loading XNLI bg...
Loading XNLI ru...
Loading XNLI tr...
Loading XNLI hi...
Loading XNLI th...
Loading XNLI vi...
Loading XNLI zh...
Loading XNLI ar...
Loading XNLI ur...


ur/train-00000-of-00001.parquet:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

ur/test-00000-of-00001.parquet:   0%|          | 0.00/428k [00:00<?, ?B/s]

ur/validation-00000-of-00001.parquet:   0%|          | 0.00/216k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Loading XNLI sw...


sw/train-00000-of-00001.parquet:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

sw/test-00000-of-00001.parquet:   0%|          | 0.00/312k [00:00<?, ?B/s]

sw/validation-00000-of-00001.parquet:   0%|          | 0.00/158k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

 XNLI ready: 37350 NLI pairs, 15 languages


In [19]:
import json
from pathlib import Path

# NOTE(review): this cell reads `all_docs` from the kernel namespace; it must
# already have been assigned by an earlier executed cell.

# Ensure the output directory is present (no-op when it already exists).
Path("/workspaces/MultiLang_NLP_Sysytem/data").mkdir(exist_ok=True, parents=True)

# Serialize the in-memory corpus and write it to disk in a single call.
Path("/workspaces/MultiLang_NLP_Sysytem/data/xnli.json").write_text(json.dumps(all_docs))

print(f"Dataset saved: {len(all_docs)} documents")

Dataset saved: 37350 documents


In [17]:
# Build the corpus and keep it in the kernel namespace as `all_docs`.
# download_xnli() also rewrites xnli.json on every call.
all_docs = download_xnli()

Loading XNLI en...
Loading XNLI fr...
Loading XNLI es...
Loading XNLI de...
Loading XNLI el...
Loading XNLI bg...
Loading XNLI ru...
Loading XNLI tr...
Loading XNLI hi...
Loading XNLI th...
Loading XNLI vi...
Loading XNLI zh...
Loading XNLI ar...
Loading XNLI ur...
Loading XNLI sw...
 XNLI ready: 37350 NLI pairs, 15 languages


In [9]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device set to: {device}")

Device set to: cuda


In [18]:
import json
import numpy as np
import faiss
import pickle
from sentence_transformers import SentenceTransformer

def xnli_index(data_dir="/workspaces/MultiLang_NLP_Sysytem/data",
               model_name='intfloat/multilingual-e5-base', batch_size=64):
    """Embed the cached XNLI corpus and build a FAISS inner-product index.

    Reads ``<data_dir>/xnli.json``, encodes every document's ``text`` with a
    SentenceTransformer model (embeddings are requested normalized, so
    inner-product scores behave like cosine similarity), adds the vectors to a
    flat inner-product index and writes two artifacts next to the corpus:
    ``xnli_index.bin`` (the index) and ``xnli_metadata.pkl`` (a pickled list of
    ``{"lang", "premise", "id"}`` dicts in the same order as the vectors).

    Args:
        data_dir: Directory containing ``xnli.json``; outputs are written here.
        model_name: SentenceTransformer model identifier.
        batch_size: Batch size passed to ``encode``.

    Returns:
        The populated FAISS index.

    Raises:
        ValueError: If the corpus file contains no documents.
    """
    # Local import: the cell that defines this function does not import pathlib.
    from pathlib import Path

    data_dir = Path(data_dir)

    with open(data_dir / "xnli.json", "r", encoding="utf-8") as f:
        docs = json.load(f)

    if not docs:
        # Without this guard the failure would surface later as an opaque
        # IndexError on embeddings.shape[1].
        raise ValueError(f"No documents found in {data_dir / 'xnli.json'}; nothing to index")

    texts = [doc["text"] for doc in docs]
    metadata = [{"lang": d["lang"], "premise": d["premise"], "id": d["id"]} for d in docs]

    print(f" Embedding {len(texts)} XNLI docs...")

    # NOTE(review): the multilingual-e5 model card recommends prefixing inputs
    # with "query: " / "passage: ". Texts are embedded unprefixed here; confirm
    # the query side is encoded the same way before changing either.
    embedder = SentenceTransformer(model_name)
    embeddings = embedder.encode(
        texts, normalize_embeddings=True, batch_size=batch_size
    ).astype('float32')

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    # faiss.write_index expects a plain string path.
    faiss.write_index(index, str(data_dir / "xnli_index.bin"))
    with open(data_dir / "xnli_metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)

    print(f" XNLI indexed: {index.ntotal} docs ")
    return index

# Build and persist the index when this cell is executed as the main module;
# the index object returned by xnli_index() is not kept here.
if __name__ == "__main__":
    xnli_index()


 Embedding 32370 XNLI docs...
 XNLI indexed: 32370 docs 


In [14]:
import subprocess
import sys

# Install through the interpreter that runs this kernel: a bare `pip` on PATH
# may belong to a different Python environment, leaving `import faiss` broken.
# NOTE(review): in file order this install comes after the cell that imports
# faiss; it should run before that cell on a fresh kernel.
subprocess.run([sys.executable, "-m", "pip", "install", "faiss-cpu"], check=True)

CompletedProcess(args=['pip', 'install', 'faiss-cpu'], returncode=0)