<a href="https://colab.research.google.com/github/Karn2898/MultiLang_NLP_Sysytem/blob/main/MultilangSearch_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import json
from datasets import load_dataset
import os
from typing import List

def generate_synthetic_docs():
    """Generates a small synthetic multilingual corpus."""
    print("Generating synthetic documents...")
    synthetic_data = [
        {"lang": "en", "title": "Synthetic English Article 1", "text": "This is a synthetic document in English. It discusses various concepts in AI and machine learning."},
        {"lang": "hi", "title": "Synthetic Hindi Article 1", "text": "यह हिंदी में एक सिंथेटिक दस्तावेज़ है। इसमें कृत्रिम बुद्धिमत्ता और मशीन लर्निंग के विभिन्न अवधारणाओं पर चर्चा की गई है।"},
        {"lang": "fr", "title": "Synthetic French Article 1", "text": "Ceci est un document synthétique en français. Il aborde divers concepts en IA et en apprentissage automatique."}
    ]
    return synthetic_data

def download_wiki_multilingual(output_dir="data"):

    os.makedirs(output_dir, exist_ok=True)
    languages = {
        "en": "20231101.en",
        "hi": "20231101.hi",
        "fr": "20231101.fr",
        "es": "20231101.es",
        "de": "20231101.de",
        "bn": "20231101.bn"
    }
    all_docs = []
    for lang_code, date in languages.items():
        print(f"📥 Loading Wikipedia {lang_code}...")
        try:

            # Removed lang=lang_code as it conflicts with the builder config for this dataset
            dataset = load_dataset("wikimedia/wikipedia", date, split="train")


            docs = []
            for i, article in enumerate(dataset.select(range(2000))):
                if len(article['text']) > 100:
                    docs.append({
                        "lang": lang_code,
                        "title": article.get('title', f"Article_{i}"),
                        "text": article['text'][:800] + "..."
                    })
            all_docs.extend(docs)
            print(f"   {len(docs)} {lang_code} articles")
        except Exception as e:
            print(f"    {lang_code} failed: {e}")
            continue

    if not all_docs:
        print(" Using synthetic multilingual corpus...")
        synthetic_docs = generate_synthetic_docs()
        all_docs = synthetic_docs
    with open(os.path.join(output_dir, "wiki_multilingual.json"), "w") as f:
        json.dump(all_docs, f, indent=1)

    print(f" Total: {len(all_docs)} multilingual docs saved!")
    return all_docs

In [4]:
download_wiki_multilingual()

📥 Loading Wikipedia en...
    en failed: BuilderConfig ParquetConfig(name='20231101.en', version=0.0.0, data_dir=None, data_files={'train': ['20231101.en/train-*']}, description=None, batch_size=None, columns=None, features=None, filters=None) doesn't have a 'lang' key.
📥 Loading Wikipedia hi...
    hi failed: BuilderConfig ParquetConfig(name='20231101.hi', version=0.0.0, data_dir=None, data_files={'train': ['20231101.hi/train-*']}, description=None, batch_size=None, columns=None, features=None, filters=None) doesn't have a 'lang' key.
📥 Loading Wikipedia fr...
    fr failed: BuilderConfig ParquetConfig(name='20231101.fr', version=0.0.0, data_dir=None, data_files={'train': ['20231101.fr/train-*']}, description=None, batch_size=None, columns=None, features=None, filters=None) doesn't have a 'lang' key.
📥 Loading Wikipedia es...
    es failed: BuilderConfig ParquetConfig(name='20231101.es', version=0.0.0, data_dir=None, data_files={'train': ['20231101.es/train-*']}, description=None, bat

[{'lang': 'en',
  'title': 'Synthetic English Article 1',
  'text': 'This is a synthetic document in English. It discusses various concepts in AI and machine learning.'},
 {'lang': 'hi',
  'title': 'Synthetic Hindi Article 1',
  'text': 'यह हिंदी में एक सिंथेटिक दस्तावेज़ है। इसमें कृत्रिम बुद्धिमत्ता और मशीन लर्निंग के विभिन्न अवधारणाओं पर चर्चा की गई है।'},
 {'lang': 'fr',
  'title': 'Synthetic French Article 1',
  'text': 'Ceci est un document synthétique en français. Il aborde divers concepts en IA et en apprentissage automatique.'}]

In [6]:
download_wiki_multilingual()

📥 Loading Wikipedia en...


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

20231101.en/train-00000-of-00041.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

20231101.en/train-00001-of-00041.parquet:   0%|          | 0.00/351M [00:00<?, ?B/s]

20231101.en/train-00002-of-00041.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

20231101.en/train-00003-of-00041.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

20231101.en/train-00004-of-00041.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

20231101.en/train-00005-of-00041.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

20231101.en/train-00006-of-00041.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

20231101.en/train-00007-of-00041.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

20231101.en/train-00008-of-00041.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

20231101.en/train-00009-of-00041.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

20231101.en/train-00010-of-00041.parquet:   0%|          | 0.00/234M [00:00<?, ?B/s]

20231101.en/train-00011-of-00041.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

20231101.en/train-00012-of-00041.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

20231101.en/train-00013-of-00041.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

20231101.en/train-00014-of-00041.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

20231101.en/train-00015-of-00041.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

20231101.en/train-00016-of-00041.parquet:   0%|          | 0.00/503M [00:00<?, ?B/s]

20231101.en/train-00017-of-00041.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

20231101.en/train-00018-of-00041.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

20231101.en/train-00019-of-00041.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

20231101.en/train-00020-of-00041.parquet:   0%|          | 0.00/225M [00:00<?, ?B/s]

20231101.en/train-00021-of-00041.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

20231101.en/train-00022-of-00041.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

20231101.en/train-00023-of-00041.parquet:   0%|          | 0.00/213M [00:00<?, ?B/s]

20231101.en/train-00024-of-00041.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

20231101.en/train-00025-of-00041.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

20231101.en/train-00026-of-00041.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

20231101.en/train-00027-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

20231101.en/train-00028-of-00041.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

20231101.en/train-00029-of-00041.parquet:   0%|          | 0.00/218M [00:00<?, ?B/s]

20231101.en/train-00030-of-00041.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

20231101.en/train-00031-of-00041.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

20231101.en/train-00032-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

20231101.en/train-00033-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

20231101.en/train-00034-of-00041.parquet:   0%|          | 0.00/219M [00:00<?, ?B/s]

20231101.en/train-00035-of-00041.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

20231101.en/train-00036-of-00041.parquet:   0%|          | 0.00/610M [00:00<?, ?B/s]

20231101.en/train-00037-of-00041.parquet:   0%|          | 0.00/674M [00:00<?, ?B/s]

20231101.en/train-00038-of-00041.parquet:   0%|          | 0.00/538M [00:00<?, ?B/s]

20231101.en/train-00039-of-00041.parquet:   0%|          | 0.00/465M [00:00<?, ?B/s]

20231101.en/train-00040-of-00041.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6407814 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

   2000 en articles
📥 Loading Wikipedia hi...


20231101.hi/train-00000-of-00002.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

20231101.hi/train-00001-of-00002.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/163093 [00:00<?, ? examples/s]

   1942 hi articles
📥 Loading Wikipedia fr...


Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/17 [00:00<?, ?files/s]

20231101.fr/train-00000-of-00017.parquet:   0%|          | 0.00/769M [00:00<?, ?B/s]

20231101.fr/train-00001-of-00017.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

20231101.fr/train-00002-of-00017.parquet:   0%|          | 0.00/348M [00:00<?, ?B/s]

20231101.fr/train-00003-of-00017.parquet:   0%|          | 0.00/296M [00:00<?, ?B/s]

20231101.fr/train-00004-of-00017.parquet:   0%|          | 0.00/284M [00:00<?, ?B/s]

20231101.fr/train-00005-of-00017.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

20231101.fr/train-00006-of-00017.parquet:   0%|          | 0.00/219M [00:00<?, ?B/s]

20231101.fr/train-00007-of-00017.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

20231101.fr/train-00008-of-00017.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

20231101.fr/train-00009-of-00017.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

20231101.fr/train-00010-of-00017.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

20231101.fr/train-00011-of-00017.parquet:   0%|          | 0.00/169M [00:00<?, ?B/s]

20231101.fr/train-00012-of-00017.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

20231101.fr/train-00013-of-00017.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

20231101.fr/train-00014-of-00017.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

20231101.fr/train-00015-of-00017.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

20231101.fr/train-00016-of-00017.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2564646 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

   2000 fr articles
📥 Loading Wikipedia es...


20231101.es/train-00000-of-00013.parquet:   0%|          | 0.00/688M [00:00<?, ?B/s]

20231101.es/train-00001-of-00013.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

20231101.es/train-00002-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

20231101.es/train-00003-of-00013.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

20231101.es/train-00004-of-00013.parquet:   0%|          | 0.00/168M [00:00<?, ?B/s]

20231101.es/train-00005-of-00013.parquet:   0%|          | 0.00/178M [00:00<?, ?B/s]

20231101.es/train-00006-of-00013.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

20231101.es/train-00007-of-00013.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

20231101.es/train-00008-of-00013.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

20231101.es/train-00009-of-00013.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

20231101.es/train-00010-of-00013.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

20231101.es/train-00011-of-00013.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

20231101.es/train-00012-of-00013.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1841155 [00:00<?, ? examples/s]

   2000 es articles
📥 Loading Wikipedia de...


Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/20 [00:00<?, ?files/s]

20231101.de/train-00000-of-00020.parquet:   0%|          | 0.00/781M [00:00<?, ?B/s]

20231101.de/train-00001-of-00020.parquet:   0%|          | 0.00/449M [00:00<?, ?B/s]

20231101.de/train-00002-of-00020.parquet:   0%|          | 0.00/369M [00:00<?, ?B/s]

20231101.de/train-00003-of-00020.parquet:   0%|          | 0.00/293M [00:00<?, ?B/s]

20231101.de/train-00004-of-00020.parquet:   0%|          | 0.00/296M [00:00<?, ?B/s]

20231101.de/train-00005-of-00020.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

20231101.de/train-00006-of-00020.parquet:   0%|          | 0.00/271M [00:00<?, ?B/s]

20231101.de/train-00007-of-00020.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

20231101.de/train-00008-of-00020.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

20231101.de/train-00009-of-00020.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

20231101.de/train-00010-of-00020.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

20231101.de/train-00011-of-00020.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

20231101.de/train-00012-of-00020.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

20231101.de/train-00013-of-00020.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

20231101.de/train-00014-of-00020.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

20231101.de/train-00015-of-00020.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

20231101.de/train-00016-of-00020.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

20231101.de/train-00017-of-00020.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

20231101.de/train-00018-of-00020.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

20231101.de/train-00019-of-00020.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2845308 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

   1999 de articles
📥 Loading Wikipedia bn...


20231101.bn/train-00000-of-00002.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

20231101.bn/train-00001-of-00002.parquet:   0%|          | 0.00/151M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/143069 [00:00<?, ? examples/s]

   1998 bn articles
 Total: 11939 multilingual docs saved!


[{'lang': 'en',
  'title': 'Anarchism',
  'text': 'Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).\n\nHumans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of a...'},
 {'lang': 'en',
  'title': 'Albedo',
  'text': "Albedo (; ) is the fraction of sunlight that is diffusely reflected by a body. It is measure

In [11]:
!pip install faiss-cpu
import numpy as np
import faiss
import json
import pickle
import torch
from sentence_transformers import SentenceTransformer

class WikiEmbedder:
    def __init__(self):
        self.model = SentenceTransformer('intfloat/multilingual-e5-large')
        print(" multilingual-e5-large loaded (100+ languages)")
    def embed_batch(self, texts):
        return self.model.encode(
            texts,
            normalize_embeddings=True,
            batch_size=64,
            show_progress_bar=True
        ).astype('float32')

def build_wiki_index():

   with open("/content/data/wiki_multilingual.json", "r") as f:
    docs = json.load(f)
   texts = [doc["text"] for doc in docs]
   metadata = [{"lang": d["lang"], "title": d["title"]} for d in docs]

   print(f" {len(texts)} docs → embedding...")

   embedder = WikiEmbedder()
   embeddings = embedder.embed_batch(texts)

   # FAISS
   d = embeddings.shape[1]
   index = faiss.IndexFlatIP(d)
   index.add(embeddings)

   faiss.write_index(index, "wiki_index.bin")
   with open("wiki_metadata.pkl", "wb") as f:
       pickle.dump(metadata, f)

   print(f" Wiki index: {index.ntotal} vectors (dim={d})")
   return index
if __name__ == "__main__":
    build_wiki_index()

 11939 docs → embedding...


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

 multilingual-e5-large loaded (100+ languages)


Batches:   0%|          | 0/187 [00:00<?, ?it/s]

 Wiki index: 11939 vectors (dim=1024)


In [24]:
from transformers import pipeline
import faiss
import pickle
import torch
from sentence_transformers import SentenceTransformer

class RAG:
  def __init__(self):
    # Explicitly set device to 'cpu' to avoid CUDA out of memory errors
    self.device = -1 # -1 for CPU, 0 for first GPU
    self.embedder=SentenceTransformer('intfloat/Multilingual-e5-large', device='cpu')
    self.lang_detector=pipeline("text-classification",
                                model="papluca/xlm-roberta-base-language-detection",
                                device=self.device)
    self.generator=pipeline("text2text-generation",
                            model="google/mt5-base",
                            device=self.device)
    self.index = faiss.read_index("wiki_index.bin")
    with open("wiki_metadata.pkl", "rb") as f:
        self.metadata = pickle.load(f)

    print(f" WikiRAG ready: {self.index.ntotal} Wikipedia chunks")

  def search(self, query, k=5):
      """Semantic search"""
      q_emb = self.embedder.encode([query], normalize_embeddings=True)
      scores, indices = self.index.search(q_emb.astype('float32'), k)

      results = []
      for i, idx in enumerate(indices[0]):
          meta = self.metadata[idx]
          results.append({
              "text": f"{meta['title']}: {query[:200]}...",
              "score": float(scores[0][i]),
              "lang": meta['lang']
          })
      return results

  def answer(self, query, results):

      context = "\n".join([r["text"] for r in results[:3]])
      prompt = f"Context: {context}\n\nQuery: {query}\nAnswer:"

      result = self.generator(prompt, max_new_tokens=100,
                            do_sample=True, temperature=0.7)[0]['generated_text']
      return result[len(prompt):].strip()

if __name__ == "__main__":
    rag = RAG()

    queries = {
        "hi": "भारत की राजधानी क्या है?",
        "en": "What is machine learning?",
        "bn": "কৃত্রিম বুদ্ধিমত্তা কী?"
    }

    for lang, q in queries.items():
        print(f"\n{lang}: {q}")
        results = rag.search(q)
        answer = rag.answer(q, results)
        print(f"Answer: {answer}")

Device set to use cpu
Device set to use cpu


RuntimeError: Error in faiss::Index* faiss::read_index(IOReader*, int) at /project/third-party/faiss/faiss/impl/index_read.cpp:1472: Index type 0x6cbffa3d ("=\xfa\xbfl") not recognized

In [22]:
rag = RAG()

queries = {
    "hi": "भारत की राजधानी क्या है?",
    "en": "What is machine learning?",
    "bn": "কৃত্রিম বুদ্ধিমত্তা কী?"
}

for lang, q in queries.items():
    print(f"\n{lang}: {q}")
    results = rag.search(q)
    answer = rag.answer(q, results)
    print(f"Answer: {answer}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 14.12 MiB is free. Process 14261 has 14.72 GiB memory in use. Of the allocated memory 14.49 GiB is allocated by PyTorch, and 116.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)