#  Install Required Packages

In [None]:
!pip install -q langchain sentence-transformers faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

#  Load Dataset from Google Drive

In [None]:
# Location of the raw GUVI knowledge-base text on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# call is visible in this notebook - confirm before a fresh run.
text_path = "/content/drive/MyDrive/GUVI dataset.txt"

# Read the whole file into memory as a single UTF-8 string.
with open(text_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

print(f"Total characters: {len(raw_text)}")
print(raw_text[:500])  # preview first 500 characters

Total characters: 20374
Overview of GUVI
GUVI (Grab Ur Vocational Interest) is an Indian edtech startup focused on providing accessible, high-quality coding education through vernacular and English languages. It aims to bridge the digital skills gap and empower learners across India and beyond with job-oriented technology courses.

GUVI, which stands for Grab Ur Vocational Interest, is a pioneering edtech startup founded in 2017 with the mission to democratize coding education across India. Originating from the incubat


#  Split Text into Chunks for Embedding

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the raw text into small overlapping chunks suitable for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,  # maximum chunk length in characters (default length function is len), not tokens
    chunk_overlap=50,  # characters shared between neighbouring chunks
    separators=["\n\n", "\n", ".", " "]  # tried in this order: paragraph, line, sentence, word
)

# `chunks` is a list of strings; a chunk may still contain internal newlines.
chunks = text_splitter.split_text(raw_text)
print(f"✅ Total chunks created: {len(chunks)}")
print("\nExample Chunk:\n", chunks[0])


✅ Total chunks created: 87

Example Chunk:
 Overview of GUVI


#  Generate Embeddings for Chunks

In [None]:
from sentence_transformers import SentenceTransformer

# Embed every chunk with a small sentence-embedding model.
# NOTE: the name `model` is rebound to the Mistral causal LM in a later cell,
# so it must not be used for embeddings after that point; later cells use a
# separate `embedder` object created from the same model name.
model = SentenceTransformer("all-MiniLM-L6-v2")  # lightweight & fast
embeddings = model.encode(chunks, show_progress_bar=True)

# Shape of a single chunk vector (the embedding dimension).
print(f"✅ Embedding shape: {embeddings[0].shape}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Embedding shape: (384,)


#  Create and Save FAISS Index

In [None]:
import faiss
import numpy as np
import os

# Create the target folder if it doesn't exist
os.makedirs("/content/drive/MyDrive/guvi_rag", exist_ok=True)

# Build FAISS index (exact L2 search over the chunk embeddings).
# FAISS expects a contiguous float32 matrix, so convert explicitly instead of
# relying on whatever dtype the encoder happened to return.
embedding_matrix = np.ascontiguousarray(embeddings, dtype="float32")
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

# Define paths
index_path = "/content/drive/MyDrive/guvi_rag/guvi_faiss.index"
chunks_path = "/content/drive/MyDrive/guvi_rag/chunks.txt"

# Save FAISS index
faiss.write_index(index, index_path)

# Save text chunks to file, exactly ONE chunk per line.
# The reload cells rebuild `chunks` with one list entry per line and use the
# FAISS row id as the list position. A chunk that still contains an internal
# newline would therefore occupy several lines and shift every later chunk out
# of alignment with its vector. Collapsing internal whitespace keeps row i of
# the index mapped to line i of the file.
with open(chunks_path, "w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(" ".join(chunk.split()) + "\n")

print("✅ FAISS index and chunks saved successfully.")

✅ FAISS index and chunks saved successfully.


#  Login to Hugging Face

In [None]:
!pip install -q transformers accelerate

In [None]:
!pip install -q huggingface_hub


In [None]:
from huggingface_hub import login
from getpass import getpass

# Prompt for the token interactively (input is not echoed) so it is never
# written into the notebook source, then authenticate with the Hugging Face Hub.
token = getpass('Enter your Hugging Face token: ')
login(token)


Enter your Hugging Face token: ··········


#  Load Mistral-7B Model for Text Generation

In [None]:
!pip install -q transformers accelerate

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Instruction-tuned Mistral checkpoint used as the answer generator.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE: this rebinds the global `model`, which an earlier cell used for the
# SentenceTransformer that produced the chunk embeddings.
# With CUDA available the weights are loaded in float16 and device placement is
# delegated to device_map="auto"; otherwise float32 with no device map.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto" if torch.cuda.is_available() else None,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

# Do NOT pass device=... here: the already-loaded model object is handed to the
# pipeline as-is (presumably because device_map has already placed it - confirm).
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

print("✅ Mistral loaded successfully!")


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


✅ Mistral loaded successfully!


#  Load NLLB Model for Translation

In [None]:
!pip install -q langdetect sentence-transformers faiss-cpu

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m35.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load FAISS index
index = faiss.read_index("/content/drive/MyDrive/guvi_rag/guvi_faiss.index")

# Load chunks: one chunk per line, where line i must correspond to FAISS row i.
with open("/content/drive/MyDrive/guvi_rag/chunks.txt", "r", encoding="utf-8") as f:
    chunks = [line.strip() for line in f]

# Retrieval looks chunks up by FAISS row id, so both stores must have the same
# number of entries. A mismatch (e.g. chunks that were saved with embedded
# newlines) silently returns unrelated context, so surface it here.
if index.ntotal != len(chunks):
    print(
        f"⚠️ FAISS index holds {index.ntotal} vectors but chunks.txt has {len(chunks)} lines; "
        "retrieved context may not match the query. Rebuild the index and chunk file together."
    )

# Load embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch

# Distilled NLLB-200 checkpoint used for translation to and from English.
nllb_model_name = "facebook/nllb-200-distilled-600M"
# NOTE(review): `device` is not referenced by any other cell visible in this
# notebook; model placement is handled by device_map below.
device = 0 if torch.cuda.is_available() else -1

# Load tokenizer and model
# With CUDA available: float16 weights and device_map="auto"; otherwise float32
# with no device map.
nllb_tokenizer = AutoTokenizer.from_pretrained(nllb_model_name)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    nllb_model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None
)

# ✅ Language code map: langdetect code -> NLLB-200 language code.
# Any detected code that is missing here is treated as English by the chatbot's
# lookup (`lang_code_map.get(lang, "eng_Latn")`).
lang_code_map = {
    # Indian
    "ta": "tam_Taml", "hi": "hin_Deva", "te": "tel_Telu",
    "ml": "mal_Mlym", "kn": "kan_Knda", "bn": "ben_Beng", "mr": "mar_Deva",
    # Foreign
    "fr": "fra_Latn", "de": "deu_Latn", "ko": "kor_Hang",
    "zh": "zho_Hans", "zh-cn": "zho_Hans", "ja": "jpn_Jpan",
    # langdetect reports Traditional Chinese as "zh-tw"; without this entry it
    # would be mislabelled as English.
    "zh-tw": "zho_Hant",
    # English
    "en": "eng_Latn"
}

# Translation pipeline (use this ONE pipeline for all languages).
# The source/target languages are not fixed here; callers pass src_lang and
# tgt_lang (NLLB codes from lang_code_map) on every call.
translator = pipeline("translation", model=nllb_model, tokenizer=nllb_tokenizer)



#  Define RAG Chatbot Function

In [None]:
def build_rag_prompt(context_chunks, question):
    """Assemble the prompt that is sent to the text generator.

    The prompt consists of four sections separated by blank lines: a fixed
    instruction, the retrieved context (one chunk per line), the user's
    question, and a trailing ``### Answer:`` marker after which the model is
    expected to write its reply.

    Args:
        context_chunks: Iterable of context strings retrieved for the question.
        question: The user's question (already in English).

    Returns:
        The complete prompt as a single string.
    """
    sections = [
        "You are a multilingual chatbot for GUVI. The user may ask questions in any language. Answer in English, and only based on the context given.",
        "### Context:\n" + "\n".join(context_chunks),
        f"### Question:\n{question}",
        "### Answer:",
    ]
    # A blank line between sections reproduces the original template layout.
    return "\n\n".join(sections)


In [None]:
from langdetect import detect

def guvi_chatbot(user_input, top_k=3, back_translate=True):
    """Answer a question about GUVI with retrieval-augmented generation.

    Steps: detect the input language, translate it to English when it is a
    supported non-English language, retrieve the ``top_k`` nearest chunks from
    the FAISS index, build the prompt, generate an answer, and optionally
    translate the answer back into the user's language.

    Relies on notebook globals created in earlier cells: ``detect``,
    ``lang_code_map``, ``translator``, ``embedder``, ``index``, ``chunks``,
    ``generator``, ``build_rag_prompt`` and ``np``.

    Args:
        user_input: The user's question, in any language.
        top_k: Number of nearest chunks to retrieve as context.
        back_translate: When True, a non-English question gets its answer
            translated back into the detected language.

    Returns:
        The answer text. It is English when the input was English (or an
        unsupported/undetectable language), when ``back_translate`` is False,
        or when translating back failed.
    """
    # langdetect raises when the text has no usable features (empty string,
    # digits/punctuation only). Treat such input as English instead of crashing.
    # NOTE: langdetect is not deterministic on short inputs unless seeded.
    from langdetect import LangDetectException
    try:
        lang = detect(user_input)
    except LangDetectException:
        lang = "en"

    src_lang = lang_code_map.get(lang, "eng_Latn")  # fallback to English if unknown
    # Decide on the NLLB code, not the raw langdetect code: an unmapped language
    # falls back to eng_Latn, and translating eng_Latn -> eng_Latn is pointless.
    needs_translation = src_lang != "eng_Latn"

    # Step 1: Translate to English if needed
    if needs_translation:
        print(f"🌐 Detected language: {lang} → Translating to English")
        translated = translator(user_input, src_lang=src_lang, tgt_lang="eng_Latn")[0]['translation_text']
    else:
        if lang != "en":
            print(f"⚠️ Unsupported language '{lang}' → treating input as English")
        translated = user_input

    # Step 2: Retrieve relevant chunks
    q_embed = embedder.encode([translated])
    _, neighbour_ids = index.search(np.array(q_embed), top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors exist, and a
    # stale chunk file can be shorter than the index; skip ids that do not map
    # to a loaded chunk instead of wrapping around or raising IndexError.
    context_chunks = [chunks[i] for i in neighbour_ids[0] if 0 <= i < len(chunks)]

    # Step 3: Build prompt
    prompt = build_rag_prompt(context_chunks, translated)

    # Step 4: Generate answer using Mistral
    result = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
    # The pipeline returns prompt + completion; keep only the text after the
    # final answer marker.
    generated = result[0]["generated_text"].split("### Answer:")[-1].strip()

    # Step 5: Translate back to user language if needed
    if needs_translation and back_translate:
        try:
            final_answer = translator(generated, src_lang="eng_Latn", tgt_lang=src_lang)[0]['translation_text']
            print("\n🤖 Answer (translated back to your language):\n")
            return final_answer
        except Exception as e:
            # Best effort: fall back to the English answer rather than failing.
            print(f"⚠️ Translation back failed: {e}")
            return generated

    print("\n🤖 Answer:\n")
    return generated


#  Reload Embedding Model (all-MiniLM-L6-v2)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the same embedding model used for FAISS index.
# NOTE: `embedder` was already created from this same model name in an earlier
# cell; running this cell only rebinds the name to a fresh instance.
embedder = SentenceTransformer("all-MiniLM-L6-v2")


# Load FAISS Index and Text Chunks from Drive

In [None]:
# Re-create the retrieval globals used by guvi_chatbot: embedder, index, chunks.
# NOTE: this repeats the loading already done in earlier cells (same model name,
# same index and chunk paths).
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("all-MiniLM-L6-v2")

import faiss
index = faiss.read_index("/content/drive/MyDrive/guvi_rag/guvi_faiss.index")

# One chunk per line; the list position is used as the FAISS row id.
with open("/content/drive/MyDrive/guvi_rag/chunks.txt", "r", encoding="utf-8") as f:
    chunks = [line.strip() for line in f.readlines()]


#  Test Chatbot

#TAMIL

In [None]:
guvi_chatbot("GUVIல வேலை வாய்ப்பு இருக்கா?")   #Tamil

🌐 Detected language: ta → Translating to English


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer (translated back to your language):



'நான் GUVI க்கான பலமொழி சாட்போட், ஆனால் நான் உங்களுக்கு நேரடியாக ஒரு வேலையை வழங்க முடியாது. இருப்பினும், GUVI மாணவர்களுக்கு பணியில் வெற்றிகரமாக மாறுவதற்கு உதவும் வாழ்க்கைத் தொடக்க உருவாக்கம், போலி நேர்காணல்கள் மற்றும் வேலை வாய்ப்பு உதவி உள்ளிட்ட தொழில் சேவைகளை வழங்குகிறது என்று நான் உங்களுக்குச் சொல்ல முடியும். கூடுதலாக, GUVI ஆட்சேர்ப்பு இயக்கங்களுக்கான பல்வேறு நிறுவனங்களுடன் கூட்டாளர்களாக உள்ளது, எனவே வேலை வாய்ப்புகளுக்கான GUVI தொழில் நுட்ப போர்டலைக் கண்காணிப்பது நல்லது.'

#HINDI

In [None]:
guvi_chatbot("GUVI में कौन से कोर्स हैं?")   # Hindi

🌐 Detected language: hi → Translating to English


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer (translated back to your language):



'GUVI विभिन्न क्षेत्रों में व्यापक पाठ्यक्रम प्रदान करता है जिसमें प्रौद्योगिकी, डेटा विज्ञान, व्यवसाय, रचनात्मक कला और स्वास्थ्य सेवा शामिल हैं। कृपया हमारी वेबसाइट पर जाएं या नवीनतम पाठ्यक्रम प्रस्तावों और उपलब्धता के लिए हमारी सहायता टीम से संपर्क करें।'

#FRENCH

In [None]:
guvi_chatbot("Quels cours GUVI propose-t-il ?")     # French

🌐 Detected language: fr → Translating to English


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer (translated back to your language):



'GUVI propose des cours complets dans les domaines de la technologie, y compris des modules de microapprentissage pour une compréhension et une rétention faciles.'

#KOREAN

In [None]:
guvi_chatbot("GUVI는 정부에서 인정하는 플랫폼인가요?")          # Korean

🌐 Detected language: ko → Translating to English


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer (translated back to your language):



'GUVI는 채봇과 인공지능 기반의 의심 해소 시스템을 제공하는 교육 기술 회사입니다. 그것은 어떤 정부 기관과 직접적으로 연관되어 있지 않습니다. 그러나 정부에 의해 운영되는 플랫폼을 포함하여 다양한 교육 플랫폼과 통합하여 그들의 제공을 향상시킬 수 있습니다.'

#ENGLISH

In [None]:
guvi_chatbot("What is GUVI?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer:



'GUVI is an edtech startup based in India that offers accessible, high-quality coding education through both English and vernacular languages. Its mission is to bridge the digital skills gap and empower learners with job-oriented technology courses.'

In [None]:
guvi_chatbot("In which languages are GUVI courses offered?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer:



'GUVI courses are offered in multiple Indian languages including Tamil, Telugu, Kannada, and Hindi, apart from English.'

In [None]:
guvi_chatbot("Does GUVI offer live or recorded classes?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer:



'GUVI offers both live and recorded classes to cater to the varying schedules and learning preferences of its students. Live classes provide an interactive learning experience, while recorded classes offer the flexibility to learn at your own pace.'

In [None]:
guvi_chatbot("How long does Zen Data Science Course take to complete?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer:



"The duration of the Zen Data Science Course may vary depending on the individual's learning pace. GUVI provides flexible learning options, including self-paced courses and instructor-led workshops, allowing students to learn at their own pace. For more specific information about the duration of this course, I would recommend checking the course details on the GUVI website or contacting the GUVI support team directly."

In [None]:
guvi_chatbot("Can I switch my Zen Class course after enrollment?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer:



"I'd be happy to help answer your question about changing a course in your Zen Class enrollment. Generally, switching courses after enrollment is dependent on the specific policies of each Zen Class program. It's always best to reach out to the GUVI support team or the instructor of your current course for the most accurate information regarding your situation. They can provide you with the details about the process, any associated fees, and the deadlines for making a change. Please keep in mind that availability in other courses and sections may also be a factor. I hope this information is helpful, and I wish you the best of luck with your learning journey at GUVI!"

In [None]:
guvi_chatbot("Do I need to know coding before joining GUVI?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer:



"No, you don't need to have prior coding knowledge to join GUVI. Their programs are designed to be accessible to beginners. They offer bite-sized microlearning modules in various coding languages that will help you build a strong foundation in coding."

In [None]:
guvi_chatbot("How many days of attendance are required?")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer:



'The number of attendance days required can vary depending on the specific program or course you are enrolled in at GUVI. I would recommend checking the attendance policy in your course curriculum or contacting the GUVI support team for the most accurate information.'

#  Load Full Chunks Text from Drive

In [None]:
# Read the saved chunk file back as one single string.
# NOTE(review): `full_text` is not referenced by any later cell visible in this
# notebook - presumably meant as input to split_text_into_chunks; confirm.
with open("/content/drive/MyDrive/guvi_rag/chunks.txt", "r", encoding="utf-8") as f:
    full_text = f.read()


#  Sentence Tokenization & Custom Chunking Function

In [None]:
# Step 1: Download tokenizer data
import nltk
nltk.download('punkt')

# Step 2: Chunking logic
from nltk.tokenize import sent_tokenize

def split_text_into_chunks(text, max_len=512, overlap=64):
    """Group the sentences of ``text`` into overlapping chunks.

    Sentences are packed greedily into a chunk until adding the next sentence
    would push the chunk past ``max_len`` characters. Each new chunk starts
    with the trailing sentences of the previous chunk, as many as fit within
    ``overlap`` characters while still leaving room for the next sentence.

    Lengths are the sum of the sentence lengths; the single spaces used to join
    sentences are not counted. A single sentence longer than ``max_len`` is
    emitted as a chunk of its own rather than being cut.

    Args:
        text: The text to split.
        max_len: Target maximum chunk length in characters.
        overlap: Maximum number of characters carried over from the end of one
            chunk to the start of the next. ``0`` disables overlap.

    Returns:
        A list of chunk strings (empty list for text with no sentences).
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    total_len = 0

    for sentence in sentences:
        # Only flush a non-empty chunk: an over-long first sentence must not
        # produce an empty leading chunk.
        if current_chunk and total_len + len(sentence) > max_len:
            chunks.append(" ".join(current_chunk))

            # Carry trailing sentences into the next chunk. The budget is
            # capped so that carried text + the incoming sentence still fits in
            # max_len, which also guarantees the carried part is strictly
            # smaller than the chunk just flushed (chunks cannot keep growing).
            budget = min(overlap, max_len - len(sentence))
            carried = []
            carried_len = 0
            for previous in reversed(current_chunk):
                if carried_len + len(previous) > budget:
                    break
                carried.append(previous)
                carried_len += len(previous)
            carried.reverse()

            current_chunk = carried
            total_len = carried_len

        current_chunk.append(sentence)
        total_len += len(sentence)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Generate Embeddings & Create FAISS Index

In [None]:
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): `chunks` here is whatever an earlier cell left in the kernel
# (the one-line-per-entry list reloaded from Drive). Neither `full_text` nor
# split_text_into_chunks defined above is used - confirm whether
# `chunks = split_text_into_chunks(full_text)` was intended before encoding.
embeddings = embedder.encode(chunks)


In [None]:
import faiss
import numpy as np
import os

# Rebuild the exact-L2 FAISS index from the current embeddings.
# FAISS expects a contiguous float32 matrix, so convert explicitly.
embedding_matrix = np.ascontiguousarray(embeddings, dtype="float32")
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

# Save index + chunks
index_path = "/content/drive/MyDrive/guvi_rag/guvi_faiss.index"
chunks_path = "/content/drive/MyDrive/guvi_rag/chunks.txt"

faiss.write_index(index, index_path)
# Write exactly one chunk per line: the reload cells map FAISS row i to line i,
# so any newline inside a chunk would shift all following chunks. Collapsing
# internal whitespace keeps the file and the index aligned.
with open(chunks_path, "w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(" ".join(chunk.split()) + "\n")

print("✅ Updated FAISS index and chunks saved.")


✅ Updated FAISS index and chunks saved.


# Reload FAISS Index & Chunks from Drive

In [None]:
import faiss
import numpy as np

# Reload the index and chunk list that the previous cell just wrote, replacing
# the `index` and `chunks` globals that guvi_chatbot reads.
index = faiss.read_index("/content/drive/MyDrive/guvi_rag/guvi_faiss.index")
# One chunk per line; the list position is used as the FAISS row id.
with open("/content/drive/MyDrive/guvi_rag/chunks.txt", "r", encoding="utf-8") as f:
    chunks = [line.strip() for line in f.readlines()]


# Test RAG Chatbot (after rebuilding the index)

In [None]:
guvi_chatbot("Can I switch my Zen Class course after enrollment?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer:



'While many GUVI courses offer lifetime access, the specific policy on switching courses after enrollment may vary. For accurate information, please contact GUVI customer support. They will be able to provide you with the most current and detailed information regarding your specific situation.'

In [None]:
guvi_chatbot("GUVIல வேலை வாய்ப்பு இருக்கா?")

🌐 Detected language: ta → Translating to English


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer (translated back to your language):



'GUVI நேரடி வேலைவாய்ப்புகளை வழங்காது, ஆனால் மாணவர்களுக்கு வேலைவாய்ப்பு வாய்ப்புகளை கண்டறிய உதவும் தொழில் சேவைகள் மற்றும் உதவிகளை வழங்குகிறது.'

In [None]:
guvi_chatbot("Do I need to know coding before joining GUVI?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



🤖 Answer:



"No, you don't need to have prior coding knowledge to join GUVI. They offer beginner-friendly courses that cover the fundamentals of coding. Their webinars and live coding sessions provide hands-on learning opportunities for those new to coding."

# Hugging Face – Streamlit Deployment

The multilingual chatbot is deployed as a Streamlit app on Hugging Face Spaces. Kindly test it at the following link:

https://huggingface.co/spaces/gaja1995/GUVI_CHATBOT/tree/main