# 🎓 Enhanced Kalasalingam University Admissions Chatbot

This enhanced version uses comprehensive JSON data files for better information coverage and accuracy.

## Features:
- Comprehensive college information from JSON files
- Advanced RAG (Retrieval Augmented Generation) system
- Better document processing and chunking
- Improved conversational AI responses


In [None]:
# Install required packages
!pip install -q -U google-generativeai langchain langchain-community langchain-google-genai chromadb unstructured faiss-cpu
!pip install -q sentence-transformers transformers torch

In [None]:
# Import required libraries
import os
import json
import torch
from pathlib import Path
from typing import List, Dict, Any

# LangChain imports
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document

# Transformers imports
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

print("✅ All libraries imported successfully!")

In [None]:
# Create college data directory and JSON files
def create_college_data_files():
    """Ensure the ``college_data`` directory exists and return its path.

    Only the directory itself is created, relative to the current working
    directory. No JSON files are written here; they are expected to be
    uploaded or generated separately.

    Returns:
        Path: The relative path ``college_data``.
    """
    target_dir = Path("college_data")
    # exist_ok keeps re-running the notebook cell harmless.
    target_dir.mkdir(exist_ok=True)

    # Placeholder step: real JSON files are uploaded or generated elsewhere,
    # so only progress messages are emitted here.
    progress_messages = (
        "📁 Creating comprehensive college data files...",
        "✅ College data files created successfully!",
        f"📂 Data directory: {target_dir.absolute()}",
    )
    for message in progress_messages:
        print(message)

    return target_dir

# Ensure the data directory exists when this cell runs. The returned path is
# kept in `college_data_dir` so the loading cell can read JSON files from it.
college_data_dir = create_college_data_files()

In [None]:
# Enhanced JSON to Document converter
def json_to_documents(json_file_path: str) -> List[Document]:
    """Convert one JSON file into a single, readable LangChain Document.

    The JSON tree is rendered as indented, markdown-like text: dictionary
    keys become bold ``**Title Case:**`` labels, scalar list items become
    ``- item`` bullets, and nesting is shown with two spaces per level.

    Args:
        json_file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        A one-element list holding the Document. Its metadata carries
        ``source`` (the path as given), ``file_type`` (``"json"``) and
        ``category`` (the file name without its extension).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """

    def format_section(obj, level=0):
        """Render *obj* as a list of text lines indented for *level*."""
        indent = "  " * level
        lines = []

        if isinstance(obj, dict):
            for key, value in obj.items():
                formatted_key = key.replace('_', ' ').title()
                if isinstance(value, (dict, list)):
                    # Containers get a label line, then their content one
                    # level deeper.
                    lines.append(f"{indent}**{formatted_key}:**")
                    lines.extend(format_section(value, level + 1))
                else:
                    lines.append(f"{indent}**{formatted_key}:** {value}")
        elif isinstance(obj, list):
            for item in obj:
                if isinstance(item, (dict, list)):
                    # Nested containers inside a list stay at the list's own
                    # level; the enclosing key already indented them.
                    lines.extend(format_section(item, level))
                else:
                    lines.append(f"{indent}- {item}")
        else:
            # Only reachable for a top-level scalar document (e.g. a file
            # holding just a string or number); render it rather than
            # silently producing an empty body.
            lines.append(f"{indent}{obj}")

        return lines

    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The file stem doubles as a human-readable heading and as the category.
    file_name = Path(json_file_path).stem
    content_lines = [f"# {file_name.replace('_', ' ').title()} Information\n"]
    content_lines.extend(format_section(data))

    doc = Document(
        page_content="\n".join(content_lines),
        metadata={
            "source": json_file_path,
            "file_type": "json",
            "category": file_name
        }
    )

    return [doc]

print("✅ JSON to Document converter ready!")

In [None]:
# Load and process all JSON files
def load_all_college_data(data_directory: str) -> List[Document]:
    """Load every ``*.json`` file in a directory as LangChain Documents.

    If the directory holds no JSON files (or does not exist yet), a sample
    file is created first so the rest of the pipeline has data to work with.
    Files that fail to load are reported and skipped; they do not abort the
    whole run.

    Args:
        data_directory: Directory to scan. Anything accepted by
            ``pathlib.Path`` works (a string or a ``Path``).

    Returns:
        The Documents from all files that loaded successfully, in file-name
        order.
    """
    data_path = Path(data_directory)

    print("📚 Loading college data from JSON files...")

    # glob() yields entries in filesystem order, which varies between
    # machines; sorting keeps the document (and chunk) order reproducible.
    json_files = sorted(data_path.glob("*.json"))

    if not json_files:
        print("⚠️ No JSON files found. Creating sample data...")
        # The directory itself may be missing; create it so writing the
        # sample file cannot fail on a non-existent path.
        data_path.mkdir(parents=True, exist_ok=True)
        create_sample_data(data_path)
        json_files = sorted(data_path.glob("*.json"))

    documents = []
    for json_file in json_files:
        print(f"  📄 Processing: {json_file.name}")
        try:
            documents.extend(json_to_documents(str(json_file)))
        except Exception as e:
            # Deliberately broad: one unreadable or malformed file should be
            # reported and skipped, not stop the remaining files from loading.
            print(f"  ❌ Error processing {json_file.name}: {e}")

    print(f"✅ Loaded {len(documents)} documents from {len(json_files)} JSON files")
    return documents

def create_sample_data(data_path: Path):
    """Write a minimal ``basic_info.json`` sample file into *data_path*.

    Used as a fallback so the pipeline has something to index when no real
    data files are present. An existing ``basic_info.json`` is overwritten.

    Args:
        data_path: Directory to write into. It is created (including any
            missing parents) if it does not exist yet.

    Raises:
        OSError: If the directory or file cannot be created.
    """
    sample_data = {
        "university_info": {
            "name": "Kalasalingam Academy of Research and Education",
            "location": "Virudhunagar, Tamil Nadu",
            "established": 1984,
            "type": "Deemed University"
        },
        "contact": {
            "phone": "+91-4626-251777",
            "email": "admissions@klu.ac.in",
            "website": "https://www.klu.ac.in"
        }
    }

    data_path = Path(data_path)
    # Writing into a missing directory would raise FileNotFoundError.
    data_path.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 to match the encoding the JSON reader opens files with,
    # instead of relying on the platform default.
    with open(data_path / "basic_info.json", 'w', encoding='utf-8') as f:
        json.dump(sample_data, f, indent=2)

# Load every JSON file from the data directory created earlier. The resulting
# list (`all_documents`) is what the splitting/embedding cell consumes.
all_documents = load_all_college_data(college_data_dir)

In [None]:
# Enhanced text splitting and embedding
print("🔄 Processing documents for better retrieval...")

# Split documents into overlapping chunks. The separators are tried in order,
# so splits prefer paragraph and line breaks, then the "**" / "- " markers
# produced by the JSON formatter, before falling back to spaces/characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", "**", "- ", " ", ""]
)

docs = text_splitter.split_documents(all_documents)
print(f"📝 Split into {len(docs)} chunks for better processing")

# Fail early with an actionable message: building a vector index from zero
# chunks (e.g. every JSON file failed to load) otherwise surfaces as an
# obscure error deep inside the vector store.
if not docs:
    raise ValueError(
        "No document chunks were produced; check that the JSON files in the "
        "data directory exist and are valid."
    )

# Create embeddings (normalized vectors, computed on CPU)
print("🧠 Creating embeddings...")
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Create vector store
print("🗄️ Building vector database...")
vector_store = FAISS.from_documents(docs, embedding=embeddings)

print("✅ Knowledge base created successfully!")

In [None]:
# Load language model
print("🤖 Loading language model...")

# Hugging Face model id of the chat model used to generate answers; the same
# id is recorded later in the saved knowledge-base metadata.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): weights are requested in bfloat16 — confirm the runtime
# hardware handles that dtype acceptably.
# NOTE(review): device_map="auto" presumably relies on the `accelerate`
# package, which the install cell does not list explicitly — confirm it is
# available in the target environment.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository — confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

# Create pipeline
# Sampling is enabled (temperature 0.7), so answers are not deterministic.
# The EOS token id is reused as the padding id.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

# Wrap the transformers pipeline so it can be passed to the QA chain as `llm`.
llm = HuggingFacePipeline(pipeline=pipe)

print("✅ Language model loaded successfully!")

In [None]:
# Create enhanced prompt template
# The template has exactly two placeholders, {context} and {question}; they
# must match `input_variables` below. The text is sent to the model verbatim.
enhanced_prompt_template = """
You are a helpful and knowledgeable admissions assistant for Kalasalingam University. 
Use the provided context to answer questions accurately and helpfully.

Guidelines:
- Provide specific, accurate information based on the context
- Be conversational and friendly
- If you don't have specific information, say so politely
- Include relevant details like fees, dates, contact information when applicable
- For complex queries, break down the answer into clear points

Context: {context}

Question: {question}

Helpful Answer:"""

ENHANCED_PROMPT = PromptTemplate(
    template=enhanced_prompt_template, 
    input_variables=["context", "question"]
)

# Create QA chain
# The retriever returns the 5 most similar chunks from the vector store.
# return_source_documents=True makes the chain output include the retrieved
# chunks, which format_response uses to list information sources.
# NOTE(review): the "stuff" chain type presumably places all retrieved chunks
# into {context} in a single prompt — confirm the combined prompt fits the
# model's context window.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5}
    ),
    return_source_documents=True,
    chain_type_kwargs={"prompt": ENHANCED_PROMPT}
)

print("✅ Enhanced chatbot is ready!")

In [None]:
# Enhanced chat interface with better formatting
def format_response(result):
    """Print a chatbot answer and its information sources in a framed block.

    Args:
        result: Mapping with the answer text under ``'result'`` and,
            optionally, the retrieved documents under ``'source_documents'``.
            Each document must expose a ``metadata`` dict; its ``'category'``
            entry is shown as the source name.

    Returns:
        None. The formatted response is written to standard output.

    Raises:
        KeyError: If ``result`` has no ``'result'`` entry.
    """
    answer = result['result'].strip()
    # Treat an explicit None the same as a missing key.
    sources = result.get('source_documents') or []

    banner = "=" * 60
    print("\n" + banner)
    print("🤖 KARE Admissions Assistant")
    print(banner)
    print(answer)

    if sources:
        # Several retrieved chunks usually come from the same file; list each
        # category once (first-seen order) instead of repeating it.
        # A missing or None category falls back to 'Unknown'.
        categories = list(dict.fromkeys(
            str(doc.metadata.get('category') or 'Unknown') for doc in sources
        ))
        print("\n📚 Information sources:")
        for i, category in enumerate(categories[:3], 1):
            print(f"  {i}. {category.replace('_', ' ').title()}")

    print(banner)

# Test the chatbot
# Smoke test: run a few representative questions through the chain and print
# the formatted answers, so problems show up before the interactive session.
print("🎯 Testing the enhanced chatbot...")

test_questions = [
    "What is the fee structure for B.Tech?",
    "Tell me about hostel facilities",
    "What are the placement statistics?"
]

for question in test_questions:
    print(f"\n❓ Test Question: {question}")
    try:
        # NOTE(review): newer LangChain releases may prefer
        # qa_chain.invoke(...) over calling the chain directly — confirm
        # against the installed version.
        result = qa_chain({"query": question})
        format_response(result)
    except Exception as e:
        # A failing question is reported and the loop moves on to the next.
        print(f"❌ Error: {e}")
    print("\n" + "-"*40)

In [None]:
# Interactive chat session
# Reads questions until the user types an exit keyword, interrupts the cell,
# or input becomes unavailable. Errors while answering are reported and the
# session continues.
print("\n🎓 Welcome to Kalasalingam University Admissions Chatbot!")
print("Ask me anything about admissions, fees, courses, facilities, placements, etc.")
print("Type 'exit' to end the conversation.\n")

while True:
    # Reading input is handled separately from answering: if input() itself
    # fails, retrying can never succeed, so the loop must end instead of
    # printing the same error forever.
    try:
        query = input("\n🤔 Your Question: ").strip()
    except (KeyboardInterrupt, EOFError):
        print("\n\n👋 Chat session ended. Thank you!")
        break
    except Exception as e:
        print(f"\n❌ Could not read input: {e}")
        break

    if not query:
        continue

    # Compared after stripping so "exit " or " quit" also end the session.
    if query.lower() in ('exit', 'quit', 'bye'):
        print("\n👋 Thank you for using KARE Admissions Chatbot! Good luck with your admission!")
        break

    try:
        # Get response
        result = qa_chain({"query": query})
        format_response(result)
    except KeyboardInterrupt:
        print("\n\n👋 Chat session ended. Thank you!")
        break
    except Exception as e:
        print(f"\n❌ Sorry, I encountered an error: {e}")
        print("Please try rephrasing your question.")

In [None]:
# Save the enhanced knowledge base
# Persists the vector index plus a small metadata summary to Google Drive.
# Any failure is reported as a warning rather than raised.
print("💾 Saving enhanced knowledge base...")

# Target folder inside the mounted Google Drive.
save_path = "/content/drive/MyDrive/KARE_Enhanced_VectorStore"

try:
    # Mount Google Drive if not already mounted
    # The import lives inside the try block, so a missing google.colab module
    # (i.e. running outside Colab) is reported by the handler below.
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Save the vector store
    vector_store.save_local(save_path)
    print(f"✅ Enhanced knowledge base saved to {save_path}")
    
    # Save metadata about the knowledge base
    # Records document/chunk counts, the model ids used, and the category of
    # each loaded source document.
    metadata = {
        "total_documents": len(all_documents),
        "total_chunks": len(docs),
        "embedding_model": model_name,
        "llm_model": model_id,
        "data_sources": [doc.metadata.get('category', 'unknown') for doc in all_documents]
    }
    
    # NOTE(review): assumes the save_path directory already exists at this
    # point (presumably created by save_local above) — confirm.
    with open(f"{save_path}/metadata.json", 'w') as f:
        json.dump(metadata, f, indent=2)
    
    print("📊 Knowledge base metadata saved")
    
except Exception as e:
    # Best-effort: saving is optional, so any failure only prints a warning.
    print(f"⚠️ Could not save to Google Drive: {e}")
    print("You can save locally or manually upload the files.")