# Prototype: AI-Powered Study Assistant using Gemma 3n

In [None]:
from ollama import Client
import re
import json
import unicodedata

# Ollama client built with default constructor arguments.
# NOTE(review): presumably this targets a locally running Ollama server - confirm.
client = Client()
# Model tag passed as `model=` to every client.generate() call in this notebook.
model = "gemma3n:e2b"

# Sections whose text exceeds this many characters are handed to split_long_section.
MAX_SECTION_LENGTH = 1500  # max chars per chunk
# Number of trailing characters of one chunk repeated at the start of the next.
OVERLAP_LENGTH = 200       # overlap chars between chunks

### Preprocessing Utilities

In [None]:
def normalize(text):
    """Return `text` in Unicode NFKC form.

    NFKC folds compatibility characters (ligatures, full-width letters,
    non-breaking spaces, ...) into their canonical equivalents.
    """
    canonical = unicodedata.normalize("NFKC", text)
    return canonical

def is_heading(line):
    """Guess whether one line of notes is a section heading.

    The line is stripped first. It is rejected outright when it is empty,
    starts with a bullet marker ("-", "*", "•") or is longer than 80
    characters. Otherwise it counts as a heading when any of these holds:

    * it ends with a colon or a question mark;
    * it is entirely upper case;
    * it starts with a capital ASCII letter and contains only word
      characters, whitespace, hyphens and parentheses.

    Returns:
        True if the line looks like a heading, False otherwise.
    """
    candidate = line.strip()

    # Blank lines, bullet items and paragraph-length lines are never headings.
    if candidate == "" or candidate.startswith(("-", "*", "•")) or len(candidate) > 80:
        return False

    ends_like_heading = candidate.endswith((":", "?"))
    is_all_caps = candidate.isupper()
    # Capitalised first letter and no sentence punctuation anywhere in the line.
    is_title_like = re.match(r"^[A-Z][\w\s\-()]*$", candidate) is not None

    return ends_like_heading or is_all_caps or is_title_like

def split_text_into_sections(text):
    """Group the lines of `text` into titled sections.

    Blank lines are ignored. Every line that `is_heading` accepts starts a
    new section; all other lines are appended (stripped) to the body of the
    current section. Body lines that appear before the first heading are
    collected under the title "Untitled". Sections without any body text
    are left out of the result.

    Returns:
        A list of ``{"title": str, "text": str}`` dicts in document order,
        where "text" is the body lines joined with newlines.
    """
    collected = []
    heading = None
    body_lines = []

    def flush():
        # A heading with no body lines has nothing to quiz on, so it is
        # dropped. Body lines are already stripped and non-empty, so the
        # joined text needs no further trimming.
        if body_lines:
            collected.append({
                "title": heading or "Untitled",
                "text": "\n".join(body_lines),
            })

    for raw_line in text.splitlines():
        content = raw_line.strip()
        if not content:
            continue

        if not is_heading(content):
            body_lines.append(content)
            continue

        # A new heading closes whatever section was being accumulated.
        flush()
        heading = content
        body_lines = []

    flush()
    return collected

def split_long_section(section, max_length=MAX_SECTION_LENGTH, overlap=OVERLAP_LENGTH):
    """Split one section into overlapping chunks of at most `max_length` characters.

    Each chunk keeps the section's title. A chunk boundary is placed at the
    last newline inside the current window when that newline leaves a
    reasonably sized chunk; otherwise the text is cut hard at `max_length`.
    The next chunk starts `overlap` characters before the previous boundary
    so that some context is repeated.

    Args:
        section: Dict with "title" and "text" keys.
        max_length: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks. Values
            outside ``0 .. max_length - 1`` are clamped into that range so
            every iteration makes progress.

    Returns:
        A list of ``{"title": ..., "text": ...}`` dicts. Chunks that are
        empty after stripping whitespace are omitted.

    Raises:
        ValueError: If `max_length` is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive number of characters")
    overlap = max(0, min(overlap, max_length - 1))

    text = section["text"]
    title = section["title"]
    length = len(text)
    chunks = []

    def add_chunk(chunk_text):
        chunk_text = chunk_text.strip()
        if chunk_text:
            chunks.append({"title": title, "text": chunk_text})

    start = 0
    while start < length:
        end = start + max_length
        if end >= length:
            add_chunk(text[start:])
            break

        # Only accept a newline that lies past the overlap region and in the
        # second half of the window. A newline closer to `start` would make
        # the next chunk begin at (or barely after) the current one and
        # produce a long run of near-duplicate chunks.
        earliest_split = start + max(overlap + 1, max_length // 2)
        split_pos = text.rfind("\n", earliest_split, end)
        if split_pos == -1:
            split_pos = end  # no usable newline, split hard at max_length

        add_chunk(text[start:split_pos])

        # split_pos > start + overlap, so this always moves forward.
        start = split_pos - overlap

    return chunks


### Load and Prepare Text

In [None]:
# Load the raw notes and normalize Unicode before heading detection.
with open("test.txt", "r", encoding="utf-8") as f:
    context = normalize(f.read())

sections = split_text_into_sections(context)

# Break up any section that exceeds the chunk limit; shorter ones pass through.
final_sections = []
for sec in sections:
    if len(sec["text"]) > MAX_SECTION_LENGTH:
        final_sections.extend(split_long_section(sec))
    else:
        final_sections.append(sec)

# Persist and report the chunked result. Previously the un-split `sections`
# list was written, which left `final_sections` computed but unused.
with open("sections.json", "w", encoding="utf-8") as f:
    json.dump(final_sections, f, indent=2, ensure_ascii=False)

print(f"Split into {len(final_sections)} sections.")

### LLM Utilities

In [None]:
def ask_gemma_raw(prompt):
    """Send `prompt` to the configured model and return the reply text.

    Calls ``client.generate`` with the module-level `model` and returns the
    value stored under the "response" key of the result.
    """
    reply = client.generate(model=model, prompt=prompt)
    return reply["response"]

# Curly quotes that models sometimes emit in place of ASCII quote characters.
_SMART_QUOTES = str.maketrans({
    "\u201c": '"',
    "\u201d": '"',
    "\u2018": "'",
    "\u2019": "'",
})
# Payload of the first Markdown code fence; tolerates a missing closing fence.
_CODE_FENCE_RE = re.compile(r"```(?:json)?\s*([\s\S]*?)\s*(?:```|\Z)", re.IGNORECASE)
# A comma directly before a closing bracket or brace (invalid in JSON).
_TRAILING_COMMA_RE = re.compile(r",\s*([\]}])")


def _json_candidates(raw_response):
    """Yield the model output with progressively more aggressive repairs applied."""
    fenced = _CODE_FENCE_RE.search(raw_response)
    cleaned = fenced.group(1) if fenced else raw_response.strip()
    # 1. Untouched payload: valid JSON keeps all of its Unicode content.
    yield cleaned

    # 2. Convert curly quotes BEFORE any ASCII folding (folding would delete
    #    them), fold compatibility characters, and drop trailing commas.
    repaired = unicodedata.normalize("NFKC", cleaned.translate(_SMART_QUOTES))
    repaired = _TRAILING_COMMA_RE.sub(r"\1", repaired)
    yield repaired

    # 3. Last resort: strip everything that has no ASCII equivalent (lossy).
    yield unicodedata.normalize("NFKD", repaired).encode("ascii", "ignore").decode("ascii")


def parse_sections_json(raw_response):
    """Parse JSON out of a raw model response, repairing common formatting slips.

    The payload of a Markdown code fence is used when one is present,
    otherwise the whole stripped response. Parsing uses ``strict=False`` so
    that literal newlines inside any string value are accepted. If the
    payload does not parse as-is, repaired variants are tried in turn
    (smart quotes converted, trailing commas removed, and finally a lossy
    ASCII fold).

    Args:
        raw_response: Text returned by the model.

    Returns:
        The decoded JSON value, or an empty list if `raw_response` is not a
        string or no variant could be parsed. The failure reason and the raw
        response are printed in that case.
    """
    last_error = None
    if isinstance(raw_response, str):
        for candidate in _json_candidates(raw_response):
            try:
                return json.loads(candidate, strict=False)
            except ValueError as exc:  # json.JSONDecodeError is a ValueError
                last_error = exc
    else:
        last_error = TypeError(f"expected str, got {type(raw_response).__name__}")

    print(f"Failed to parse cleaned JSON: {last_error}")
    print("Raw response was:\n", raw_response)
    return []

### Quiz Generation

In [None]:
# One entry per processed section: {"section", "summary", "questions"}.
questions = []

for section in sections:
    prompt = f"""
You are a helpful study assistant. Given the following notes section, please:

1. Summarize the key points briefly.
2. Generate 3-5 quiz questions that test understanding of these key points.

Return the output as a JSON object with the following format:

{{
  "summary": "brief summary text here",
  "questions": [
    "Question 1?",
    "Question 2?",
    ...
  ]
}}

Notes:
{section['text']}

Section title: {section['title']}
""".strip()

    response_text = ask_gemma_raw(prompt).strip()
    print(response_text)
    parsed = parse_sections_json(response_text)

    # parse_sections_json returns [] when parsing fails, and the model may
    # return a JSON value that is not an object; fall back to an empty dict
    # so the .get() calls below cannot raise AttributeError.
    if not isinstance(parsed, dict):
        parsed = {}

    # Guard against a bare string (or other non-list) under "questions",
    # which would otherwise be flattened character by character below.
    section_questions = parsed.get("questions", [])
    if isinstance(section_questions, str):
        section_questions = [section_questions]
    elif not isinstance(section_questions, list):
        section_questions = []

    questions.append({
        "section": section["title"],
        "summary": parsed.get("summary", ""),
        "questions": section_questions,
    })
    # NOTE(review): this stops after the first section, so only one section
    # is ever quizzed - confirm whether that is intended for the prototype.
    break

# flatten questions for quizzing
flat_questions = [
    {"section": entry["section"], "question": question}
    for entry in questions
    for question in entry["questions"]
]

### Quiz Loop

In [None]:
# Show every question up front, then collect and grade the answers one by one.
print("\nQuiz Time! Please answer the following:\n")

for number, item in enumerate(flat_questions, start=1):
    print(f"Q{number} ({item['section']}): {item['question']}")

print("\nNow let's evaluate your answers!\n")

# Text of the first section found for each title, used as grading context.
text_by_title = {}
for candidate in sections:
    text_by_title.setdefault(candidate["title"], candidate["text"])

for number, item in enumerate(flat_questions, start=1):
    user_answer = input(f"Answer for Q{number}: ")

    # Fall back to a placeholder when no section carries this title.
    section_text = text_by_title.get(item["section"], 'Unavailable')

    grading_prompt = f"""
You are an AI tutor. Evaluate the student's answer to the following question using a 0 to 5 scale.

Score Criteria:
- 0: Completely incorrect or off-topic
- 1 to 2: Partially correct, major gaps or confusion
- 3 to 4: Mostly correct, minor omissions or errors
- 5: Fully correct, clear, and complete

Respond in this JSON format only:

{{
  "score": number between 0 and 5,
  "feedback": "A short, friendly explanation that sounds like you're speaking directly to the student. Encourage them or gently correct misunderstandings."
}}

Section title: {item['section']}
Section content:
{section_text}

Question: {item['question']}
Student's answer: {user_answer}
""".strip()

    # A failed grading call is reported and the quiz moves on to the next question.
    try:
        reply = client.generate(model=model, prompt=grading_prompt)
        print(f"\nGemma's Feedback for Q{number}:\n{reply['response']}\n")
    except Exception as e:
        print(f"Error grading Q{number}: {e}")
