<a href="https://colab.research.google.com/github/MuammerEren1/AcademyGen/blob/main/AcademyGen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# (run this first)

# Install the packages used by this notebook (IPython shell commands).
!pip install requests gradio pdfplumber nltk transformers torch
!pip install openai
# Pin openai to 0.28: the app code calls openai.ChatCompletion.create, which is
# the legacy (pre-1.0) interface. This pinned install replaces whatever version
# the unpinned command above pulled in.
!pip install openai==0.28
!pip install gradio

In [None]:
# (run this second)
# by Muammer Eren
# Thank you for reviewing :)
# Note: Don't forget to run the cells in order to open the app

import gradio as gr
import pdfplumber
import openai
import os
import re
import nltk
from nltk.tokenize import sent_tokenize

# OpenAI API key for GPT usage.
# The OPENAI_API_KEY environment variable is used when it is set, so the secret
# does not have to be saved inside the notebook. Otherwise, replace the
# "API-KEY" placeholder below with your own API key.
openai.api_key = os.environ.get("OPENAI_API_KEY", "API-KEY")

# First, the text is extracted from the PDF file and cleaned as thoroughly as possible.
# This function cleans up text by collapsing extra whitespace and removing unwanted characters
def clean_text(text):
    """Normalize raw text taken from a PDF page.

    Every run of whitespace is collapsed into a single space, then every
    character that is not a word character, whitespace, or one of
    ``. , ! ? ; : -`` is dropped, and finally leading and trailing
    whitespace is stripped.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    allowed_only = re.sub(r'[^\w\s.,!?;:-]', '', single_spaced)
    return allowed_only.strip()

# This function breaks text into sentences
def simple_sentence_tokenize(text):
    """Split text into sentences using a lightweight regex heuristic.

    A sentence boundary is whitespace that follows ``.``, ``!`` or ``?`` and
    is immediately followed by an ASCII uppercase letter. Each piece is
    stripped, and pieces that end up empty are discarded.
    """
    pieces = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    stripped_pieces = (piece.strip() for piece in pieces)
    return [piece for piece in stripped_pieces if piece]

# Extracts text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Extract the cleaned text of every page of a PDF as a single string.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts; the app passes the
            value of its Gradio file input.

    Returns:
        The cleaned text of all pages that yielded text, joined with single
        spaces. An empty string when no page has extractable text.
    """
    text_chunks = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Extract once and reuse the result for both the emptiness check
            # and the cleaning step, instead of extracting the page twice.
            page_text = page.extract_text()
            if page_text:  # skip pages without extractable text
                text_chunks.append(clean_text(page_text))
    return " ".join(text_chunks)

# Divides the text into smaller parts for better processing
def chunk_text(text, max_chunk_size=10000):
    """Split text into chunks of whole sentences, each within a size limit.

    Sentences come from ``simple_sentence_tokenize`` and are packed greedily
    in order. The size of a chunk is measured on the joined string, so the
    single spaces inserted between sentences count towards the limit.

    Args:
        text: The text to split.
        max_chunk_size: Maximum chunk length in characters.

    Returns:
        A list of chunk strings. A single sentence longer than
        ``max_chunk_size`` is never split, so it forms an oversized chunk on
        its own. Empty input yields an empty list.
    """
    chunks = []  # finished chunks
    current_chunk = []  # sentences of the chunk being built
    current_length = 0  # len(" ".join(current_chunk))

    for sentence in simple_sentence_tokenize(text):
        # Joining adds one space before every sentence except the first.
        added_length = len(sentence) + (1 if current_chunk else 0)
        if current_chunk and current_length + added_length > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]  # start a new chunk
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length += added_length

    if current_chunk:  # flush the leftover sentences
        chunks.append(" ".join(current_chunk))

    return chunks

# Adjust text to match the target word count
def truncate_to_word_limit(text, target_words):
    """Bring text close to a target word count.

    Text shorter than 90% of ``target_words`` is extended once by asking the
    OpenAI chat model for more content. This is best effort: if the request
    fails, the error is printed and the text is left as it is. Text longer
    than ``target_words`` is then cut back at sentence boundaries.

    Args:
        text: The text to adjust.
        target_words: Desired number of whitespace-separated words.

    Returns:
        The adjusted text. Truncated output has its whitespace collapsed to
        single spaces; text that needs no truncation is returned unchanged
        (apart from any appended expansion).
    """
    words = text.split()
    current_words = len(words)

    # Too short: ask the model for roughly the missing number of words.
    if current_words < target_words * 0.9:
        try:
            additional_words_needed = target_words - current_words
            expansion_prompt = f"""The current content is too short. Please add approximately {additional_words_needed} more words.
            Previous content: {text}"""

            response = openai.ChatCompletion.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are an expert at expanding educational content naturally."},
                    {"role": "user", "content": expansion_prompt}
                ],
                max_tokens=4000,
                temperature=0.7
            )

            additional_text = response['choices'][0]['message']['content'].strip()
            text += "\n\n" + additional_text
            words = text.split()
        except Exception as e:
            # Expansion is optional; keep the text we already have.
            print(f"Error expanding content: {str(e)}")

    if len(words) <= target_words:
        return text

    # Too long: keep whole sentences while they fit. The window reaches 50
    # words past the limit so the sentence straddling it is seen in full.
    sentences = simple_sentence_tokenize(" ".join(words[:target_words + 50]))
    kept_sentences = []
    word_count = 0

    for sentence in sentences:
        sentence_words = len(sentence.split())
        if word_count + sentence_words > target_words:
            break
        kept_sentences.append(sentence)
        word_count += sentence_words

    if not kept_sentences:
        # The very first sentence is already over the limit. Cut on a word
        # boundary instead of returning an empty string.
        return " ".join(words[:max(target_words, 0)])

    return " ".join(kept_sentences)

# Creates a section of the educational material based on the text
def generate_chat_section(text, section_type, word_count_option):
    """Generate one section of the lecture notes with the OpenAI chat model.

    Args:
        text: Document context; only its first 30000 characters are sent.
        section_type: One of 'introduction', 'detailed', 'examples',
            'summary'.
        word_count_option: "30 minutes" or "60 minutes".

    Returns:
        The generated section after ``truncate_to_word_limit`` has adjusted
        it to the section's target word count. Any failure, including an
        unknown section or duration, is returned as an error message string
        rather than raised.
    """
    try:
        # (target, upper bound) word counts per section and lecture length
        length_table = {
            "30 minutes": {'introduction': (500, 550), 'detailed': (1800, 1900), 'examples': (1200, 1300), 'summary': (500, 550)},
            "60 minutes": {'introduction': (1500, 1600), 'detailed': (4500, 4700), 'examples': (2500, 2700), 'summary': (1000, 1100)}
        }

        # NOTE(review): the upper bound is unpacked but never read.
        target_words, _upper_bound = length_table[word_count_option][section_type]

        section_prompts = {
            'introduction': f"Write a {target_words}-word introduction for students. Use friendly tone. Make the content engaging and easy to understand while covering foundational concepts and setting expectations for deeper learning.",
            'detailed': f"Provide {target_words}-word detailed explanations. Make the subject easy to understand. Ensure comprehensive coverage of the subject with a focus on clarity.",
            'examples': f"Give {target_words}-word practical examples. Include step-by-step solutions and real-life uses.",
            'summary': f"Write a {target_words}-word summary and suggest next steps for students. Recap key points in simple terms and outline clear, actionable steps for further understanding."
        }

        system_prompt = "You are an expert at creating educational content. Make sure to generate content with the exact word count. Use friendly tone and make sure the content is not repetitive at all. What you generate should be understandable and clear, ensure the language is accessible and suitable for learners of all levels."

        user_prompt = f"{section_prompts[section_type]}\n\nContext from the document:\n{text[:30000]}"

        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=chat_messages,
            max_tokens=8000,
            temperature=0.7
        )

        first_choice = response['choices'][0]
        generated_text = first_choice['message']['content'].strip()
        return truncate_to_word_limit(generated_text, target_words)

    except Exception as exc:
        return f"Error generating {section_type} section: {str(exc)}"

# Format the output for readability
def format_output(sections):
    """Render the four generated sections as one plain-text document.

    Each section is emitted as a blank line, its numbered heading, an ``=``
    underline of the same length as the heading, the section content, and
    two trailing newlines.

    Args:
        sections: Mapping with the keys 'introduction', 'detailed',
            'examples' and 'summary'.

    Returns:
        The concatenated, formatted text.
    """
    layout = (
        ("1. Introduction to the Subject", 'introduction'),
        ("2. Detailed Explanation of the Subject", 'detailed'),
        ("3. Practical Examples and Solutions", 'examples'),
        ("4. Summary and Next Steps for Students", 'summary'),
    )
    rendered = []
    for heading, key in layout:
        underline = '=' * len(heading)
        rendered.append(f"\n{heading}\n{underline}\n{sections[key]}\n\n")
    return "".join(rendered)

# Main function for processing PDF
def process_pdf(pdf_file, word_count_option):
    """Turn an uploaded PDF into formatted lecture notes.

    The PDF text is extracted and chunked, the first three chunks are used
    as context, and one section is generated per section type before the
    result is formatted.

    Args:
        pdf_file: The uploaded PDF from the Gradio file input.
        word_count_option: Selected lecture duration, e.g. "30 minutes".

    Returns:
        The formatted lecture notes, or a human-readable message when no
        file was given, the PDF has no text, or an exception occurred.
    """
    section_order = ('introduction', 'detailed', 'examples', 'summary')
    try:
        if not pdf_file:
            return "Please upload a PDF file."

        document_text = extract_text_from_pdf(pdf_file)
        if not document_text.strip():
            return "No text found in the PDF."

        # Only the leading chunks of the document are sent as context.
        leading_chunks = chunk_text(document_text)[:3]
        context_text = " ".join(leading_chunks)

        sections = {
            name: generate_chat_section(context_text, name, word_count_option)
            for name in section_order
        }
        return format_output(sections)

    except Exception as exc:
        return f"An error occurred: {str(exc)}"

# Here is the Gradio user interface of the app
# Layout: inputs and instructions in the left column, generated notes on the right.
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        # 📚 AcademyGen - AI Lecture Note Generator by Muammer Eren

        Transform any educational content into well-structured lecture notes. It may take few minutes to generate.
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            file_input = gr.File(
                label="Upload a PDF Transcript",
                file_types=[".pdf"],
                elem_id="file_input"
            )
            # The choices must match the keys of the word-count table used by
            # generate_chat_section.
            duration = gr.Dropdown(
                choices=["30 minutes", "60 minutes"],
                label="Lecture Duration",
                value="30 minutes",
                elem_id="duration_dropdown"
            )
            submit_btn = gr.Button("Generate Lecture Notes", variant="primary")

            # Step 3 names the button exactly as it is labelled above.
            gr.Markdown(
                """
                ### Instructions:
                1. Upload a PDF file containing your teaching material
                2. Select the desired lecture duration
                3. Click 'Generate Lecture Notes' to create your content

                ### Duration Guide:
                - 30 minutes ≈ 3,900 words
                - 60 minutes ≈ 9,000 words
                """
            )

        with gr.Column(scale=3):
            output_text = gr.Textbox(
                label="Generated Lecture Notes",
                lines=30,
                elem_id="output_text"
            )

    # Run process_pdf on click and show its returned text in the output box.
    submit_btn.click(
        fn=process_pdf,
        inputs=[file_input, duration],
        outputs=output_text,
        api_name="generate_content"
    )

    gr.Markdown(
        """
        ### Made By Muammer Eren
        AcademyGen helps all educators create comprehensive and well-structured lecture notes from any transcript.
        These are the contents that will be generated:
        - Clear introduction to the subject
        - Detailed explanations of key concepts
        - Practical examples and solutions
        - Summary and next steps for students
        """
    )

# Launching
iface.launch(debug=True)

# How prompts were engineered and refined:
# I wrote the prompts carefully around specific educational roles (introduction, detailed explanation, examples, and summary) and tested them many times to ensure they generate clear, understandable, structured content.

#Challenges faced and solutions:
#The main challenges were handling long documents and getting fast, accurate responses, which were solved by implementing efficient text chunking and smart processing of key document portions.

#How the system can be extended or scaled:
#The system can scale to serve more users with content creation in different languages, separate logins per user, and database consolidation.

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://22142b9f59bf0bb16f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://22142b9f59bf0bb16f.gradio.live


