In [32]:
# import nltk
import fitz  # PyMuPDF
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string

# Download NLTK resources if not already downloaded
# nltk.download('punkt')
# nltk.download('stopwords')

In [28]:
def extract_text_from_pdf(file_path):
    """Extract the plain-text content of every page of a PDF file.

    Args:
        file_path: Path of the PDF to read; passed straight to ``fitz.open``.

    Returns:
        A single string containing the text of all pages, concatenated in
        page order. An empty string if the document has no pages.

    Raises:
        Whatever ``fitz.open`` raises when the file cannot be opened.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(file_path) as document:
        # Collect per-page text and join once instead of growing a string
        # with repeated ``+=``.
        return ''.join(page.get_text("text") for page in document)

In [29]:
def generate_headers(text, num_headers=5, max_header_length=10, min_header_chars=15):
    """Pick the highest-scoring sentences of ``text`` and shorten them into headers.

    Each sentence is scored by summing, over its lower-cased tokens, how often
    that token occurs in the whole text (stop words and punctuation-only
    tokens count as zero). The best-scoring sentences are then truncated to
    at most ``max_header_length`` tokens.

    Args:
        text: The document text to summarise.
        num_headers: Maximum number of headers to return. Values <= 0 yield
            an empty list.
        max_header_length: Maximum number of tokens kept from a sentence;
            longer sentences are cut and suffixed with ``'...'``.
        min_header_chars: A candidate is kept only if it is strictly longer
            than this many characters.

    Returns:
        A list of at most ``num_headers`` unique header strings, ordered from
        highest to lowest sentence score.
    """
    if num_headers <= 0:
        return []

    sentences = sent_tokenize(text)

    # Build the lookup structures once. Evaluating stopwords.words('english')
    # inside the filter would rebuild the list and scan it for every token.
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    def is_content_word(token):
        """Return True for tokens that should contribute to frequency counts."""
        if token in stop_words:
            return False
        # Reject tokens made up solely of punctuation. A plain substring test
        # against string.punctuation misses multi-character tokens such as
        # "``", "''" or "...".
        return not all(char in punctuation_chars for char in token)

    word_frequencies = Counter(
        token for token in word_tokenize(text.lower()) if is_content_word(token)
    )

    # Score every sentence; identical sentences collapse onto a single key.
    sentence_scores = {}
    for sentence in sentences:
        sentence_scores[sentence] = sum(
            word_frequencies.get(token, 0)
            for token in word_tokenize(sentence.lower())
        )

    # sorted() is stable, so equally scored sentences keep document order.
    ranked_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)

    headers = []
    # Look at twice as many candidates as requested so that candidates dropped
    # by the length/duplicate filter can be replaced.
    for sentence in ranked_sentences[:num_headers * 2]:
        tokens = word_tokenize(sentence)
        if len(tokens) > max_header_length:
            candidate = ' '.join(tokens[:max_header_length]) + '...'
        else:
            # Collapse line breaks and repeated spaces so that short headers
            # are single-line, like the truncated ones.
            candidate = ' '.join(sentence.split())

        if len(candidate) > min_header_chars and candidate not in headers:
            headers.append(candidate)

        if len(headers) >= num_headers:
            break

    return headers

In [30]:
def main(file_path=r"E:\University\Materials\6 - Year 5 (2023-2024)\GP\technology.pdf"):
    """Extract the text of a PDF and print headers generated from it.

    Args:
        file_path: Path of the PDF to process. Defaults to the path this
            notebook was originally written against, so calling ``main()``
            with no arguments behaves as before; pass another path to run
            it on a different document or machine.
    """
    text = extract_text_from_pdf(file_path)

    # Generate up to five headers of at most ten tokens each.
    headers = generate_headers(text, num_headers=5, max_header_length=10)

    print("Generated Headers:")
    for header in headers:
        print(header)


In [31]:
# Run the header extraction only when this code is executed as the top-level
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Generated Headers:
Based on their literature review , the authors noted that...
D. The Growth of English Teaching through Technology With the...
b. Teachers ’ Area of Specialty Teachers of technical topics...
Specifically , prior experience with technology is significantly correlated with...
C. Use of Technology in English Teaching And Learning In...
