In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import PyPDF2
import re
import pickle

def extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: The text of all pages concatenated in page order (no separator
        is inserted between pages). On any failure (missing file, unreadable
        PDF, ...) no exception is raised; a string of the form
        "Error: <message>" is returned instead, so callers that need to
        detect failure must check for that prefix themselves.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Build the result with a single join instead of repeated `+=`,
            # and treat a page that yields no text as an empty string.
            # The join must run while the file is still open.
            return "".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )
    except Exception as e:
        # Deliberate best-effort contract: report the problem as text.
        return f"Error: {str(e)}"

def process_text(text):
    """
    Collect table-of-contents style entries from a block of text.

    An entry is some text, followed by a run of at least five dots,
    followed by a page number. Only entries whose title begins with two or
    more whitespace-separated words are kept.

    Args:
        text (str): The text to scan.

    Returns:
        list: List of tuples (title, page_number), in order of appearance,
        where title is stripped of surrounding whitespace and page_number
        is an int.
    """
    # Title (lazy), optional spaces, 5+ dots, optional spaces, page digits.
    entry_pattern = r'(?<!\d\.\s)(.+?)\s*\.{5,}\s*(\d+)'
    # A kept title has to start with at least two words.
    multiword_pattern = r'\b\w+(\s+\w+)+\b'

    entries = []
    for found in re.finditer(entry_pattern, text, re.MULTILINE):
        raw_title = found.group(1)
        page = found.group(2)
        if not re.match(multiword_pattern, raw_title):
            continue
        entries.append((raw_title.strip(), int(page)))

    return entries


def cut_text_before_phrase(text, phrase):
    """
    Drop everything that precedes the first occurrence of a phrase.

    Args:
        text (str): The original text.
        phrase (str): The phrase to search for.

    Returns:
        str: The text from the first occurrence of the phrase onwards
        (the phrase itself is kept), or the unchanged text when the phrase
        does not occur.
    """
    position = text.find(phrase)
    # str.find signals "not found" with -1; in that case keep everything.
    return text if position == -1 else text[position:]


def divide_text_by_titles(text, title_list):
    """
    Divide the text into chunks, one per occurrence of a known title.

    A title only counts when it appears directly after a newline (or at the
    very start of the text). When several titles match at the same place
    (one title being a prefix of another), only the longest one is used, so
    a short title never steals the section of a longer one.

    Args:
        text (str): The complete text to divide.
        title_list (list): List of 2-tuples (title, level). Only the title
            is used to split the text; the second element is ignored.
            Empty titles are skipped.

    Returns:
        list: List of dictionaries in order of appearance, each with the
        keys 'title' (the matched title) and 'content' (the stripped text
        from the title, inclusive, up to the next matched title or the end
        of the text). Empty when no title is found.
    """
    # De-duplicate titles (keeping first-seen order) and drop empty ones,
    # which would otherwise match at every newline.
    unique_titles = dict.fromkeys(
        title for title, _level in title_list if title
    )

    # Make sure the text starts with a newline so that a title at the very
    # beginning can be found by the same "newline + title" pattern.
    text_with_newline = text if text.startswith('\n') else '\n' + text

    # Collect every occurrence of every title as (start, end, title).
    candidates = []
    for title in unique_titles:
        pattern = re.compile(r'\n' + re.escape(title))
        candidates.extend(
            (found.start(), found.end(), title)
            for found in pattern.finditer(text_with_newline)
        )

    # Order by position; at the same position the longest match comes first.
    candidates.sort(key=lambda item: (item[0], -item[1]))

    # Keep only matches that do not overlap an already accepted one.
    section_starts = []
    claimed_until = 0
    for start, end, title in candidates:
        if start < claimed_until:
            continue
        section_starts.append((start, title))
        claimed_until = end

    chunks = []
    for i, (start, title) in enumerate(section_starts):
        # A section runs up to the next accepted title, or to the end.
        if i + 1 < len(section_starts):
            end_pos = section_starts[i + 1][0]
        else:
            end_pos = len(text_with_newline)

        # start + 1 skips the leading newline; the title itself stays part
        # of the content.
        content = text_with_newline[start + 1:end_pos].strip()
        chunks.append({'title': title, 'content': content})

    return chunks

# usage
if __name__ == "__main__":
    # Source document to split into chunks.
    pdf_path = "/content/drive/MyDrive/ELAN_manual.pdf"  # Replace with your PDF file path

    # Read the whole PDF; the index entries are collected from the full text.
    extracted_text = extract_text_from_pdf(pdf_path)
    titles = process_text(extracted_text)

    # Discard everything before the given marker phrase, then split the
    # remainder at the collected titles.
    cleaned_text = cut_text_before_phrase(extracted_text, "xxiChapter 1. ELAN documents")
    chunks = divide_text_by_titles(cleaned_text, titles)

    # Drop chunks that consist of nothing but their own title.
    chunks_cleaned = [chunk for chunk in chunks if chunk['content'] != chunk['title']]

    # Persist the remaining chunks as a pickle file.
    with open('/content/drive/MyDrive/ELAN_chunks.pkl', 'wb') as file:
        pickle.dump(chunks_cleaned, file)



