In [None]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import re
import os
import openai
import time
import ast

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
def extract_pages_text(pdf_path):
    """Return the text of each page of a PDF as a list of strings.

    Args:
        pdf_path: Path handed straight to pdfminer's ``extract_pages``.

    Returns:
        One string per page layout, in the order ``extract_pages`` yields
        them. Each string is the ``get_text()`` output of the page's
        ``LTTextContainer`` elements joined with newlines; every other kind of
        layout element is ignored, so a page without text containers yields
        an empty string.
    """
    return [
        '\n'.join(
            item.get_text()
            for item in layout
            if isinstance(item, LTTextContainer)
        )
        for layout in extract_pages(pdf_path)
    ]

def is_toc_page(text):
    """Heuristically decide whether a page looks like a table of contents.

    A page is reported as a contents page when any of these holds:

    1. It has a line consisting only of "contents", "table of contents" or
       "list of contents" (surrounding blanks allowed, any letter case).
       A plain substring test is deliberately avoided: body pages that merely
       use the word (e.g. "gold contents of the ore") must not be dropped.
    2. It contains a dot leader (five or more consecutive dots) and at least
       one line ending in a digit.
    3. More than five of its lines end in a digit.

    Args:
        text: Text of a single page.

    Returns:
        True if the page matches one of the heuristics, otherwise False.
    """
    # [^\S\n] = any whitespace except a newline, so the heading has to sit on
    # one line while still tolerating the doubled spaces of justified text.
    heading = r'^[^\S\n]*(?:(?:table|list)[^\S\n]+of[^\S\n]+)?contents[^\S\n]*$'
    if re.search(heading, text, re.IGNORECASE | re.MULTILINE):
        return True
    if re.search(r'\.{5,}', text) and re.search(r'\d{1,3}\s*$', text, re.MULTILINE):
        return True
    # NOTE(review): this count also fires on tables or lists whose lines end
    # in numbers; left unchanged here, confirm against real pages.
    lines_ending_in_digit = sum(
        1 for line in text.split('\n') if re.match(r'.*\d{1,3}\s*$', line)
    )
    return lines_ending_in_digit > 5

def is_ack_page(text):
    """Report whether a page mentions an acknowledgements section.

    Matches all four common spellings: "acknowledgement(s)" and
    "acknowledgment(s)". The optional "e" is what lets the singular US
    spelling "acknowledgment" match as well. Matching ignores letter case.

    Args:
        text: Text of a single page.

    Returns:
        True if any spelling occurs anywhere in ``text``, otherwise False.
    """
    return re.search(r'acknowledge?ment', text, re.IGNORECASE) is not None

def is_declaration_page(text):
    """Report whether the substring "declaration" occurs in ``text``.

    The comparison is case-sensitive; the visible caller passes lower-cased
    page text.

    NOTE(review): this is a plain substring test, so a body page that merely
    uses the word would also match. Confirm that is acceptable.
    """
    return text.find("declaration") != -1

def is_main_section_start(text):
    """Report whether ``text`` contains an "introduction" marker.

    Matches, ignoring case, either the standalone word "introduction" or
    "introduction" directly preceded by "1." / "chapter 1" with optional
    colons, whitespace or hyphens in between.

    Returns:
        True if the pattern is found anywhere in ``text``, otherwise False.
    """
    intro_marker = re.compile(
        r'\b(?:1\.|chapter\s*1)[:\s-]*introduction\b|\bintroduction\b',
        re.IGNORECASE,
    )
    found = intro_marker.search(text)
    return found is not None

def remove_empty_lines(text):
    """Drop lines that are empty or whitespace-only and rejoin with "\\n".

    Lines are split with ``str.splitlines``, so any line-ending style in the
    input is normalised to "\\n" in the result. Kept lines are not stripped.
    """
    kept = []
    for row in text.splitlines():
        if row.strip():
            kept.append(row)
    return "\n".join(kept)

def remove_references(text):
    """Cut ``text`` at the first "References" heading line.

    A heading is a line containing only the word "references"
    (case-insensitive), optionally:

    * preceded by a section number such as "7." / "7.1" / "Chapter 8",
    * written as "list of references" or "references cited",
    * followed by a colon.

    Lines that merely mention the word inside a sentence (for example
    "see references therein") are not headings and do not truncate the text.

    Args:
        text: The document text.

    Returns:
        The text before the heading with surrounding whitespace stripped, or
        ``text`` unchanged if no heading line is found.
    """
    # [^\S\n] = whitespace other than newline, which keeps every part of the
    # pattern on a single line under re.MULTILINE.
    heading = re.compile(
        r'^[^\S\n]*'
        r'(?:(?:chapter[^\S\n]+)?\d+(?:\.\d+)*[.:]?[^\S\n]+)?'
        r'(?:list[^\S\n]+of[^\S\n]+)?references(?:[^\S\n]+cited)?'
        r'[^\S\n]*:?[^\S\n]*$',
        re.IGNORECASE | re.MULTILINE,
    )
    match = heading.search(text)
    if match:
        return text[:match.start()].strip()
    return text

def pdf_to_text(pdf_path):
    """Extract the main body text of a PDF, dropping front matter and references.

    Steps:
      1. Extract the text of every page and discard the first page (cover).
      2. Discard every remaining page that ``is_toc_page``, ``is_ack_page`` or
         ``is_declaration_page`` flags. The detectors receive the lower-cased
         page text; the pages that are kept retain their original case.
      3. Join the kept pages, remove blank lines, and trim everything from
         the references heading onwards.

    Args:
        pdf_path: Path of the PDF to convert.

    Returns:
        A ``(text, pages_left)`` tuple. ``pages_left`` is the number of pages
        kept after step 2, i.e. counted before the references are trimmed.
    """
    pages = extract_pages_text(pdf_path)
    body_pages = pages[1:]  # Remove the first page (cover)

    # This function used to track a per-section "skip mode" as well, but that
    # state never changed the outcome: a page was dropped exactly when one of
    # the three detectors fired on it. The filter below keeps the same pages.
    # NOTE(review): if the intent was to also drop unmarked continuation pages
    # until the introduction starts, that needs to be designed explicitly.
    filtered_pages = []
    for pg in body_pages:
        pg_lower = pg.lower()
        if is_toc_page(pg_lower) or is_ack_page(pg_lower) or is_declaration_page(pg_lower):
            continue
        filtered_pages.append(pg)

    pages_left = len(filtered_pages)

    main_body_text = "\n\n".join(filtered_pages)
    main_body_text = remove_empty_lines(main_body_text)
    main_body_text = remove_references(main_body_text)
    return main_body_text, pages_left

def process_pdf_folder(folder_path, output_txt_folder=None, csv_report_path=None):
    """Convert every PDF in a folder to body text and optionally save the results.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose name
            ends in ".pdf", compared case-insensitively.
        output_txt_folder: If given, the directory is created when missing and
            each PDF's text is written there as "<pdf stem>.txt" (UTF-8).
        csv_report_path: If given, a CSV with the columns ``filename`` and
            ``pages_left`` is written to this path.

    Returns:
        A list of ``{"filename": ..., "pages_left": ...}`` dicts, one per PDF
        whose text extraction succeeded, in sorted filename order.

    A failure on one file is printed and does not stop the batch. An entry is
    recorded as soon as extraction succeeds, so a later error while writing
    the .txt file is reported but the entry stays in the results.
    """
    results = []
    if output_txt_folder:
        os.makedirs(output_txt_folder, exist_ok=True)
    # os.listdir returns names in arbitrary order; sorting keeps the printed
    # log and the report reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        print(f"Processing: {filename}")
        try:
            text, num_pages = pdf_to_text(pdf_path)
            results.append({"filename": filename, "pages_left": num_pages})
            if output_txt_folder:
                txt_filename = os.path.splitext(filename)[0] + ".txt"
                txt_path = os.path.join(output_txt_folder, txt_filename)
                with open(txt_path, "w", encoding="utf-8") as f:
                    f.write(text)
        except Exception as e:
            # Deliberate best effort: report the failure and move on.
            print(f"Failed to process {filename}: {e}")
    # Report of pages
    if csv_report_path:
        # pandas is only needed for the report, so import it only then.
        import pandas as pd
        # Explicit columns keep the header row even when nothing was processed.
        report = pd.DataFrame(results, columns=["filename", "pages_left"])
        report.to_csv(csv_report_path, index=False)
    return results


# Batch-convert the PDFs in the input folder to text files and write the
# per-file page report. The cell's recorded output shows that this guard passes
# when the cell is run in the notebook.
# NOTE(review): the /content/drive/MyDrive paths are hard-coded; presumably a
# mounted Google Drive in Colab - confirm, and consider a configurable base dir.
if __name__ == "__main__":
    # Folder that holds the source PDFs.
    folder = "/content/drive/MyDrive/data_pdf"
    # Destination folder for the extracted .txt files.
    out_folder = "/content/drive/MyDrive/Data3_txt"
    # CSV listing how many pages were kept for each PDF.
    report_csv = "/content/drive/MyDrive/pdf_page_report.csv"
    process_pdf_folder(folder, out_folder, report_csv)


Processing: 2015_Masurel_phd.pdf
Processing: 2008_MATABANE_FE3.pdf
Processing: 2012_Lindsay_thesis_final.pdf




Processing: 2007_Tshibubudze_THE MARKOYE FAULT_2007.pdf


In [None]:
def text_to_sentence_paragraphs(text, sentences_per_paragraph=6):
    """Group the sentences of ``text`` into fixed-size pseudo-paragraphs.

    Sentences are found with a simple rule: split at whitespace that follows
    ".", "!" or "?". Abbreviations followed by whitespace (e.g. "ca. 2115")
    therefore also end a "sentence". Line breaks inside a sentence are kept.

    Args:
        text: The text to split.
        sentences_per_paragraph: Number of sentences joined (with a single
            space) into each paragraph; the last paragraph may be shorter.

    Returns:
        A list of paragraph strings; empty if ``text`` has no non-blank content.

    Raises:
        ValueError: If ``sentences_per_paragraph`` is less than 1. A negative
            value would otherwise silently yield no paragraphs at all.

    Note: the same function is defined again in a later cell of this notebook.
    """
    if sentences_per_paragraph < 1:
        raise ValueError("sentences_per_paragraph must be a positive integer")
    # Split into sentences using regex (simple version)
    pieces = re.split(r'(?<=[.!?])\s+', text)
    sentences = [piece.strip() for piece in pieces if piece.strip()]
    return [
        " ".join(sentences[start:start + sentences_per_paragraph])
        for start in range(0, len(sentences), sentences_per_paragraph)
    ]


In [None]:

# Spot-check the sentence grouping on one converted text file.
# NOTE(review): the path is hard-coded to a single file on the mounted drive.
with open("/content/drive/MyDrive/Data3_txt/2008_MATABANE_FE3.txt", "r", encoding="utf-8") as f:
    text = f.read()
# Group the file's sentences six at a time.
paragraphs = text_to_sentence_paragraphs(text, sentences_per_paragraph=6)
# Report how many paragraphs were produced and show the first one.
print(f"Found {len(paragraphs)} paragraphs.")
print("\nSample paragraph:\n", paragraphs[0])


Found 84 paragraphs.

Sample paragraph:
 Abstract 
The  FE3  open  casts  (Sadiola  goldfield,  Mali)  are  operated  by  the  by  the  Société 
d’Exploitation des Mines d’Or de Sadiola S.A.. The open casts comprise meta-sedimentary 
rocks  that  can  be  divided  into  five  units;  lower  slump  facies,  upper  slump  facies,  siltstone-
greywacke  unit,  greywacke  unit  and  a  laterite  profile. The  lower  slump  facies  consists  of 
siltstone beds and is characterised by massive chaotic slump folds and olistoliths. The upper 
slump  facies  consists  of  greywacke-siltstone  beds  and  is  characterised  by  turbidite  beds, 
slump  folds  and  fluid  escape  structures. The  siltstone-greywacke  unit  is  characterised  by 
turbidite beds. The greywacke unit is carbonaceous.


In [None]:
def text_to_sentence_paragraphs(text, sentences_per_paragraph=6):
    """Group the sentences of ``text`` into fixed-size pseudo-paragraphs.

    Sentences are found with a basic rule: split at whitespace that follows
    ".", "!" or "?". Abbreviations followed by whitespace (e.g. "ca. 2115")
    therefore also end a "sentence". Line breaks inside a sentence are kept.

    Args:
        text: The text to split.
        sentences_per_paragraph: Number of sentences joined (with a single
            space) into each paragraph; the last paragraph may be shorter.

    Returns:
        A list of paragraph strings; empty if ``text`` has no non-blank content.

    Raises:
        ValueError: If ``sentences_per_paragraph`` is less than 1. A negative
            value would otherwise silently yield no paragraphs at all.
    """
    # NOTE(review): this function is also defined in an earlier cell; this
    # later definition is the one in effect for the cells that follow.
    if sentences_per_paragraph < 1:
        raise ValueError("sentences_per_paragraph must be a positive integer")
    # Basic sentence splitter
    pieces = re.split(r'(?<=[.!?])\s+', text)
    sentences = [piece.strip() for piece in pieces if piece.strip()]
    return [
        " ".join(sentences[start:start + sentences_per_paragraph])
        for start in range(0, len(sentences), sentences_per_paragraph)
    ]

def folder_txts_to_paragraphs(folder_path, sentences_per_paragraph=6):
    """Split every .txt file in a folder into sentence-group paragraphs.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose name
            ends in ".txt", compared case-insensitively. Files are read as UTF-8.
        sentences_per_paragraph: Forwarded to ``text_to_sentence_paragraphs``.

    Returns:
        A dict mapping each file name to its list of paragraphs. Keys are
        inserted in sorted file-name order, so iterating or slicing the dict
        gives the same result on every run.
    """
    file_paragraphs = {}
    # os.listdir returns names in arbitrary order; sort for reproducibility.
    for fname in sorted(os.listdir(folder_path)):
        if not fname.lower().endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, fname)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        file_paragraphs[fname] = text_to_sentence_paragraphs(
            text, sentences_per_paragraph=sentences_per_paragraph
        )
    return file_paragraphs


In [None]:
# Build {file name: [paragraph, ...]} for every .txt file in the folder.
# NOTE(review): hard-coded path on the mounted drive.
folder_path = "/content/drive/MyDrive/Data3_txt"
file_paragraphs = folder_txts_to_paragraphs(folder_path, sentences_per_paragraph=6)

# Preview: for the first three files in the dict, print the first five
# paragraphs, each truncated to 300 characters.
test_files = list(file_paragraphs.keys())[:3]
for fname in test_files:
    print(f"\n--- {fname} ---")
    for i, para in enumerate(file_paragraphs[fname][:5]):
        print(f"Paragraph {i+1}: {para[:300]}{'...' if len(para)>300 else ''}\n")


--- 2015_Masurel_phd.txt ---
Paragraph 1: ii 
A vous Maman, Papa, Manon, Arthur 
“Happiness is only real when shared” 
(Christopher Johnson McCandless, Into the wild) 
iii 
iv 
Abstract 
The  Paleoproterozoic  belts  of  West  Africa  are  the  fastest-growing  gold 
production 
in 
the  world. The  Kédougou-Kénieba 
inlier  (KKI)  represen...

Paragraph 2: Deposit-scale investigations indicate that gold deposits of the Sadiola-Yatela district are 
primarily  hosted  by 
impure  metacarbonates  and, 
to  a 
lesser  extent,  detrital 
metasedimentary  rocks  (e.g.,  wacke,  arenite,  siltstone,  and  argillite)  and  Eburnean 
granitoids. The  presence ...

Paragraph 3: The D2 and 
D3  events  represent  a  transpressional  deformation  continuum  that  was  associated  with 
voluminous calc-alkaline magmatism. This deformation continuum marks the principal 
imprint  of  the  Eburnean  orogeny  (ca. 2115-2060  Ma)  in  the  district. Eburnean 
granitoids exposed in...

Paragraph 4: Des