In [10]:
import os
import re
from io import BytesIO

import fitz
import pdfplumber
from pprint import pprint
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from oauth2client.service_account import ServiceAccountCredentials

from tqdm import tqdm

from multiprocessing import Pool, cpu_count

import logging
from rich.logging import RichHandler

import glob

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 15 ms (started: 2024-02-26 15:44:30 -08:00)


In [22]:
class Path:
    current_dir = os.getcwd()
    project_dir = os.path.dirname(current_dir)

    root_dir = os.path.join(project_dir, "oil-wells-data-scraper")
    gcp_service_account_key = os.path.join(root_dir, "test-5681a-202ae82505a3.json")

    data_text_dir = os.path.join(project_dir, "data", "raw-text")

def get_logger(name):
    # Create a logger
    logger = logging.getLogger(name)

    # Set the logging level (adjust as needed)
    logger.setLevel(logging.DEBUG)

    # Create a console handler and set the level
    ch = RichHandler()
    ch.setLevel(logging.DEBUG)

    # Create a formatter and add it to the handler
    formatter = logging.Formatter("%(message)s")
    ch.setFormatter(formatter)

    # Add the handler to the logger
    logger.addHandler(ch)
    return logger


logger = get_logger("Extracter")

time: 0 ns (started: 2024-02-26 16:15:01 -08:00)


In [3]:
# Authenticate using the application default credentials generated by gcloud
gauth = GoogleAuth()
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    Path.gcp_service_account_key, scopes=["https://www.googleapis.com/auth/drive"]
)
gauth.credentials = credentials
gauth.ServiceAuth()

time: 16 ms (started: 2024-02-26 15:39:12 -08:00)


In [4]:
# Initialize GoogleDrive object without authentication
drive = GoogleDrive(gauth)

# Folder ID from the shared link
folder_id = "12g-bhOylyaMoLF5djocnAeZHBx-gsxgY"

# List all files in the folder
file_list = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList()
file_list = [{'id': file_dict['id'], 'title': file_dict['title']} for file_dict in file_list]
file_list

[{'id': '1qXERbF2EhVHPYwPKL0VABomrZ-_n19xY', 'title': 'W28654.pdf'},
 {'id': '1NB3j4hqiZXCxoRTbEnbQFUoRw43DEUdv', 'title': 'W28755.pdf'},
 {'id': '1fgDzVzyPPcXMwqOCDd0-JRtoPOrolciX', 'title': 'W28651.pdf'},
 {'id': '1nz8wvg8mI3LT27VaMZGiFBsQATOSaOf8', 'title': 'W30789.pdf'},
 {'id': '1QThcZsvItpIH6qaBe0pHhT7Qa7uAkyRN', 'title': 'W28652.pdf'},
 {'id': '1UUtKvq0_crRehsXpwI9hTVEcDknlTZAm', 'title': 'W28554.pdf'},
 {'id': '1g85d9r-pEyachdN6dy7H_aRC6Lf4LZJ6', 'title': 'W29242.pdf'},
 {'id': '1ZaDFnuv1gf0arO1b84qYSmqJzdYhP2jJ', 'title': 'W23230.pdf'},
 {'id': '1KgiXY33Fi9oNrYrBTHhRoDgqtsBPQTuw', 'title': 'W28633.pdf'},
 {'id': '19EoftY50DtmIF-MpUyLfFafr7C4UBIz2', 'title': 'W23362.pdf'},
 {'id': '1DuoQUUml5viE9eXDgcjGSNeHP-KFLvmX', 'title': 'W29334.pdf'},
 {'id': '1hFW-vSM7us4hEdZ-CgbhMKROj8AZRSYG', 'title': 'W28754.pdf'},
 {'id': '12akatIC0mO7P61MTBbyXbzmDE5wv2vU2', 'title': 'W28658.pdf'},
 {'id': '1wjjtd6VFaFsEnUMrCHV2vhXBn-rt9Exz', 'title': 'W20197.pdf'},
 {'id': '1-DpH3iXsjsbmYN7Ja99ylFKf

time: 875 ms (started: 2024-02-26 15:39:13 -08:00)


In [32]:
def process_pdf_dep(file, drive):
    filename = file["title"].split(".")[0]
    filepath = os.path.join(Path.data_text_dir, f"{filename}.txt")

    if file["title"].endswith(".pdf"):
        pdf_file = drive.CreateFile({"id": file["id"]})
        print(f"Downloading '{file['title']}' to buffer memory...")
        pdf_bytes = pdf_file.GetContentIOBuffer().read()
        
        # Open the PDF file using pdfplumber
        # with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
        #     text = ""
        #     for page in pdf.pages:
        #         text += page.extract_text()

        # with open(filepath, "w", encoding="utf-8") as f:
        #     f.write(text)

        print(f"Processing PDF '{file['title']}' with PyMuPDF...")
        # Open file with PyMuPDF
        with fitz.open(stream=BytesIO(pdf_bytes), filetype="pdf") as pdf:
            text = ""
            for page_number in range(pdf.page_count):
                page = pdf.load_page(page_number)
                text += page.get_text()

        with open(filepath, "a", encoding="utf-8") as f:
            f.write(text)
            
        print(f"Saved extract text to '{os.path.relpath(filepath)}'.")


def process_pdf(file, drive):
    filename = file["title"].split(".")[0]
    filepath = os.path.join(Path.data_text_dir, f"{filename}.txt")

    if not os.path.exists(filepath):
        if file["title"].endswith(".pdf"):
            pdf_file = drive.CreateFile({"id": file["id"]})
            logger.info(f"Downloading '{file['title']}' to buffer memory...")
            pdf_bytes = pdf_file.GetContentIOBuffer().read()

            logger.info(f"Processing PDF '{file['title']}' with pdfplumber...")
            # Open the PDF file using pdfplumber
            with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
                text = ""
                for page in pdf.pages:
                    text += page.extract_text()

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(text)

            logger.info(f"Processing PDF '{file['title']}' with PyMuPDF...")
            # Open file with PyMuPDF
            with fitz.open(stream=BytesIO(pdf_bytes), filetype="pdf") as pdf:
                text = ""
                for page_number in range(pdf.page_count):
                    page = pdf.load_page(page_number)
                    text += page.get_text()

            with open(filepath, "a", encoding="utf-8") as f:
                f.write(text)

            logger.info(f"Saved extract text to '{os.path.relpath(filepath)}'.")
    logger.info(f"'{os.path.relpath(filepath)}' already exists.")

time: 0 ns (started: 2024-02-26 16:24:01 -08:00)


In [7]:
for fileitem in tqdm(file_list):
    process_pdf(fileitem, drive)

100%|██████████| 77/77 [00:00<00:00, 3079.75it/s]

'..\data\raw-text\W28654.txt' already exists.
'..\data\raw-text\W28755.txt' already exists.
'..\data\raw-text\W28651.txt' already exists.
'..\data\raw-text\W30789.txt' already exists.
'..\data\raw-text\W28652.txt' already exists.
'..\data\raw-text\W28554.txt' already exists.
'..\data\raw-text\W29242.txt' already exists.
'..\data\raw-text\W23230.txt' already exists.
'..\data\raw-text\W28633.txt' already exists.
'..\data\raw-text\W23362.txt' already exists.
'..\data\raw-text\W29334.txt' already exists.
'..\data\raw-text\W28754.txt' already exists.
'..\data\raw-text\W28658.txt' already exists.
'..\data\raw-text\W20197.txt' already exists.
'..\data\raw-text\W25158.txt' already exists.
'..\data\raw-text\W29244.txt' already exists.
'..\data\raw-text\W28634.txt' already exists.
'..\data\raw-text\W28648.txt' already exists.
'..\data\raw-text\W20407.txt' already exists.
'..\data\raw-text\W28655.txt' already exists.
'..\data\raw-text\W28744.txt' already exists.
'..\data\raw-text\W28557.txt' alre




In [33]:
for fileitem in tqdm(file_list):
    process_pdf_dep(fileitem, drive)

  0%|          | 0/77 [00:00<?, ?it/s]

Downloading 'W28654.pdf' to buffer memory...
Processing PDF 'W28654.pdf' with PyMuPDF...


  1%|▏         | 1/77 [00:10<13:03, 10.31s/it]

Saved extract text to '..\data\raw-text\W28654.txt'.
Downloading 'W28755.pdf' to buffer memory...
Processing PDF 'W28755.pdf' with PyMuPDF...


  3%|▎         | 2/77 [00:19<11:42,  9.37s/it]

Saved extract text to '..\data\raw-text\W28755.txt'.
Downloading 'W28651.pdf' to buffer memory...
Processing PDF 'W28651.pdf' with PyMuPDF...


  4%|▍         | 3/77 [00:24<09:05,  7.37s/it]

Saved extract text to '..\data\raw-text\W28651.txt'.
Downloading 'W30789.pdf' to buffer memory...
Processing PDF 'W30789.pdf' with PyMuPDF...


  5%|▌         | 4/77 [00:28<07:26,  6.12s/it]

Saved extract text to '..\data\raw-text\W30789.txt'.
Downloading 'W28652.pdf' to buffer memory...
Processing PDF 'W28652.pdf' with PyMuPDF...


  6%|▋         | 5/77 [00:37<08:35,  7.15s/it]

Saved extract text to '..\data\raw-text\W28652.txt'.
Downloading 'W28554.pdf' to buffer memory...
Processing PDF 'W28554.pdf' with PyMuPDF...


  8%|▊         | 6/77 [00:45<09:00,  7.61s/it]

Saved extract text to '..\data\raw-text\W28554.txt'.
Downloading 'W29242.pdf' to buffer memory...
Processing PDF 'W29242.pdf' with PyMuPDF...


  9%|▉         | 7/77 [00:55<09:43,  8.34s/it]

Saved extract text to '..\data\raw-text\W29242.txt'.
Downloading 'W23230.pdf' to buffer memory...
Processing PDF 'W23230.pdf' with PyMuPDF...


 10%|█         | 8/77 [00:58<07:37,  6.63s/it]

Saved extract text to '..\data\raw-text\W23230.txt'.
Downloading 'W28633.pdf' to buffer memory...
Processing PDF 'W28633.pdf' with PyMuPDF...


 12%|█▏        | 9/77 [01:03<06:59,  6.17s/it]

Saved extract text to '..\data\raw-text\W28633.txt'.
Downloading 'W23362.pdf' to buffer memory...
Processing PDF 'W23362.pdf' with PyMuPDF...


 13%|█▎        | 10/77 [01:12<07:39,  6.86s/it]

Saved extract text to '..\data\raw-text\W23362.txt'.
Downloading 'W29334.pdf' to buffer memory...
Processing PDF 'W29334.pdf' with PyMuPDF...


 14%|█▍        | 11/77 [01:17<07:11,  6.53s/it]

Saved extract text to '..\data\raw-text\W29334.txt'.
Downloading 'W28754.pdf' to buffer memory...
Processing PDF 'W28754.pdf' with PyMuPDF...


 16%|█▌        | 12/77 [01:24<07:05,  6.54s/it]

Saved extract text to '..\data\raw-text\W28754.txt'.
Downloading 'W28658.pdf' to buffer memory...
Processing PDF 'W28658.pdf' with PyMuPDF...


 17%|█▋        | 13/77 [01:29<06:36,  6.20s/it]

Saved extract text to '..\data\raw-text\W28658.txt'.
Downloading 'W20197.pdf' to buffer memory...
Processing PDF 'W20197.pdf' with PyMuPDF...


 18%|█▊        | 14/77 [01:38<07:09,  6.82s/it]

Saved extract text to '..\data\raw-text\W20197.txt'.
Downloading 'W25158.pdf' to buffer memory...
Processing PDF 'W25158.pdf' with PyMuPDF...


 19%|█▉        | 15/77 [01:43<06:35,  6.39s/it]

Saved extract text to '..\data\raw-text\W25158.txt'.
Downloading 'W29244.pdf' to buffer memory...
Processing PDF 'W29244.pdf' with PyMuPDF...


 21%|██        | 16/77 [01:49<06:14,  6.14s/it]

Saved extract text to '..\data\raw-text\W29244.txt'.
Downloading 'W28634.pdf' to buffer memory...
Processing PDF 'W28634.pdf' with PyMuPDF...


 22%|██▏       | 17/77 [01:52<05:26,  5.44s/it]

Saved extract text to '..\data\raw-text\W28634.txt'.
Downloading 'W28648.pdf' to buffer memory...
Processing PDF 'W28648.pdf' with PyMuPDF...


 23%|██▎       | 18/77 [02:01<06:15,  6.37s/it]

Saved extract text to '..\data\raw-text\W28648.txt'.
Downloading 'W20407.pdf' to buffer memory...
Processing PDF 'W20407.pdf' with PyMuPDF...


 25%|██▍       | 19/77 [02:06<05:42,  5.91s/it]

Saved extract text to '..\data\raw-text\W20407.txt'.
Downloading 'W28655.pdf' to buffer memory...
Processing PDF 'W28655.pdf' with PyMuPDF...


 26%|██▌       | 20/77 [02:09<04:55,  5.19s/it]

Saved extract text to '..\data\raw-text\W28655.txt'.
Downloading 'W28744.pdf' to buffer memory...
Processing PDF 'W28744.pdf' with PyMuPDF...


 27%|██▋       | 21/77 [02:15<04:56,  5.29s/it]

Saved extract text to '..\data\raw-text\W28744.txt'.
Downloading 'W28557.pdf' to buffer memory...
Processing PDF 'W28557.pdf' with PyMuPDF...


 29%|██▊       | 22/77 [02:20<04:52,  5.32s/it]

Saved extract text to '..\data\raw-text\W28557.txt'.
Downloading 'W28649.pdf' to buffer memory...
Processing PDF 'W28649.pdf' with PyMuPDF...


 30%|██▉       | 23/77 [02:26<05:02,  5.59s/it]

Saved extract text to '..\data\raw-text\W28649.txt'.
Downloading 'W28978.pdf' to buffer memory...
Processing PDF 'W28978.pdf' with PyMuPDF...


 31%|███       | 24/77 [02:32<04:51,  5.50s/it]

Saved extract text to '..\data\raw-text\W28978.txt'.
Downloading 'W28394.pdf' to buffer memory...
Processing PDF 'W28394.pdf' with PyMuPDF...


 32%|███▏      | 25/77 [02:38<04:54,  5.67s/it]

Saved extract text to '..\data\raw-text\W28394.txt'.
Downloading 'W28637.pdf' to buffer memory...
Processing PDF 'W28637.pdf' with PyMuPDF...


 34%|███▍      | 26/77 [02:43<04:41,  5.52s/it]

Saved extract text to '..\data\raw-text\W28637.txt'.
Downloading 'W28636.pdf' to buffer memory...
Processing PDF 'W28636.pdf' with PyMuPDF...


 35%|███▌      | 27/77 [02:49<04:43,  5.68s/it]

Saved extract text to '..\data\raw-text\W28636.txt'.
Downloading 'W30188.pdf' to buffer memory...
Processing PDF 'W30188.pdf' with PyMuPDF...


 36%|███▋      | 28/77 [02:56<04:56,  6.04s/it]

Saved extract text to '..\data\raw-text\W30188.txt'.
Downloading 'W28303.pdf' to buffer memory...
Processing PDF 'W28303.pdf' with PyMuPDF...


 38%|███▊      | 29/77 [03:02<04:53,  6.11s/it]

Saved extract text to '..\data\raw-text\W28303.txt'.
Downloading 'W23370.pdf' to buffer memory...
Processing PDF 'W23370.pdf' with PyMuPDF...


 39%|███▉      | 30/77 [03:06<04:10,  5.33s/it]

Saved extract text to '..\data\raw-text\W23370.txt'.
Downloading 'W28425.pdf' to buffer memory...
Processing PDF 'W28425.pdf' with PyMuPDF...


 40%|████      | 31/77 [03:11<04:07,  5.38s/it]

Saved extract text to '..\data\raw-text\W28425.txt'.
Downloading 'W28976.pdf' to buffer memory...
Processing PDF 'W28976.pdf' with PyMuPDF...


 42%|████▏     | 32/77 [03:16<04:02,  5.38s/it]

Saved extract text to '..\data\raw-text\W28976.txt'.
Downloading 'W23367.pdf' to buffer memory...
Processing PDF 'W23367.pdf' with PyMuPDF...


 43%|████▎     | 33/77 [03:22<03:58,  5.41s/it]

Saved extract text to '..\data\raw-text\W23367.txt'.
Downloading 'W28194.pdf' to buffer memory...
Processing PDF 'W28194.pdf' with PyMuPDF...


 44%|████▍     | 34/77 [03:28<04:02,  5.63s/it]

Saved extract text to '..\data\raw-text\W28194.txt'.
Downloading 'W30189.pdf' to buffer memory...
Processing PDF 'W30189.pdf' with PyMuPDF...


 45%|████▌     | 35/77 [03:34<03:58,  5.67s/it]

Saved extract text to '..\data\raw-text\W30189.txt'.
Downloading 'W23360.pdf' to buffer memory...
Processing PDF 'W23360.pdf' with PyMuPDF...


 47%|████▋     | 36/77 [03:43<04:32,  6.64s/it]

Saved extract text to '..\data\raw-text\W23360.txt'.
Downloading 'W90244.pdf' to buffer memory...
Processing PDF 'W90244.pdf' with PyMuPDF...


 48%|████▊     | 37/77 [03:48<04:11,  6.29s/it]

Saved extract text to '..\data\raw-text\W90244.txt'.
Downloading 'W23366.pdf' to buffer memory...
Processing PDF 'W23366.pdf' with PyMuPDF...


 49%|████▉     | 38/77 [03:57<04:32,  6.99s/it]

Saved extract text to '..\data\raw-text\W23366.txt'.
Downloading 'W23359.pdf' to buffer memory...
Processing PDF 'W23359.pdf' with PyMuPDF...


 51%|█████     | 39/77 [04:03<04:16,  6.75s/it]

Saved extract text to '..\data\raw-text\W23359.txt'.
Downloading 'W36047.pdf' to buffer memory...
Processing PDF 'W36047.pdf' with PyMuPDF...


 52%|█████▏    | 40/77 [04:08<03:45,  6.09s/it]

Saved extract text to '..\data\raw-text\W36047.txt'.
Downloading 'W23364.pdf' to buffer memory...
Processing PDF 'W23364.pdf' with PyMuPDF...


 53%|█████▎    | 41/77 [04:13<03:26,  5.73s/it]

Saved extract text to '..\data\raw-text\W23364.txt'.
Downloading 'W29316.pdf' to buffer memory...
Processing PDF 'W29316.pdf' with PyMuPDF...


 55%|█████▍    | 42/77 [04:17<03:04,  5.27s/it]

Saved extract text to '..\data\raw-text\W29316.txt'.
Downloading 'W23372.pdf' to buffer memory...
Processing PDF 'W23372.pdf' with PyMuPDF...


 56%|█████▌    | 43/77 [04:20<02:40,  4.73s/it]

Saved extract text to '..\data\raw-text\W23372.txt'.
Downloading 'W23361.pdf' to buffer memory...
Processing PDF 'W23361.pdf' with PyMuPDF...


 57%|█████▋    | 44/77 [04:25<02:32,  4.62s/it]

Saved extract text to '..\data\raw-text\W23361.txt'.
Downloading 'W23368.pdf' to buffer memory...
Processing PDF 'W23368.pdf' with PyMuPDF...


 58%|█████▊    | 45/77 [04:29<02:26,  4.59s/it]

Saved extract text to '..\data\raw-text\W23368.txt'.
Downloading 'W25157.pdf' to buffer memory...
Processing PDF 'W25157.pdf' with PyMuPDF...


 60%|█████▉    | 46/77 [04:33<02:12,  4.27s/it]

Saved extract text to '..\data\raw-text\W25157.txt'.
Downloading 'W28190.pdf' to buffer memory...
Processing PDF 'W28190.pdf' with PyMuPDF...


 61%|██████    | 47/77 [04:40<02:35,  5.18s/it]

Saved extract text to '..\data\raw-text\W28190.txt'.
Downloading 'W15358.pdf' to buffer memory...
Processing PDF 'W15358.pdf' with PyMuPDF...


 62%|██████▏   | 48/77 [04:42<02:05,  4.34s/it]

Saved extract text to '..\data\raw-text\W15358.txt'.
Downloading 'W29317.pdf' to buffer memory...
Processing PDF 'W29317.pdf' with PyMuPDF...


 64%|██████▎   | 49/77 [04:49<02:18,  4.95s/it]

Saved extract text to '..\data\raw-text\W29317.txt'.
Downloading 'W22099.pdf' to buffer memory...
Processing PDF 'W22099.pdf' with PyMuPDF...


 65%|██████▍   | 50/77 [04:58<02:45,  6.13s/it]

Saved extract text to '..\data\raw-text\W22099.txt'.
Downloading 'W28600.pdf' to buffer memory...
Processing PDF 'W28600.pdf' with PyMuPDF...


 66%|██████▌   | 51/77 [05:04<02:39,  6.15s/it]

Saved extract text to '..\data\raw-text\W28600.txt'.
Downloading 'W23371.pdf' to buffer memory...
Processing PDF 'W23371.pdf' with PyMuPDF...


 68%|██████▊   | 52/77 [05:08<02:19,  5.57s/it]

Saved extract text to '..\data\raw-text\W23371.txt'.
Downloading 'W23363.pdf' to buffer memory...
Processing PDF 'W23363.pdf' with PyMuPDF...


 69%|██████▉   | 53/77 [05:13<02:11,  5.48s/it]

Saved extract text to '..\data\raw-text\W23363.txt'.
Downloading 'W25159.pdf' to buffer memory...
Processing PDF 'W25159.pdf' with PyMuPDF...


 70%|███████   | 54/77 [05:18<02:01,  5.27s/it]

Saved extract text to '..\data\raw-text\W25159.txt'.
Downloading 'W25160.pdf' to buffer memory...
Processing PDF 'W25160.pdf' with PyMuPDF...


 71%|███████▏  | 55/77 [05:22<01:46,  4.85s/it]

Saved extract text to '..\data\raw-text\W25160.txt'.
Downloading 'W28756.pdf' to buffer memory...
Processing PDF 'W28756.pdf' with PyMuPDF...


 73%|███████▎  | 56/77 [05:28<01:48,  5.15s/it]

Saved extract text to '..\data\raw-text\W28756.txt'.
Downloading 'W90329.pdf' to buffer memory...


 74%|███████▍  | 57/77 [05:32<01:35,  4.76s/it]

Processing PDF 'W90329.pdf' with PyMuPDF...
Saved extract text to '..\data\raw-text\W90329.txt'.
Downloading 'W28342.pdf' to buffer memory...
Processing PDF 'W28342.pdf' with PyMuPDF...


 75%|███████▌  | 58/77 [05:36<01:29,  4.69s/it]

Saved extract text to '..\data\raw-text\W28342.txt'.
Downloading 'W23365.pdf' to buffer memory...
Processing PDF 'W23365.pdf' with PyMuPDF...


 77%|███████▋  | 59/77 [05:41<01:24,  4.71s/it]

Saved extract text to '..\data\raw-text\W23365.txt'.
Downloading 'W11745.pdf' to buffer memory...
Processing PDF 'W11745.pdf' with PyMuPDF...
Saved extract text to '..\data\raw-text\W11745.txt'.


 78%|███████▊  | 60/77 [05:42<01:04,  3.80s/it]

Downloading 'W25156.pdf' to buffer memory...
Processing PDF 'W25156.pdf' with PyMuPDF...


 79%|███████▉  | 61/77 [05:49<01:13,  4.60s/it]

Saved extract text to '..\data\raw-text\W25156.txt'.
Downloading 'W22221.pdf' to buffer memory...
Processing PDF 'W22221.pdf' with PyMuPDF...


 81%|████████  | 62/77 [05:57<01:23,  5.58s/it]

Saved extract text to '..\data\raw-text\W22221.txt'.
Downloading 'W90258.pdf' to buffer memory...
Processing PDF 'W90258.pdf' with PyMuPDF...


 82%|████████▏ | 63/77 [06:01<01:10,  5.06s/it]

Saved extract text to '..\data\raw-text\W90258.txt'.
Downloading 'W23369.pdf' to buffer memory...
Processing PDF 'W23369.pdf' with PyMuPDF...


 83%|████████▎ | 64/77 [06:08<01:12,  5.60s/it]

Saved extract text to '..\data\raw-text\W23369.txt'.
Downloading 'W22740.pdf' to buffer memory...
Processing PDF 'W22740.pdf' with PyMuPDF...


 84%|████████▍ | 65/77 [06:12<01:03,  5.29s/it]

Saved extract text to '..\data\raw-text\W22740.txt'.
Downloading 'W22220.pdf' to buffer memory...
Processing PDF 'W22220.pdf' with PyMuPDF...


 86%|████████▌ | 66/77 [06:15<00:51,  4.67s/it]

Saved extract text to '..\data\raw-text\W22220.txt'.
Downloading 'W21796.pdf' to buffer memory...
Processing PDF 'W21796.pdf' with PyMuPDF...


 87%|████████▋ | 67/77 [06:19<00:42,  4.26s/it]

Saved extract text to '..\data\raw-text\W21796.txt'.
Downloading 'W28599.pdf' to buffer memory...
Processing PDF 'W28599.pdf' with PyMuPDF...


 88%|████████▊ | 68/77 [06:24<00:40,  4.47s/it]

Saved extract text to '..\data\raw-text\W28599.txt'.
Downloading 'W22731.pdf' to buffer memory...
Processing PDF 'W22731.pdf' with PyMuPDF...


 90%|████████▉ | 69/77 [06:27<00:32,  4.08s/it]

Saved extract text to '..\data\raw-text\W22731.txt'.
Downloading 'W22247.pdf' to buffer memory...
Processing PDF 'W22247.pdf' with PyMuPDF...
Saved extract text to '..\data\raw-text\W22247.txt'.


 91%|█████████ | 70/77 [06:29<00:23,  3.40s/it]

Downloading 'W20863.pdf' to buffer memory...


 92%|█████████▏| 71/77 [06:31<00:18,  3.03s/it]

Processing PDF 'W20863.pdf' with PyMuPDF...
Saved extract text to '..\data\raw-text\W20863.txt'.
Downloading 'W21266.pdf' to buffer memory...
Processing PDF 'W21266.pdf' with PyMuPDF...


 94%|█████████▎| 72/77 [06:34<00:15,  3.19s/it]

Saved extract text to '..\data\raw-text\W21266.txt'.
Downloading 'W20864.pdf' to buffer memory...
Processing PDF 'W20864.pdf' with PyMuPDF...


 95%|█████████▍| 73/77 [06:38<00:13,  3.37s/it]

Saved extract text to '..\data\raw-text\W20864.txt'.
Downloading 'W28601.pdf' to buffer memory...
Processing PDF 'W28601.pdf' with PyMuPDF...


 96%|█████████▌| 74/77 [06:42<00:10,  3.54s/it]

Saved extract text to '..\data\raw-text\W28601.txt'.
Downloading 'W22249.pdf' to buffer memory...
Processing PDF 'W22249.pdf' with PyMuPDF...


 97%|█████████▋| 75/77 [06:45<00:06,  3.45s/it]

Saved extract text to '..\data\raw-text\W22249.txt'.
Downloading 'W25571.pdf' to buffer memory...
Processing PDF 'W25571.pdf' with PyMuPDF...


 99%|█████████▊| 76/77 [06:50<00:03,  3.80s/it]

Saved extract text to '..\data\raw-text\W25571.txt'.
Downloading 'W11920.pdf' to buffer memory...


100%|██████████| 77/77 [06:52<00:00,  5.36s/it]

Processing PDF 'W11920.pdf' with PyMuPDF...
Saved extract text to '..\data\raw-text\W11920.txt'.
time: 6min 52s (started: 2024-02-26 16:24:07 -08:00)





In [40]:
text_file_paths = glob.glob(os.path.join(Path.data_text_dir, "*.txt"))

for path in text_file_paths:
    # Print or process the extracted text
    api_id_pattern = r'(?:\b\d{2}-\d{3}-\d{5}\b)|(?:\b\d{2} - \d{3} - \d{5}\b)|(?:^API(?:\s)?#:(?:\s)?\d{10}$)|(?:API(?:\s+)?\d{10})'

    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    api_id = set(re.findall(api_id_pattern, text))
    api_id = [re.sub(r"[^\d-]", "", id) for id in api_id]

    api_id_final_pattern = r'(\d{2})(\d{3})(\d{5})'
    for i in range(len(api_id)):
        # Find all matches of the pattern in the string
        if len(api_id[i])!=12:
            match = re.search(api_id_final_pattern, api_id[i])
            if match:
                # Format the match into 'XX-XXX-XXXXX' format
                api_id[i] = f"{match.group(1)}-{match.group(2)}-{match.group(3)}"

    d = {
        "file_name": os.path.basename(path),
        "id": set(api_id)
    }
    print(d)

{'file_name': 'W11745.txt', 'id': set()}
{'file_name': 'W11920.txt', 'id': set()}
{'file_name': 'W15358.txt', 'id': set()}
{'file_name': 'W20197.txt', 'id': {'33-053-03413'}}
{'file_name': 'W20407.txt', 'id': {'33-053-06011', '33-005-30347', '33-053-03472', '33-053-06010', '33-053-05924'}}
{'file_name': 'W20863.txt', 'id': set()}
{'file_name': 'W20864.txt', 'id': {'33-053-04981', '33-053-04071', '33-053-03937', '33-053-03911', '33-053-03936', '33-053-03609', '33-053-03912'}}
{'file_name': 'W21266.txt', 'id': set()}
{'file_name': 'W21796.txt', 'id': set()}
{'file_name': 'W22099.txt', 'id': {'33-053-04981', '33-053-04071', '33-053-03937', '33-053-03911', '33-053-03936', '33-053-03609', '33-053-03912'}}
{'file_name': 'W22220.txt', 'id': {'33-053-04981', '33-053-04071', '33-053-03937', '33-053-03911', '33-053-03936', '33-053-03609', '33-053-03912'}}
{'file_name': 'W22221.txt', 'id': {'33-053-04981', '33-053-04071', '33-053-03937', '33-053-03911', '33-053-03936', '33-053-03609', '33-053-039

In [None]:
{'file_name': 'W28654.pdf', 'id': {'33-053-06028'}}
{'file_name': 'W28755.pdf', 'id': {'33-053-03426', '33-053-06056'}}
{'file_name': 'W28651.pdf', 'id': {'33-053-06025'}}
{'file_name': 'W30789.pdf', 'id': {'33-053-06755', '33-053-03426'}}
{'file_name': 'W28652.pdf', 'id': {'33-053-06026'}}
{'file_name': 'W28554.pdf', 'id': {'33-053-05954', '33-053-05998', '33-053-05943', '33-053-05995', '33-053-05997'}}
{'file_name': 'W29242.pdf', 'id': {'33-053-06223'}}
{'file_name': 'W23230.pdf', 'id': {'33-053-03426', '33-053-04211'}}
{'file_name': 'W28633.pdf', 'id': {'33-053-06018'}}
{'file_name': 'W23362.pdf', 'id': {'33-105-02722'}}
{'file_name': 'W29334.pdf', 'id': {'33-053-06243'}}
{'file_name': 'W28754.pdf', 'id': {'33-053-03426', '33-053-06055'}}
{'file_name': 'W28658.pdf', 'id': {'33-053-06030'}}
{'file_name': 'W20197.pdf', 'id': {'33-053-03413'}}
{'file_name': 'W25158.pdf', 'id': {'33-053-04854'}}
{'file_name': 'W29244.pdf', 'id': {'33-053-06225'}}
{'file_name': 'W28634.pdf', 'id': {'33-053-06019'}}
{'file_name': 'W28648.pdf', 'id': {'33-053-06023'}}
{'file_name': 'W20407.pdf', 'id': set()}
{'file_name': 'W28655.pdf', 'id': {'33-053-06029'}}
{'file_name': 'W28744.pdf', 'id': {'33-053-06051'}}
{'file_name': 'W28557.pdf', 'id': {'33-053-05954', '33-053-05998', '33-053-05943', '33-053-05995', '33-053-05997'}}
{'file_name': 'W28649.pdf', 'id': {'33-053-06024'}}
{'file_name': 'W28978.pdf', 'id': {'33-053-06131'}}
{'file_name': 'W28394.pdf', 'id': {'33-053-05954', '33-053-05998', '33-053-05943', '33-053-05995', '33-053-05997'}}
{'file_name': 'W28637.pdf', 'id': {'33-053-06022'}}
{'file_name': 'W28636.pdf', 'id': {'33-053-06021'}}
{'file_name': 'W30188.pdf', 'id': {'33-053-06548'}}
{'file_name': 'W28303.pdf', 'id': {'33-053-05906'}}
{'file_name': 'W23370.pdf', 'id': {'33-105-02730'}}
{'file_name': 'W28425.pdf', 'id': {'33-053-05954', '33-053-05998', '33-053-05943', '33-053-05995', '33-053-05997'}}
{'file_name': 'W28976.pdf', 'id': {'33-053-06129'}}
{'file_name': 'W23367.pdf', 'id': {'33-105-02727'}}
{'file_name': 'W28194.pdf', 'id': {'33-053-05849', '33-053-03433'}}
{'file_name': 'W30189.pdf', 'id': {'33-053-06549'}}
{'file_name': 'W23360.pdf', 'id': {'33-105-02720'}}
{'file_name': 'W90244.pdf', 'id': {'33-053-90244'}}
{'file_name': 'W23366.pdf', 'id': {'33-105-02726'}}
{'file_name': 'W23359.pdf', 'id': {'33-105-02719'}}
{'file_name': 'W36047.pdf', 'id': {'33-053-03433', '33-053-08946'}}
{'file_name': 'W23364.pdf', 'id': {'33-105-02724'}}
{'file_name': 'W29316.pdf', 'id': {'33-053-06231'}}
{'file_name': 'W23372.pdf', 'id': {'33-105-02732'}}
{'file_name': 'W23361.pdf', 'id': {'33-105-02721'}}
{'file_name': 'W23368.pdf', 'id': {'33-105-02728'}}
{'file_name': 'W25157.pdf', 'id': {'33-053-04853'}}
{'file_name': 'W28190.pdf', 'id': {'33-053-05845'}}
{'file_name': 'W15358.pdf', 'id': set()}
{'file_name': 'W29317.pdf', 'id': {'33-053-06232'}}
{'file_name': 'W22099.pdf', 'id': {'33-053-04071', '33-053-03609', '33-053-03936', '33-053-03937', '33-053-03912', '33-053-03911', '33-053-04981'}}
{'file_name': 'W28600.pdf', 'id': {'33-053-06011'}}
{'file_name': 'W23371.pdf', 'id': {'33-105-02731'}}
{'file_name': 'W23363.pdf', 'id': {'33-105-02723'}}
{'file_name': 'W25159.pdf', 'id': {'33-053-04855'}}
{'file_name': 'W25160.pdf', 'id': {'33-053-04856'}}
{'file_name': 'W28756.pdf', 'id': {'33-053-03426', '33-053-06057'}}
{'file_name': 'W90329.pdf', 'id': {'33-053-90329'}}
{'file_name': 'W28342.pdf', 'id': {'33-053-05924'}}
{'file_name': 'W23365.pdf', 'id': {'33-105-02725'}}
{'file_name': 'W11745.pdf', 'id': set()}
{'file_name': 'W25156.pdf', 'id': {'33-053-04852'}}
{'file_name': 'W22221.pdf', 'id': {'33-053-04071', '33-053-03609', '33-053-03936', '33-053-03937', '33-053-03912', '33-053-03911', '33-053-04981'}}
{'file_name': 'W90258.pdf', 'id': set()}
{'file_name': 'W23369.pdf', 'id': {'33-105-02729'}}
{'file_name': 'W22740.pdf', 'id': {'33-053-04071', '33-053-03609', '33-053-03936', '33-053-03937', '33-053-03912', '33-053-03911', '33-053-04981'}}
{'file_name': 'W22220.pdf', 'id': {'33-053-04071', '33-053-03609', '33-053-03936', '33-053-03937', '33-053-03912', '33-053-03911', '33-053-04981'}}
{'file_name': 'W21796.pdf', 'id': set()}
{'file_name': 'W28599.pdf', 'id': {'33-053-06010'}}
{'file_name': 'W22731.pdf', 'id': {'33-053-04069'}}
{'file_name': 'W22247.pdf', 'id': {'33-053-03943'}}
{'file_name': 'W20863.pdf', 'id': set()}
{'file_name': 'W21266.pdf', 'id': set()}
{'file_name': 'W20864.pdf', 'id': {'33-053-04071', '33-053-03609', '33-053-03936', '33-053-03937', '33-053-03912', '33-053-03911', '33-053-04981'}}
{'file_name': 'W28601.pdf', 'id': {'33-053-06012'}}
{'file_name': 'W22249.pdf', 'id': {'33-053-03944'}}
{'file_name': 'W25571.pdf', 'id': {'33-053-04071', '33-053-03609', '33-053-03936', '33-053-03937', '33-053-03912', '33-053-03911', '33-053-04981'}}
{'file_name': 'W11920.pdf', 'id': set()}

In [14]:
import re

# Given list of strings containing API IDs
api_ids = ['API 3305305924', 'API 3305306010', 'API 3305306011', 'API 3300530347']

# Regular expression pattern to extract the 10-digit numbers after 'API'
api_id_pattern = r'API\s*(\d{2})(\d{3})(\d{5})'

# List to store formatted API IDs
formatted_api_ids = []

# Iterate over each string in the list
for api_string in api_ids:
    # Find all matches of the pattern in the string
    match = re.search(api_id_pattern, api_string)
    if match:
        # Format the match into 'XX-XXX-XXXXX' format
        formatted_api_id = f"{match.group(1)}-{match.group(2)}-{match.group(3)}"
        # Append the formatted API ID to the list
        formatted_api_ids.append(formatted_api_id)

# Print the formatted API IDs
print(formatted_api_ids)


['33-053-05924', '33-053-06010', '33-053-06011', '33-005-30347']
time: 0 ns (started: 2024-02-26 02:45:57 -08:00)


In [21]:
pdf_file = drive.CreateFile({"id": "1GoN9tFDGa7nXXeRA-PySoJUkntKVILKl"})
# print(f"Downloading '{file['title']}' to buffer memory...")
pdf_bytes = pdf_file.GetContentIOBuffer().read()

with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
    text = ""
    for page in pdf.pages:
        text += page.extract_text()

print(text)

Slawson Exploration Company, Inc.
McKenzie County, ND
Sec. 36-T153N-R101W
Magnum 1-36-25H
Plan A
Survey: Sperry Survey
Standard Report
22 March, 2012
Well Coordinates: 389,408.89 N, 1,204,680.91 E (48° 01' 30.35" N, 103° 37' 20.35" W)
Ground Level: 2,187.00 ft
Local Coordinate Origin: Centered on Well Magnum 1-36-25H
Viewing Datum: RKB 22.00' @ 2209.00ft (Nabors 419)
TVDs to System: N
North Reference: True
Unit System: API - US Survey Feet - Custom
Geodetic Scale Factor Applied
Version: 2003.16 Build: 43ISlawson Exploration Company, Inc.
McKenzie County, ND
Survey Report for Magnum 1-36-25H - Sperry Survey
Measured Vertical Vertical Dogleg
Depth Inclination Azimuth Depth +N/-S +E/-W Section Rate
(ft) (°) (°) (ft) (ft) (ft) (ft) (°/100ft)
9,506.00 0.44 263.55 9,505.25 10.02 23.24 11.31 0.00
Tie On to Extreme Surveys
9,541.00 0.46 293.91 9,540.25 10.06 22.98 11.34 0.68
First Sperry MWD Survey
9,636.00 0.13 208.26 9,635.24 10.12 22.58 11.38 0.49
9,732.00 0.62 275.00 9,731.24 10.07 22.01 1