In [18]:
import pandas as pd
import pdfplumber
import re
import numpy as np
import os
import sys
pd.set_option('future.no_silent_downcasting', True)

In [2]:
def extract_text(script, text):
    return re.search(script, text).group(1)

In [3]:
def extract_header(page):
    text = page.extract_text()
    event = extract_text(r'Event:\s*(.*?)\s*Round', text)
    round= extract_text(r'Round\s*(.*?)\s*Track', text)
    track = extract_text(r'Track:\s*(.*?)\s*Report', text)
    report = extract_text(r'Report:\s*(.*?)\s*Session', text)
    session = extract_text(r'Session:\s*(.*?)\s*\n', text)
    car, driver = extract_text(r'Car\s*(.*?)\s*\n', text).split('-')
    return event, round, track, report, session, car, driver

In [4]:
def extract_table(page):
    table = np.array(page.extract_table())
    return table

In [5]:
def print_header(header):
    event, round, track, report, session, car, driver = header
    result = f"""
    Event: {event}
    Round: {round}
    Track: {track}
    Report: {report}
    Session: {session}
    Car: {car}
    Driver: {driver}
    """
    print(result)

In [6]:
def full_array(size, param):
    return np.full(size, param)

In [7]:
def lap_correction(lap_list):
    new_list = []
    for i in range(len(lap_list)):
        j = lap_list.iloc[i]  
        if i == 0:
            valor = j
        elif i % 2 == 1:
            valor = lap_list.iloc[i-1]  
        else:
            valor = j  
        new_list.append(valor)
    return new_list

In [8]:
def table_correction(df):
    df['Lap'] = lap_correction(df['Lap'])
    df = df.replace('', np.nan)
    
    integer_columns = ['car','Lap']
    float_columns = ['SF_to_T1', 'T1_to_SS1', 'SS1_to_T2', 'T2_to_BS', 'BS_to_T3', 'T3_to_SS2', 'SS2_to_T4', 'T4_to_FS', 'FS_to_SF', 'Lapt', 'PI_to_PO', 'PO_to_SF', 'SF_to_PI']
    for col in integer_columns:
        df[col] = df[col].astype(int)
    for col in float_columns:
        df[col] = df[col].astype(float)
    
    return df

In [9]:
def create_df_page(page):
    header = extract_header(page)
    table = extract_table(page) 
    headers_table = ['Lap', 'T/S', 'SF_to_T1', 'T1_to_SS1', 'SS1_to_T2', 'T2_to_BS', 'BS_to_T3', 'T3_to_SS2', 'SS2_to_T4', 'T4_to_FS', 'FS_to_SF', 'Lapt', 'PI_to_PO', 'PO_to_SF', 'SF_to_PI']
    shape_table = np.shape(table)[0]
    event, round, track, report, session, car, driver = header
    
    event_arr = full_array(shape_table, event)
    round_arr = full_array(shape_table, round)
    track_arr = full_array(shape_table, track)
    report_arr = full_array(shape_table, report)
    session_arr = full_array(shape_table, session)
    car_arr = full_array(shape_table, car)
    driver_arr = full_array(shape_table, driver)
    
    data = {'event': event_arr,
            'round': round_arr,
            'track': track_arr,
            'report': report_arr,
            'session': session_arr,
            'car': car_arr,
            'driver': driver_arr}
    
    df_header = pd.DataFrame(data)
    df_table = pd.DataFrame(table, columns=headers_table)
    df_page = pd.concat([df_header, df_table], axis=1)
    
    df_page_corrected = table_correction(df_page)
    
    return df_page_corrected

In [10]:
def read_all_pages(pdf):
    base_df = create_df_page(pdf.pages[0])
    for page in range(1, len(pdf.pages)):
        print(f'Reading page {page}')
        try:
            new_df = create_df_page(pdf.pages[page])
            base_df = pd.concat([base_df, new_df], axis=0).reset_index(drop=True)
        except:
            print(f'Page # {page+1} was not read it has different content to the format')
            continue
    print('No more pages to read ... we are done')
    return base_df


In [24]:
def read_and_save_dataset(route):
    new_datasets_route = 'new_datasets/'
    # Check and creates the directory if it does not exist
    os.makedirs(new_datasets_route, exist_ok=True)
    try:
        
        pdf_files = [file for file in os.listdir(route) if file.lower().endswith(".pdf")]

        if not pdf_files:
            raise FileNotFoundError(f"The folder {route} does not have any pdf file to read")

        for file in os.listdir(route):
            if file.lower().endswith(".pdf"):
                route_pdf = os.path.join(route, file)
                print(f"📖 Reading: {file}, Route: {route_pdf}")
                pdf =  pdfplumber.open(f'{route_pdf}')
                whole_pdf = read_all_pages(pdf)
                name_file = file.replace('.pdf',"")
                route_new_dataset = os.path.join(new_datasets_route, f'{name_file}.parquet')
                whole_pdf.to_parquet(route_new_dataset, index=False)

    except FileNotFoundError as e:
        print(e)
    except Exception as e:
        print(f"Unexpected error: {e}")
        


In [25]:
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Error: You must provide a folder path with PDFs")
        sys.exit(1)  
    route_pdfs = sys.argv[1] 
    read_and_save_dataset(route_pdfs)

Unexpected error: [WinError 123] El nombre de archivo, el nombre de directorio o la sintaxis de la etiqueta del volumen no son correctos: '--f=c:\\Users\\Gamer2022\\AppData\\Roaming\\jupyter\\runtime\\kernel-v368931657b6c357121e6c8c2cbed43ed0bd1ddec2.json'


In [None]:
def read_and_save_dataset(route):
    new_datasets_route = './new_datasets/'
    # Checks and creates the directory if it does not exist
    os.makedirs(new_datasets_route, exist_ok=True)

    for file in os.listdir(route):
        if file.lower().endswith(".pdf"):
            # Route for reading the files
            route_pdf = os.path.join(route, file)
            print(f"📖 Reading: {file}, Route: {route_pdf}")
            # Read the pages
            pdf = pdfplumber.open(f'{route_pdf}')
            whole_pdf = read_all_pages(pdf)
            # Save the dataset created in the new_datasets folder
            name_file = file.replace('.pdf', "")
            route_new_dataset = os.path.join(
                new_datasets_route, f'{name_file}.parquet')
            whole_pdf.to_parquet(route_new_dataset, index=False)