In [1]:
import pandas as pd
import pdfplumber
import re
import numpy as np
pd.set_option('future.no_silent_downcasting', True)

In [2]:
def extract_text(script, text):
    return re.search(script, text).group(1)

In [3]:
def extract_header(page):
    text = page.extract_text()
    event = extract_text(r'Event:\s*(.*?)\s*Round', text)
    round= extract_text(r'Round\s*(.*?)\s*Track', text)
    track = extract_text(r'Track:\s*(.*?)\s*Report', text)
    report = extract_text(r'Report:\s*(.*?)\s*Session', text)
    session = extract_text(r'Session:\s*(.*?)\s*\n', text)
    car, driver = extract_text(r'Car\s*(.*?)\s*\n', text).split('-')
    return event, round, track, report, session, car, driver

In [4]:
def extract_table(page):
    table = np.array(page.extract_table())
    return table

In [5]:
def print_header(header):
    event, round, track, report, session, car, driver = header
    result = f"""
    Event: {event}
    Round: {round}
    Track: {track}
    Report: {report}
    Session: {session}
    Car: {car}
    Driver: {driver}
    """
    print(result)

In [6]:
def full_array(size, param):
    return np.full(size, param)

In [7]:
def lap_correction(lap_list):
    new_list = []
    for i in range(len(lap_list)):
        j = lap_list.iloc[i]  
        if i == 0:
            valor = j
        elif i % 2 == 1:
            valor = lap_list.iloc[i-1]  
        else:
            valor = j  
        new_list.append(valor)
    return new_list

In [8]:
def table_correction(df):
    df['Lap'] = lap_correction(df['Lap'])
    df = df.replace('', np.nan)
    
    integer_columns = ['car','Lap']
    float_columns = ['SF_to_T1', 'T1_to_SS1', 'SS1_to_T2', 'T2_to_BS', 'BS_to_T3', 'T3_to_SS2', 'SS2_to_T4', 'T4_to_FS', 'FS_to_SF', 'Lapt', 'PI_to_PO', 'PO_to_SF', 'SF_to_PI']
    for col in integer_columns:
        df[col] = df[col].astype(int)
    for col in float_columns:
        df[col] = df[col].astype(float)
    
    return df

In [9]:
def create_df_page(page):
    header = extract_header(page)
    table = extract_table(page) 
    headers_table = ['Lap', 'T/S', 'SF_to_T1', 'T1_to_SS1', 'SS1_to_T2', 'T2_to_BS', 'BS_to_T3', 'T3_to_SS2', 'SS2_to_T4', 'T4_to_FS', 'FS_to_SF', 'Lapt', 'PI_to_PO', 'PO_to_SF', 'SF_to_PI']
    shape_table = np.shape(table)[0]
    event, round, track, report, session, car, driver = header
    
    event_arr = full_array(shape_table, event)
    round_arr = full_array(shape_table, round)
    track_arr = full_array(shape_table, track)
    report_arr = full_array(shape_table, report)
    session_arr = full_array(shape_table, session)
    car_arr = full_array(shape_table, car)
    driver_arr = full_array(shape_table, driver)
    
    data = {'event': event_arr,
            'round': round_arr,
            'track': track_arr,
            'report': report_arr,
            'session': session_arr,
            'car': car_arr,
            'driver': driver_arr}
    
    df_header = pd.DataFrame(data)
    df_table = pd.DataFrame(table, columns=headers_table)
    df_page = pd.concat([df_header, df_table], axis=1)
    
    df_page_corrected = table_correction(df_page)
    
    return df_page_corrected

In [14]:
def read_all_pages(pdf):
    base_df = create_df_page(pdf.pages[0])
    for page in range(1, len(pdf.pages)):
        print(f'Reading page {page}')
        try:
            new_df = create_df_page(pdf.pages[page])
            base_df = pd.concat([base_df, new_df], axis=0).reset_index(drop=True)
        except:
            print(f'Page # {page+1} was not read it has different content to the format')
            continue
    print('No more pages to read ... we are done')
    return base_df


In [15]:
pdf =  pdfplumber.open('doc_1.pdf')

In [16]:
whole_pdf = read_all_pages(pdf)

Reading page 1
Reading page 2
Reading page 3
Reading page 4
Reading page 5
Reading page 6
Reading page 7
Reading page 8
Reading page 9
Reading page 10
Reading page 11
Reading page 12
Reading page 13
Reading page 14
Reading page 15
Reading page 16
Reading page 17
Reading page 18
Reading page 19
Reading page 20
Reading page 21
Reading page 22
Reading page 23
Reading page 24
Reading page 25
Reading page 26
Reading page 27
Reading page 28
Reading page 29
Reading page 30
Reading page 31
Reading page 32
Reading page 33
Reading page 34
Reading page 35
Reading page 36
Reading page 37
Reading page 38
Reading page 39
Reading page 40
Reading page 41
Reading page 42
Reading page 43
Reading page 44
Reading page 45
Reading page 46
Reading page 47
Reading page 48
Reading page 49
Reading page 50
Reading page 51
Reading page 52
Reading page 53
Reading page 54
Reading page 55
Reading page 56
Reading page 57
Reading page 58
Reading page 59
Reading page 60
Reading page 61
Reading page 62
Reading page 63
R