In [2]:
import pdfplumber
import pandas as pd
import re
import sys

In [48]:
# Exact header text -> English replacement, for headers that need a
# hand-written English name.
_HEADER_OVERRIDES = {
    'Công ty': "Company",
    "Tổng số ngày nghỉ hưởng lương BHXH (tỷ lệ 75%)\nLeave days subject to 75% Social Insurance":
        "Leave days subject to 75% Social Insurance",
    "Tổng số ngày nghỉ hưởng lương BHXH (tỷ lệ 100%)\nLeave days subject to 100% Social Insurance":
        "Leave days subject to 100% Social Insurance",
    # Previously returned the misspelling "Comission".
    "Lương doanh số Commission": "Commission",
}


def special_case_for_header(header):
    """Return the hand-written English name for a known header.

    The comparison is an exact match on the full header text (including any
    embedded newline), so the caller must pass the header before stripping
    or replacing whitespace.

    Args:
        header: Header text to look up.

    Returns:
        The English replacement when ``header`` is one of the known special
        cases; otherwise ``header`` itself, unchanged.
    """
    try:
        return _HEADER_OVERRIDES.get(header, header)
    except TypeError:
        # An unhashable value cannot be one of the known (string) headers.
        return header

In [54]:
# Build DataFrame column names from the raw header cells
def clean_headers(headers):
    """Convert raw header cells into underscore-separated column names.

    For each header, the segment that follows the first "/ " separator is
    kept (up to the next "/ ", if any). A header without the separator is
    printed and used whole. The chosen text is passed through
    ``special_case_for_header``, stripped, and has newlines and spaces
    replaced by underscores.

    Args:
        headers: Iterable of header cells. ``None`` is treated as an empty
            header and any other non-string value is converted with ``str``.

    Returns:
        list: One cleaned column name per input header, in input order.
    """
    cleaned = []
    for header in headers:
        # A None cell (as seen in extracted table data) has no .split();
        # treat it as an empty header instead of raising AttributeError.
        text = "" if header is None else str(header)
        segments = text.split("/ ")
        if len(segments) > 1:
            english_only = segments[1]
        else:
            # Echo headers that lack the separator; they are used whole and
            # may be renamed by the special-case lookup below.
            print(header)
            english_only = text
        # The special-case lookup runs first because it matches on the raw
        # text, including embedded newlines.
        name = special_case_for_header(english_only)
        cleaned.append(name.strip().replace('\n', ' ').replace(' ', '_'))
    return cleaned

In [11]:
def extract_cleaned_tables_with_index(pdf_path):
    """Read every table in a PDF into a transposed DataFrame with cleaned column names.

    Each non-empty table returned by ``page.extract_tables()`` is loaded with
    its first row as column labels, transposed, and has those labels moved
    into a regular column via ``reset_index``. The first row of the resulting
    frame is passed to ``clean_headers`` to produce the column names and is
    then dropped.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        list: One DataFrame per non-empty table, in page order.
    """
    extracted = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for raw_table in page.extract_tables():
                if not raw_table:
                    # Skip tables for which nothing was extracted.
                    continue
                first_row, *other_rows = raw_table
                frame = pd.DataFrame(other_rows, columns=first_row).transpose().reset_index()
                # Row 0 now holds the header texts; use them as column names...
                frame.columns = clean_headers(frame.iloc[0])
                # ...and remove that row from the data.
                extracted.append(frame.drop(frame.index[0]))
    return extracted

In [56]:
# Hard-coded absolute Windows path to the payslip PDF to parse; change it to
# match the machine the notebook runs on.
pdf_path = r"D:\Hungtv7\Personal_Fin\VNG_payslip\payslip_VG-15316_2021_04.pdf"
# Extract every table in the PDF as a list of DataFrames with cleaned column names.
tables = extract_cleaned_tables_with_index(pdf_path)

Tổng số ngày nghỉ hưởng lương BHXH (tỷ lệ 75%)
Leave days subject to 75% Social Insurance
Tổng số ngày nghỉ hưởng lương BHXH (tỷ lệ 100%)
Leave days subject to 100% Social Insurance
Lương doanh số Commission


In [64]:
# Print every extracted table after removing the row whose index label is 1.
# NOTE(review): a table whose only row is labelled 1 prints as an empty
# DataFrame - confirm that this is intended.
for i, frame in enumerate(tables):
    # The row is dropped regardless of how many rows the table has;
    # DataFrame.drop returns a new frame, so the entries of `tables` are untouched.
    trimmed = frame.drop([1])
    print(f"Transposed Table {i+1}:\n", trimmed)

Transposed Table 1:
 Empty DataFrame
Columns: [Employee_Name, Employee_Code, Company, Department, Beneficiary_Account_No., PIT_code]
Index: []
Transposed Table 2:
          GENERAL_INFORMATION Basic_salary V_Bonus
2                       None                     
3  LƯƠNG CŨ\nPREVIOUS SALARY          0.0     0.0
4  LƯƠNG MỚI\nCURRENT SALARY    9,000,000     0.0
Transposed Table 3:
   WORKING_INFO._IN_MONTH Standard_working_days Actual_working_days  \
2                   None                                             
3                   None                  None                 0.0   
4                   None                  21.0                 4.5   

  Unpaid_leave Paid_leave Leave_days_subject_to_75%_Social_Insurance  \
2                                                                      
3         None       None                                       None   
4          0.5        0.0                                        0.0   

  Leave_days_subject_to_100%_Social_Insurance

In [76]:
# Display the third extracted table (index 2) as the cell output.
tables[2]

Unnamed: 0,WORKING_INFO._IN_MONTH,Standard_working_days,Actual_working_days,Unpaid_leave,Paid_leave,Leave_days_subject_to_75%_Social_Insurance,Leave_days_subject_to_100%_Social_Insurance,OT_on_working_day,OT_on_working_day_(night_shift),OT_on_off-day,OT_on_off-day_(night-_shift),OT_on_holidays,OT_on_holidays_(night_shift)
1,,(3),(4),(5),(6),(7),(8),(9a),(9b),(10a),(10b),(11a),(11b)
2,,,,,,,,,,,,,
3,,,0.0,,,,,,,,,,
4,,21.0,4.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Display the first extracted table (index 0) as the cell output.
tables[0]

Unnamed: 0,Employee_Name,Employee_Code,Company,Department,Beneficiary_Account_No.,PIT_code
1,Trần Việt Hùng,VG-15316 - hungtv7,Công ty Cổ Phần VNG,Financial Analysis & Reporting,4301010748686,8595841018
