In [1]:
!pip install -q tabula-py
!pip install -q jpype1
!pip install --upgrade openpyxl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m496.6/496.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m


In [2]:
import tabula
import pandas as pd
import numpy as np
import os
import glob
from dateutil import parser
import warnings

# Silence UserWarning messages raised from the openpyxl module for the whole
# notebook session. NOTE(review): presumably these come from reading the
# workbook with pd.read_excel — confirm which warning is being hidden.
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

### 1. PDF to pandas DataFrame using the tabula-py library

In [75]:
def process_komatsu_pdf(pdf_path, maquina="KOM 390-10"):
    """Parse page 1 of a Komatsu monthly operation report PDF into a daily table.

    The first table tabula extracts from the fixed page area is cleaned,
    the combined text column is split into individual metrics, fuel figures
    are converted from liters to US gallons, and the result is indexed by date.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the PDF report, passed straight to ``tabula.read_pdf``.
    maquina : str, default "KOM 390-10"
        Machine label written to the ``Shovel`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` (ascending), with columns ``operating_hours``,
        ``idle_hours``, ``total_fuel``, ``gal_hour`` and ``Shovel``; numeric
        values are rounded to one decimal.

    Raises
    ------
    IndexError
        If tabula returns no table for the requested page/area.
    ValueError
        If a metric cell cannot be converted to float, or the split text
        column does not yield exactly eight tokens per row.
    """
    # Same factor as procesar_archivos_kom so PDF- and XLSX-derived rows are
    # converted identically before they are concatenated.
    liters_per_us_gallon = 3.78541
    report_column = 'Informe detallado de funcionamiento mensual'
    split_names = ['eli1', 'eli2', "machine_hours", 'SMR', 'operating_hours',
                   "idle_hours", "total_fuel", "gal_hour"]
    numeric_columns = ["machine_hours", 'SMR', 'operating_hours', "idle_hours",
                       "total_fuel", "gal_hour", "total_fuel_idle"]
    final_columns = ["Date", 'operating_hours', "idle_hours", "total_fuel",
                     "gal_hour", "Shovel"]

    # Read only page 1, restricted to a fixed region of the page
    # (tabula-py documents `area` as [top, left, bottom, right]).
    tables = tabula.read_pdf(pdf_path, area=[20,0,760,520], pages=1)
    if len(tables) == 0:
        raise IndexError(f"tabula found no table on page 1 of {pdf_path!r}")
    df = tables[0]

    # NOTE(review): as written this replaces the empty string with the empty
    # string (a no-op); the pattern may originally have held a non-printable
    # character that was lost — confirm against the real report.
    df[df.columns[0]] = df[df.columns[0]].str.replace("", '')
    # Drop the 'Unnamed: 1' column and incomplete rows, then strip the unit
    # markers 'H', 'l/h' and 'l' from every string cell.
    df = (df.drop('Unnamed: 1', axis=1)
            .dropna()
            .replace({'H': '', 'l/h': '', 'l': ''}, regex=True))
    # Discard the first remaining row (presumably a sub-header — confirm).
    df = df.drop(df.index[0])
    # Keep only the first 10 characters of the first column (the date text).
    df.iloc[:, 0] = df.iloc[:, 0].str[:10]

    # The report column packs eight whitespace-separated tokens per row.
    split_words = df[report_column].str.split()
    df_split = pd.DataFrame(split_words.tolist(), columns=split_names)

    # Rename the first three columns by position and align both frames on a
    # fresh 0..n-1 index before joining them side by side.
    df = df.reset_index(drop=True)
    new_names = ['Date', "eli0", "total_fuel_idle"]
    df = df.rename(columns=dict(zip(df.columns, new_names)))
    df_f = pd.concat([df, df_split], axis=1)
    # Remove throwaway columns and the last row, and treat '-' cells as missing.
    df_f = (df_f.drop(columns=["eli0", "eli1", "eli2"])
                .drop(df_f.index[-1])
                .replace('-', np.nan))

    df_f[numeric_columns] = df_f[numeric_columns].astype(float)

    # Dates in the report are day-first.
    df_f['Date'] = df_f['Date'].apply(lambda x: parser.parse(x, dayfirst=True))

    # Assign machine name
    df_f = df_f.assign(Shovel=maquina)

    # Convert fuel from liters to US gallons https://en.wikipedia.org/wiki/Litre
    df_f["total_fuel"] = df_f["total_fuel"] / liters_per_us_gallon
    df_f["gal_hour"] = df_f["gal_hour"] / liters_per_us_gallon

    # Keep the reporting columns, index by date, sort chronologically, round.
    return (df_f[final_columns]
            .set_index('Date')
            .sort_index()
            .round(1))

In [76]:
# Absolute path to the report PDF (presumably the Colab working directory —
# adjust when running elsewhere).
pdf_path = "/content/komatsu_pdf.pdf"
# Parse the PDF using the function's default machine label ("KOM 390-10").
data_pdf = process_komatsu_pdf(pdf_path)

### 2. XLSX to pandas DataFrame using pandas

In [79]:
def procesar_archivos_kom(path_dir, shovel):
    """Load a Komatsu operation workbook into a Date-indexed DataFrame.

    The sheet is read with ``pd.read_excel`` and its last row is discarded.
    Columns are selected by position: 0 -> ``Date``, 1 -> ``machine_hours``,
    3 -> ``operating_hours``, 4 -> ``idle_hours``, 5 -> ``total_fuel``,
    6 -> ``gal_hour``. The two fuel columns are converted from liters to US
    gallons, numeric values are rounded to one decimal, and rows whose date
    does not parse as ``%d/%m/%Y`` are dropped.

    Parameters
    ----------
    path_dir : str or path-like
        Location of the workbook, passed straight to ``pd.read_excel``.
    shovel : str
        Machine label written to the ``Shovel`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date``, with columns ``operating_hours``, ``idle_hours``,
        ``total_fuel``, ``gal_hour`` and ``Shovel``.
    """
    liters_per_us_gallon = 3.78541
    output_columns = ["Date", 'operating_hours', "idle_hours", "total_fuel",
                      "gal_hour", "Shovel"]

    raw = pd.read_excel(path_dir).iloc[:-1]  # remove the last row

    # Rename columns by position
    positional_names = {
        raw.columns[0]: 'Date',
        raw.columns[1]: 'machine_hours',
        raw.columns[4]: "idle_hours",
        raw.columns[3]: 'operating_hours',
        raw.columns[5]: "total_fuel",
        raw.columns[6]: "gal_hour",
    }

    # Keep only the needed columns. The explicit .copy() yields an independent
    # frame, so the column assignments below do not write into a slice of
    # `raw` (which would raise SettingWithCopyWarning on older pandas).
    df = (raw.assign(Shovel=shovel)
             .rename(columns=positional_names)[output_columns]
             .copy())

    # Convert liters to gallons
    df["total_fuel"] = df["total_fuel"] / liters_per_us_gallon
    df["gal_hour"] = df["gal_hour"] / liters_per_us_gallon

    # Round values
    df = df.round(1)

    # Parse dates; unparseable values become NaT and their rows are removed
    # before the column becomes the index.
    df["Date"] = pd.to_datetime(df["Date"], format='%d/%m/%Y', errors='coerce')
    return df.dropna(subset=["Date"]).set_index("Date")


In [80]:
# Absolute path to the workbook (presumably the Colab working directory —
# adjust when running elsewhere).
path_dir2 = "/content/komatsu_excel.xlsx"
# Load the workbook, labelling every row with the machine name "KOM 390-10".
data_xlsx = procesar_archivos_kom(path_dir2, "KOM 390-10")

### 3. Merge and save in CSV format

In [81]:
# Stack the PDF- and XLSX-derived frames row-wise; ignore_index=False keeps
# each frame's Date index. Rows are not re-sorted or de-duplicated here.
combined_df = pd.concat([data_pdf, data_xlsx], ignore_index=False)

In [82]:
# Write the combined table to a CSV file (relative path, so it lands in the
# current working directory); index=True stores the Date index as a column.
combined_df.to_csv("Shovels_Komatsu_output.csv",index=True)