In [1]:
# Imports & version check
import sys, os
import pandas as pd

# pyarrow is a hard requirement for this notebook (the Parquet read below uses it).
try:
    import pyarrow, pyarrow.parquet as pq
except Exception as e:
    # Chain the original exception so the real cause stays visible in the traceback.
    raise RuntimeError("PyArrow is required. Install with: python -m pip install pyarrow==21.0.0") from e

# deltalake is optional: DeltaTable is set to None when the import fails, and
# the full-table cell checks for that before running.
try:
    from deltalake import DeltaTable
except Exception:
    DeltaTable = None  # Optional for reading a full Delta table

# Report the installed deltalake *version* (the label sits in a version check),
# not just the package name.
if DeltaTable is not None:
    from importlib.metadata import PackageNotFoundError, version
    try:
        deltalake_version = version('deltalake')
    except PackageNotFoundError:
        # Importable but without distribution metadata (e.g. a source checkout).
        deltalake_version = 'installed (version unknown)'
else:
    deltalake_version = 'not installed'

print('Python:', sys.executable)
print('pandas:', pd.__version__)
print('pyarrow:', pyarrow.__version__)
print('deltalake:', deltalake_version)


Python: c:\Users\mohanreddykotha\sumitomo_datalake\sumitomo\.venv\Scripts\python.exe
pandas: 2.3.2
pyarrow: 21.0.0
deltalake: deltalake


In [2]:
# Set your paths
# PARQUET_FILE is a single part file located inside TABLE_DIR.
# NOTE(review): these are absolute paths on one workstation; edit them (or
# derive them from a configurable base directory) before running elsewhere.
PARQUET_FILE = r"C:\Users\mohanreddykotha\sumitomo_datalake\sumitomo\data\bronze_delta\Slovakia\REPORT_X_SLOVAKIA__Sheet1\part-00001-3e05bdd7-3870-4ec7-8f3a-e6bf0b6c053c-c000.snappy.parquet"
TABLE_DIR    = r"C:\Users\mohanreddykotha\sumitomo_datalake\sumitomo\data\bronze_delta\Slovakia\REPORT_X_SLOVAKIA__Sheet1"

# Fail fast with a readable message if either path is wrong.
# isfile (not exists) so that a directory passed by mistake is rejected too.
assert os.path.isfile(PARQUET_FILE), f'Parquet file not found: {PARQUET_FILE}'
assert os.path.isdir(TABLE_DIR), f'Table directory not found: {TABLE_DIR}'
PARQUET_FILE, TABLE_DIR


('C:\\Users\\mohanreddykotha\\sumitomo_datalake\\sumitomo\\data\\bronze_delta\\Slovakia\\REPORT_X_SLOVAKIA__Sheet1\\part-00001-3e05bdd7-3870-4ec7-8f3a-e6bf0b6c053c-c000.snappy.parquet',
 'C:\\Users\\mohanreddykotha\\sumitomo_datalake\\sumitomo\\data\\bronze_delta\\Slovakia\\REPORT_X_SLOVAKIA__Sheet1')

In [7]:
# Business columns — small subset only (UPPERCASE names).
# Order matters: it is the column order used when the data is read below.
BUSINESS_KEEP = [
    'ORDER_NO',
    'MODEL_2',
    'ORDER_DATE',
    'VIN',
    'MODEL_YEAR',
    'BODY',
    'SERIES',
    'ENGINE',
    'TRANSMISSION',
    'COLOUR',
    'TRIM',
    'OPTION_LIST',
    'BUILD_DATE',
]
(len(BUSINESS_KEEP), BUSINESS_KEEP)


(13,
 ['ORDER_NO',
  'MODEL_2',
  'ORDER_DATE',
  'VIN',
  'MODEL_YEAR',
  'BODY',
  'SERIES',
  'ENGINE',
  'TRANSMISSION',
  'COLOUR',
  'TRIM',
  'OPTION_LIST',
  'BUILD_DATE'])

In [10]:
# A) Read the **single Parquet file** with projection (fast)
#    Inspect the file schema first, keep only the BUSINESS_KEEP columns that
#    actually exist in it, then read just those columns.
#    `pq` (pyarrow.parquet) is already imported in the first cell.

schema = pq.read_schema(PARQUET_FILE)
available = set(schema.names)
keep_final = [c for c in BUSINESS_KEEP if c in available]
missing = [c for c in BUSINESS_KEEP if c not in available]

# If none of the requested columns exist, fall back to reading every column
# (the same fallback the Delta-table section uses) rather than returning a
# frame with zero columns.
df = pd.read_parquet(PARQUET_FILE, columns=keep_final or None, engine='pyarrow')

print('Columns requested :', len(BUSINESS_KEEP))
print('Columns read      :', len(keep_final))
if missing:
    print('Missing columns :', missing)

# Show every column in the rich display of the last expression.
pd.set_option('display.max_columns', None, 'display.width', 500)

print(df.head(20).to_string(index=False))
df.head()


Columns requested : 13
Columns read      : 13
ORDER_NO             MODEL_2          ORDER_DATE         VIN MODEL_YEAR                  BODY             SERIES                                          ENGINE                                             TRANSMISSION                        COLOUR                                                               TRIM                                                                                                                                                                                                                                                                     OPTION_LIST          BUILD_DATE
   A0034         V710 CAMPER 2025-01-08 00:00:00                2026.00                CAMPER            320 LWB     2.0L CR TC DSL PANTHER F EURO 6.2 EMISSIONS     8 SPD AUTOMATIC TRANS 8F57 4 WHL L/H PART TIME DRIVE                 BLUE METALLIC                                              NORDIC BLUV - NUNATEK                         19X7.5 A

Unnamed: 0,ORDER_NO,MODEL_2,ORDER_DATE,VIN,MODEL_YEAR,BODY,SERIES,ENGINE,TRANSMISSION,COLOUR,TRIM,OPTION_LIST,BUILD_DATE
0,A0034,V710 CAMPER,2025-01-08 00:00:00,,2026.0,CAMPER,320 LWB,2.0L CR TC DSL PANTHER F EURO 6.2 EMISSIONS,8 SPD AUTOMATIC TRANS 8F57 4 WHL L/H PART TIME...,BLUE METALLIC,NORDIC BLUV - NUNATEK,"19X7.5 ALLOY WHEEL STYLE A , ENGINE UNDERBODY ...",UNPLANNED
1,B1262,V710 CAMPER,2025-02-26 00:00:00,,2026.0,CAMPER,320 LWB,2.0L CR TC DSL PANTHER F EURO 6.2 EMISSIONS,8 SPD AUTOMATIC TRANS 8F57 4 WHL L/H PART TIME...,MOONDUST SILVER C/C MED SOLID,NORDIC BLUV - NUNATEK,"ENGINE UNDERBODY PROTECTION , EXTENDED RANGE S...",UNPLANNED
2,B1263,V710 CAMPER,2025-02-26 00:00:00,,2026.0,CAMPER,320 LWB,2.0L CR TC DSL PANTHER F EURO 6.2 EMISSIONS,8 SPD AUTOMATIC TRANS 8F57 4 WHL L/H PART TIME...,MAGNETIC,NORDIC BLUV - NUNATEK,"ENGINE UNDERBODY PROTECTION , EXTENDED RANGE S...",UNPLANNED
3,B1083,V710 CAMPER,2025-02-14 00:00:00,,2026.0,CAMPER,320 LWB,2.0L CR TC DSL PANTHER F EURO 6.2 EMISSIONS,8 SPD AUTOMATIC TRANS 8F57 4 WHL L/H PART TIME...,MOONDUST SILVER C/C MED SOLID,NORDIC BLUV - NUNATEK,"EXTENDED RANGE SINGLE FU/TNK , DRIVER ASSISTAN...",UNPLANNED
4,B1084,V710 CAMPER,2025-02-14 00:00:00,,2026.0,CAMPER,320 LWB,2.0L CR TC DSL PANTHER F EURO 6.2 EMISSIONS,8 SPD AUTOMATIC TRANS 8F57 4 WHL L/H PART TIME...,ARTISAN RED,NORDIC BLUV - NUNATEK,"EXTENDED RANGE SINGLE FU/TNK , DRIVER ASSISTAN...",UNPLANNED


---
### (Optional) B) Read the **entire Delta table** and then select the same columns
Use this if you want rows across **all** Parquet parts. Requires the `deltalake` package.


In [9]:
# Read the whole Delta table, but only the business columns.
# Column names come from the table schema, so the projection is pushed into
# the read instead of loading every column and slicing afterwards.
# NOTE(review): the unprojected frame `df_all` is no longer created; if a later
# cell needs every column, call `dt.to_pandas()` there.
if DeltaTable is not None:
    dt = DeltaTable(TABLE_DIR)
    print('Delta version:', dt.version())

    cols_in_data = {field.name for field in dt.schema().fields}
    keep_final = [c for c in BUSINESS_KEEP if c in cols_in_data]
    missing = [c for c in BUSINESS_KEEP if c not in cols_in_data]

    # Same fallback as before: when no requested column exists, read everything.
    df_tbl = dt.to_pandas(columns=keep_final or None)

    print('Columns requested :', len(BUSINESS_KEEP))
    print('Columns read      :', len(keep_final))
    if missing:
        print('Missing columns :', missing)

    print(df_tbl.head(20).to_string(index=False))
else:
    print('deltalake not installed; skipping full-table read.')


Delta version: 0
Columns requested : 13
Columns read      : 13
ORDER_NO             MODEL_2          ORDER_DATE         VIN MODEL_YEAR                  BODY             SERIES                                          ENGINE                                             TRANSMISSION                        COLOUR                                                               TRIM                                                                                                                                                                                                                                                                     OPTION_LIST          BUILD_DATE
   A0034         V710 CAMPER 2025-01-08 00:00:00                2026.00                CAMPER            320 LWB     2.0L CR TC DSL PANTHER F EURO 6.2 EMISSIONS     8 SPD AUTOMATIC TRANS 8F57 4 WHL L/H PART TIME DRIVE                 BLUE METALLIC                                              NORDIC BLUV - NUNATEK                