In [1]:
# Section 1: Load Setup
import pandas as pd
import os

# Output location: one local folder that holds both Parquet files.
data_dir = "loaded_data"

# Build both destination paths from the same folder in a single step.
parquet_full_path, parquet_inc_path = (
    os.path.join(data_dir, file_name)
    for file_name in ("full_data.parquet", "incremental_data.parquet")
)

# Create the folder up front; exist_ok keeps the cell safe to re-run.
os.makedirs(data_dir, exist_ok=True)

print("Setup complete. Output directory:", data_dir)

Setup complete. Output directory: loaded_data


In [2]:
# Section 2: Load Full Transformed Data
# Ingest the full transformed CSV extract and persist it as Parquet.
full_csv_name = "transformed_full.csv"
df_full = pd.read_csv(full_csv_name)

# index=False: the DataFrame index is not stored in the Parquet file.
df_full.to_parquet(path=parquet_full_path, index=False)

print(f"Full transformed data saved to {parquet_full_path}")
# Heading and the first five rows, each on its own line.
print("Preview of full data:", df_full.head(), sep="\n")

Full transformed data saved to loaded_data\full_data.parquet
Preview of full data:
   transaction_id        date  customer_id product  amount product_category  \
0               1  2025-03-23         1916   Phone   58.85           Mobile   
1               2  2025-01-15         4752  Laptop  640.10        Computing   
2               3  2025-01-04         1525  Laptop  584.84        Computing   
3               4  2025-02-05         6168  Laptop  144.98        Computing   
4               5  2025-02-01         7572   Phone  560.75           Mobile   

     month  
0  2025-03  
1  2025-01  
2  2025-01  
3  2025-02  
4  2025-02  


In [3]:
# Section 3: Load Incremental Transformed Data
# Read transformed_incremental.csv and save as Parquet
df_inc = pd.read_csv("transformed_incremental.csv")

# When the incremental extract has a header but no rows, pandas has no values
# to infer dtypes from, so the columns do not get the numeric/text dtypes the
# full extract has and the Parquet file would be written with a schema that
# disagrees with full_data.parquet. In that case, borrow the dtypes of the
# full data (df_full, loaded in Section 2) for every column the two frames
# share, so both Parquet files stay schema-compatible.
if df_inc.empty:
    shared_dtypes = {
        col: dtype
        for col, dtype in df_full.dtypes.items()
        if col in df_inc.columns  # astype rejects keys that are not columns
    }
    df_inc = df_inc.astype(shared_dtypes)

df_inc.to_parquet(parquet_inc_path, index=False)
print(f"Incremental transformed data saved to {parquet_inc_path}")
print("Preview of incremental data:")
print(df_inc.head())

Incremental transformed data saved to loaded_data\incremental_data.parquet
Preview of incremental data:
Empty DataFrame
Columns: [transaction_id, date, customer_id, product, amount, product_category, month]
Index: []


In [4]:
# Section 4: Verification
# Read back the Parquet files and verify
verified_full = pd.read_parquet(parquet_full_path)
verified_inc = pd.read_parquet(parquet_inc_path)

# A printed preview can never fail, so check the round trip explicitly:
# each file read back from disk must have the same columns (in order) and
# the same number of rows as the DataFrame it was written from
# (df_full from Section 2, df_inc from Section 3).
for label, written, read_back in (
    ("full", df_full, verified_full),
    ("incremental", df_inc, verified_inc),
):
    assert list(read_back.columns) == list(written.columns), (
        f"{label}: columns changed after Parquet round trip"
    )
    assert len(read_back) == len(written), (
        f"{label}: expected {len(written)} rows, read back {len(read_back)}"
    )

print("Verified full data (first 5 rows):")
print(verified_full.head())
print("Verified incremental data (first 5 rows):")
print(verified_inc.head())

Verified full data (first 5 rows):
   transaction_id        date  customer_id product  amount product_category  \
0               1  2025-03-23         1916   Phone   58.85           Mobile   
1               2  2025-01-15         4752  Laptop  640.10        Computing   
2               3  2025-01-04         1525  Laptop  584.84        Computing   
3               4  2025-02-05         6168  Laptop  144.98        Computing   
4               5  2025-02-01         7572   Phone  560.75           Mobile   

     month  
0  2025-03  
1  2025-01  
2  2025-01  
3  2025-02  
4  2025-02  
Verified incremental data (first 5 rows):
Empty DataFrame
Columns: [transaction_id, date, customer_id, product, amount, product_category, month]
Index: []


* Note: transformed_incremental.csv currently contains no rows because there are no records newer than the last extraction date (2025-04-05), so the incremental Parquet file is also empty.