In [1]:
import sys
import os
import pandas as pd

# --- Path Setup ---
# Treat the parent of the current working directory as the project root and
# make sure it is importable, so the 'src' package imports below resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Only register the root when it is absent, so re-running this cell does not
# append the same entry to sys.path again.
root_is_registered = project_root in sys.path
if not root_is_registered:
    sys.path.append(project_root)

print(f"Project root added to path: {project_root}")


# --- Module Imports ---
from src.cleaning.clean_base import clean_base
from src.parsing.parse_goods_description import parse_goods_description
from src.feature_engineering.features import engineer_features

Project root added to path: C:\Users\Home\OneDrive\Desktop\Assign\siddharth_trade_pipeline


In [2]:
# IMPORTANT: Replace the filename below if yours is different!
file_path = "../data/raw/sample_data.csv"

# Read the raw CSV. A missing file is reported instead of raised: `df` is set
# to None in that case, which the pipeline cell checks before running.
# Note that this does not halt the notebook by itself.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    df = None
    print(f"ERROR: Raw file not found at {file_path}. Please check the file name and location.")
else:
    print(f"Raw data loaded successfully. Initial Shape: {df.shape}")

Raw data loaded successfully. Initial Shape: (2079, 20)


In [3]:
def standardize_units(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardizes the 'UNIT' column into consistent labels using a predefined map.

    Each value is stripped of surrounding whitespace and upper-cased before the
    lookup. Values that are not strings (e.g. NaN, numbers) and strings that are
    not in the map are labelled "OTHERS".

    Args:
        df: Frame containing a 'UNIT' column.

    Returns:
        A new DataFrame with an added (or replaced) 'unit_standardized' column.
        The frame passed in is not modified.

    Raises:
        KeyError: If `df` has no 'UNIT' column.
    """
    UNIT_COL = 'UNIT'
    FALLBACK_LABEL = "OTHERS"

    unit_map = {
        # Pieces/Numbers
        "PCS": "PCS", "PC": "PCS", "NOS": "PCS", "PIECES": "PCS", "SET": "PCS",
        # Weight
        "KG": "KG", "KGS": "KG", "GMS": "GRAMS",
        # Volume/Capacity
        "LTR": "LITER", "LITRE": "LITER", "ML": "ML",
        # Length/Area
        "MTR": "METER", "SQM": "SQM", "INCH": "INCH",
        # Tonnage
        "MT": "MT", "METRIC TON": "MT"
    }

    # Every canonical label must also be accepted as an input. Without this,
    # values that are already standard ("GRAMS", "LITER", "METER") would be
    # misclassified as "OTHERS".
    for canonical in set(unit_map.values()):
        unit_map.setdefault(canonical, canonical)

    if UNIT_COL not in df.columns:
        raise KeyError(
            f"standardize_units requires a '{UNIT_COL}' column; "
            f"available columns: {list(df.columns)}"
        )

    def normalize_unit(u):
        # Missing values arrive as float NaN / None, so anything that is not a
        # string goes straight to the fallback label.
        if not isinstance(u, str):
            return FALLBACK_LABEL
        return unit_map.get(u.strip().upper(), FALLBACK_LABEL)

    # assign() builds a new frame, so the caller's DataFrame is left untouched
    # and each pipeline stage keeps its own distinct result.
    return df.assign(unit_standardized=df[UNIT_COL].map(normalize_unit))

In [4]:
# Run the pipeline stages in order, but only when the raw data was loaded;
# a failed load leaves `df` as None.
if df is None:
    print("\nPipeline stopped due to data loading error.")
else:
    # Stage 2A: base cleaning, run on a copy so the raw frame `df` is preserved.
    print("Running Phase 2A: Basic Cleaning...")
    df_step1 = clean_base(df.copy())

    # Stage 2B: parse the goods-description text.
    print("Running Phase 2B: Goods Description Parsing...")
    df_step2 = parse_goods_description(df_step1)

    # Stage 2C: map raw unit labels onto the standard set.
    print("Running Phase 2C: Unit Standardization...")
    df_step3 = standardize_units(df_step2)

    # Stage 3: derive the engineered feature columns.
    print("Running Phase 3: Feature Engineering...")
    df_final_clean = engineer_features(df_step3)

    print("\n--- Pipeline Execution Complete ---")
    print(f"Final Cleaned Shape: {df_final_clean.shape}")

Running Phase 2A: Basic Cleaning...
Running Phase 2B: Goods Description Parsing...
Running Phase 2C: Unit Standardization...
Running Phase 3: Feature Engineering...

--- Pipeline Execution Complete ---
Final Cleaned Shape: (2079, 33)


  df["date_of_shipment"] = pd.to_datetime(
  df["date_of_shipment"] = pd.to_datetime(


In [5]:
# Review the newly created columns
print("\n--- Sample of Final Processed Data ---")
# Cap the sample size at the row count: DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement). The fixed
# random_state makes the preview reproducible across runs.
sample_size = min(5, len(df_final_clean))
print(df_final_clean[[
    'DATE', 'year', 'TOTAL VALUE_INR', 'DUTY PAID_INR', 
    'grand_total_inr', 'landed_cost_per_unit', 
    'unit_standardized', 'unit_price_in_usd', 
    'category', 'sub_category'
]].sample(sample_size, random_state=42).T)


# Export the processed data for Phase 4 (SQL Loading)
OUTPUT_PATH = "../data/processed/trade_cleaned.csv"
# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
df_final_clean.to_csv(OUTPUT_PATH, index=False)
print(f"\nProcessed data exported successfully to: {OUTPUT_PATH}")


--- Sample of Final Processed Data ---
                                   633                1648               1821  \
DATE                         2023-07-07         2019-02-20         2018-07-24   
year                               2023               2019               2018   
TOTAL VALUE_INR              1518677.74         1208432.58         1830356.41   
DUTY PAID_INR                  556443.5           374372.4           567044.4   
grand_total_inr              2075121.24         1582804.98         2397400.81   
landed_cost_per_unit          345.85354          16.487552           9.227871   
unit_standardized                    KG                PCS                PCS   
unit_price_in_usd                0.4494                NaN                NaN   
category                      Metalware          Metalware          Metalware   
sub_category          General Metalware  General Metalware  General Metalware   

                                   413                1158  
DATE   