In [1]:
import pandas as pd

# --- Configuration -----------------------------------------------------------
# Paths are relative to the notebook's working directory. The workbook is
# parsed into an in-memory DataFrame; the file on disk is never modified.
file_path = '../data/raw.xlsx'
output_path = '../data/processed_retail.csv'

# Columns the cleaning step reads. They are checked up front so that a schema
# mismatch is reported once, by name, instead of surfacing mid-pipeline.
REQUIRED_COLUMNS = ('Customer ID', 'Quantity', 'Price')


def clean_retail_data(raw_df):
    """Return a cleaned, independent copy of the raw retail transactions.

    Steps, in order:
      1. Drop rows whose ``Customer ID`` is missing.
      2. Keep only rows where both ``Quantity`` and ``Price`` are > 0.
      3. Cast ``Customer ID`` to integer.
      4. Add ``TotalAmount = Quantity * Price``.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame containing at least the columns in ``REQUIRED_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original index labels of the surviving rows and
        one extra ``TotalAmount`` column. ``raw_df`` is not modified.

    Raises
    ------
    KeyError
        If any column in ``REQUIRED_COLUMNS`` is absent from ``raw_df``.
    """
    missing = [col for col in REQUIRED_COLUMNS if col not in raw_df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    with_customer = raw_df.dropna(subset=['Customer ID'])
    is_valid_sale = (with_customer['Quantity'] > 0) & (with_customer['Price'] > 0)

    # `.astype` and `.assign` each return a new frame, so no column is ever
    # assigned onto a boolean-mask slice (the pattern that triggers
    # SettingWithCopyWarning on pandas versions without copy-on-write).
    return (
        with_customer.loc[is_valid_sale]
        .astype({'Customer ID': int})
        .assign(TotalAmount=lambda frame: frame['Quantity'] * frame['Price'])
    )


try:
    # Reading the Excel file
    df = pd.read_excel(file_path, engine='openpyxl')
    print("✅ File loaded successfully!")

    # Steps 1-4: drop missing customers, filter invalid rows, cast IDs,
    # compute TotalAmount (see clean_retail_data).
    df_cleaned = clean_retail_data(df)

    # Show Cleaning Summary in English
    print("\n--- Cleaning Summary ---")
    print(f"Original records: {len(df)}")
    print(f"Cleaned records: {len(df_cleaned)}")
    print(f"Rows removed: {len(df) - len(df_cleaned)}")

    # 5. Save the cleaned data to CSV
    df_cleaned.to_csv(output_path, index=False)
    print(f"\n✅ Cleaned data saved as: {output_path}")

    # Display the first 5 rows
    display(df_cleaned.head())

except (OSError, ImportError, KeyError) as e:
    # Only anticipated environment problems are reported as a one-line
    # message: unreadable/missing file or unwritable output (OSError), a
    # missing Excel engine (ImportError), or an unexpected schema (KeyError).
    # Any other exception propagates so its full traceback stays visible.
    print(f"❌ Error: {e}")

✅ File loaded successfully!

--- Cleaning Summary ---
Original records: 525461
Cleaned records: 407664
Rows removed: 117797

✅ Cleaned data saved as: ../data/processed_retail.csv


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,TotalAmount
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United Kingdom,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085,United Kingdom,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085,United Kingdom,30.0
