In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# --- 1. LOAD DATA ---
# The file is decoded as ISO-8859-1 because this dataset often contains special characters.
# 'Invoice' is read explicitly as text: the column holds plain numeric codes alongside
# codes containing the letter 'C' (see step 3B), and pandas' default type inference
# may otherwise produce a column of mixed int/str values, on which the .str filter
# below would only work by accident.
print("Loading data... this may take a moment.")
df = pd.read_csv(
    '../01_Raw_Data/retail_transactions_raw.csv',
    encoding='ISO-8859-1',
    dtype={'Invoice': str},
)

# --- 2. INITIAL INSPECTION ---
print(f"Original Row Count: {df.shape[0]}")

# --- 3. DATA CLEANING (Business Logic) ---
# The whole pipeline is a single chain that derives df_clean from df without mutating
# an intermediate slice. `.assign` works on its own copy, so there is no chained
# assignment (and no SettingWithCopyWarning hidden by the global warnings filter),
# and re-running the cell always gives the same result.
df_clean = (
    df
    # A. Remove rows where Customer ID is missing
    #    (we can't measure retention without an ID).
    .dropna(subset=['Customer ID'])
    # B. Remove returned items: cancellations carry a 'C' in the Invoice code.
    #    regex=False makes this a literal substring match; na=False keeps rows whose
    #    Invoice is missing, exactly as before.
    .loc[lambda frame: ~frame['Invoice'].str.contains('C', regex=False, na=False)]
    .assign(
        # C. Total sales per line (Quantity * Price), used later to identify
        #    "High Value" customers. Appended as the last column.
        TotalAmount=lambda frame: frame['Quantity'] * frame['Price'],
        # D. Convert the date column to a proper datetime dtype (replaced in place,
        #    so the column order is unchanged).
        # NOTE(review): the format is inferred by pandas; if the raw file uses an
        # ambiguous day/month layout, pass an explicit `format=` — TODO confirm.
        InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']),
    )
)

# --- 4. RESULTS ---
print("-" * 30)
print(f"Cleaned Row Count: {df_clean.shape[0]}")
print(f"Rows Dropped: {df.shape[0] - df_clean.shape[0]}")
print("-" * 30)

# Check the first few rows to ensure it looks right
print(df_clean[['Invoice', 'Customer ID', 'InvoiceDate', 'TotalAmount']].head())

Loading data... this may take a moment.
Original Row Count: 1067371
------------------------------
Cleaned Row Count: 805620
Rows Dropped: 261751
------------------------------
  Invoice  Customer ID         InvoiceDate  TotalAmount
0  489434      13085.0 2009-12-01 07:45:00         83.4
1  489434      13085.0 2009-12-01 07:45:00         81.0
2  489434      13085.0 2009-12-01 07:45:00         81.0
3  489434      13085.0 2009-12-01 07:45:00        100.8
4  489434      13085.0 2009-12-01 07:45:00         30.0


In [2]:
# Write the cleaned transactions to the Processed Data folder.
# index=False leaves the DataFrame's row index out of the CSV, so only the data
# columns are written.
output_path = '../02_Processed_Data/retail_transactions_clean.csv'
df_clean.to_csv(output_path, index=False)
print("File saved successfully to 02_Processed_Data!")

File saved successfully to 02_Processed_Data!
