In [1]:
# 📦 Import required libraries
import pandas as pd
import numpy as np

# 📂 Load the dataset
df = pd.read_csv("../data/online_retail.csv", encoding='ISO-8859-1')
print("Original dataset shape:", df.shape)

# 🔍 View basic info
df.info()
# A bare `df.head()` is only rendered by the notebook when it is the last
# expression of a cell; in the middle of a cell its result is discarded,
# so print the preview explicitly.
print(df.head())

# 🚫 Build the row filters up front so every cleaning rule is visible in one place
has_customer = df['CustomerID'].notna()                              # drop rows with missing CustomerID
is_cancelled = df['InvoiceNo'].astype(str).str.startswith('C')       # canceled orders start with 'C'
is_positive_sale = (df['Quantity'] > 0) & (df['UnitPrice'] > 0)      # drop Quantity <= 0 or UnitPrice <= 0

# Apply all filters in a single pass and take an explicit copy: the columns
# assigned below are then written to an independent DataFrame instead of a
# slice of the raw data, which avoids pandas' SettingWithCopyWarning.
df = df[has_customer & ~is_cancelled & is_positive_sale].copy()

# 🕒 Convert InvoiceDate to datetime format
# NOTE(review): the format is inferred by pandas; if the day/month order of the
# raw file is known, pass `format=` explicitly to rule out ambiguous parsing.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# 💲 Create a new column for TotalPrice = Quantity × UnitPrice
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# ✅ Save cleaned dataset (optional)
df.to_csv("../data/cleaned_data.csv", index=False)

# ✅ Final shape
print("Cleaned dataset shape:", df.shape)


Original dataset shape: (541909, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


Cleaned dataset shape: (397884, 9)
