In [45]:
import os
import pandas as pd
import numpy as np

# Detect the Kaggle runtime by the presence of its input mount point
IS_KAGGLE = os.path.exists("/kaggle/input")

# Pick the CSV location that matches the current environment
dataset_path = (
    "/kaggle/input/cafe-sales-dirty-data-for-cleaning-training/dirty_cafe_sales.csv"
    if IS_KAGGLE
    else "datasets/dirty_cafe_sales.csv"
)

# Read the raw CSV into a DataFrame
df = pd.read_csv(dataset_path)

# Preview the first rows of the raw data (no cleaning has been applied yet)
display(df.head())

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [46]:
# Summary statistics for the freshly loaded frame (count / unique / top / freq
# are what pandas reports for non-numeric columns)
df.describe()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
count,10000,9667,9862,9821.0,9827.0,7421,6735,9841
unique,10000,10,7,8.0,19.0,5,4,367
top,TXN_1961373,Juice,5,3.0,6.0,Digital Wallet,Takeaway,UNKNOWN
freq,1,1171,2013,2429.0,979.0,2291,3022,159


In [47]:
# Replace every case variation of 'unknown' and 'error' with NaN (null), without regex
def to_null(x):
    """Return NaN when ``x`` spells 'unknown' or 'error' in any letter case.

    Any other value, including existing NaN, is returned unchanged.
    """
    return np.nan if str(x).lower() in ("unknown", "error") else x


# DataFrame.applymap is deprecated since pandas 2.1 (it emits a FutureWarning) and
# was renamed to DataFrame.map. Use the new name when the installed pandas has it
# and fall back to applymap on older versions so the cell runs either way.
df = df.map(to_null) if hasattr(pd.DataFrame, "map") else df.applymap(to_null)
display(df.head())

  df = df.applymap(to_null)


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,,,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [48]:
# Inspect each column's dtype after the placeholder strings were replaced with NaN
display(df.dtypes)

Transaction ID      object
Item                object
Quantity            object
Price Per Unit      object
Total Spent         object
Payment Method      object
Location            object
Transaction Date    object
dtype: object

In [49]:
# Per-column count of missing values, followed by the column index
display(df.isna().sum(), df.columns)

Transaction ID         0
Item                 969
Quantity             479
Price Per Unit       533
Total Spent          502
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date'],
      dtype='object')

In [50]:
# Impute missing values with each column's most frequent value.
# The mode is computed once per column (the inner generator) and reused for both
# the emptiness check and the fill value.
modes = {
    col: col_mode.iloc[0]  # first entry when several values tie for most frequent
    for col, col_mode in ((name, df[name].mode()) for name in df.columns)
    if not col_mode.empty  # a column with no non-null values has no mode; skip it
}

# Rebind rather than mutate in place, matching how `df` is reassigned earlier.
df = df.fillna(value=modes)

In [51]:
# Re-count missing values per column now that the mode imputation has run
print("\nMissing values per column:")
display(df.isna().sum())


Missing values per column:


Transaction ID      0
Item                0
Quantity            0
Price Per Unit      0
Total Spent         0
Payment Method      0
Location            0
Transaction Date    0
dtype: int64

In [52]:
# Inspect the column dtypes before any explicit type conversion is applied
df.dtypes

Transaction ID      object
Item                object
Quantity            object
Price Per Unit      object
Total Spent         object
Payment Method      object
Location            object
Transaction Date    object
dtype: object

In [55]:
# Column groups, named for the dtype each group is converted to
num_cols = ["Quantity", "Price Per Unit", "Total Spent"]
date_cols = ["Transaction Date"]
cat_cols = ["Transaction ID", "Item", "Payment Method", "Location"]

# Text -> int/float; any value that fails to parse is coerced to NaN
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

# Text -> datetime; any value that fails to parse is coerced to NaT
df[date_cols] = df[date_cols].apply(pd.to_datetime, errors="coerce")

# Text -> category (each column gets its own set of categories)
# NOTE(review): the earlier describe() output reports 10000 unique Transaction IDs
# in 10000 rows, so a category dtype buys nothing for that column - consider
# leaving it as a plain string column.
df[cat_cols] = df[cat_cols].astype("category")

In [56]:
# Check the dtypes after the numeric / datetime / category conversions
df.dtypes

Transaction ID            category
Item                      category
Quantity                     int64
Price Per Unit             float64
Total Spent                float64
Payment Method            category
Location                  category
Transaction Date    datetime64[ns]
dtype: object