In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pt

In [2]:
df = pd.read_csv("dirty_cafe_sales.csv")

In [3]:
df.columns = df.columns.str.lower().str.replace(' ','_').str.strip()
df.columns.to_list()

['transaction_id',
 'item',
 'quantity',
 'price_per_unit',
 'total_spent',
 'payment_method',
 'location',
 'transaction_date']

In [4]:
df.describe()

Unnamed: 0,transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
count,10000,9667,9862,9821.0,9827.0,7421,6735,9841
unique,10000,10,7,8.0,19.0,5,4,367
top,TXN_1961373,Juice,5,3.0,6.0,Digital Wallet,Takeaway,UNKNOWN
freq,1,1171,2013,2429.0,979.0,2291,3022,159


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   transaction_id    10000 non-null  object
 1   item              9667 non-null   object
 2   quantity          9862 non-null   object
 3   price_per_unit    9821 non-null   object
 4   total_spent       9827 non-null   object
 5   payment_method    7421 non-null   object
 6   location          6735 non-null   object
 7   transaction_date  9841 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB


In [6]:
df.isna().sum()

transaction_id         0
item                 333
quantity             138
price_per_unit       179
total_spent          173
payment_method      2579
location            3265
transaction_date     159
dtype: int64

In [7]:
df.drop_duplicates()

Unnamed: 0,transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
...,...,...,...,...,...,...,...,...
9995,TXN_7672686,Coffee,2,2.0,4.0,,UNKNOWN,2023-08-30
9996,TXN_9659401,,3,,3.0,Digital Wallet,,2023-06-02
9997,TXN_5255387,Coffee,4,2.0,8.0,Digital Wallet,,2023-03-02
9998,TXN_7695629,Cookie,3,,3.0,Digital Wallet,,2023-12-02


In [8]:
df.drop_duplicates(inplace=True)

In [9]:
mask = df.isin(["UNKNOWN", "ERROR"])
error_counts = mask.sum()
error_counts

transaction_id        0
item                636
quantity            341
price_per_unit      354
total_spent         329
payment_method      599
location            696
transaction_date    301
dtype: int64

In [10]:
# Convert columns to numeric (they might be stored as strings)
df['quantity'] = pd.to_numeric(df['quantity'], errors='coerce')
df['price_per_unit'] = pd.to_numeric(df['price_per_unit'], errors='coerce')
df['total_spent'] = pd.to_numeric(df['total_spent'], errors='coerce')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   transaction_id    10000 non-null  object 
 1   item              9667 non-null   object 
 2   quantity          9521 non-null   float64
 3   price_per_unit    9467 non-null   float64
 4   total_spent       9498 non-null   float64
 5   payment_method    7421 non-null   object 
 6   location          6735 non-null   object 
 7   transaction_date  9841 non-null   object 
dtypes: float64(3), object(5)
memory usage: 625.1+ KB


In [12]:
df.replace(['ERROR', 'UNKNOWN'], np.nan, inplace=True)
error_counts

transaction_id        0
item                636
quantity            341
price_per_unit      354
total_spent         329
payment_method      599
location            696
transaction_date    301
dtype: int64

In [13]:
df.head()

Unnamed: 0,transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4.0,1.0,,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2.0,5.0,10.0,,,2023-04-27
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [14]:
df.isna().sum()

transaction_id         0
item                 969
quantity             479
price_per_unit       533
total_spent          502
payment_method      3178
location            3961
transaction_date     460
dtype: int64

In [15]:
df['quantity'] = df['quantity'].fillna(df['total_spent'] / df['price_per_unit'])
df.isna().sum()

transaction_id         0
item                 969
quantity              38
price_per_unit       533
total_spent          502
payment_method      3178
location            3961
transaction_date     460
dtype: int64

In [16]:
df['price_per_unit'] = df['price_per_unit'].fillna(df['total_spent'] / df['quantity'])
df['total_spent'] = df['total_spent'].fillna(df['price_per_unit'] * df['quantity'])
df.isna().sum()

transaction_id         0
item                 969
quantity              38
price_per_unit        38
total_spent           40
payment_method      3178
location            3961
transaction_date     460
dtype: int64

In [17]:
price_to_item = {
    1.0: 'Cookie',
    1.5: 'Tea',
    2.0: 'Coffee',
    3.0: 'Juice',      # Choose Cake or Juice
    4.0: 'Smoothie',  # Choose Sandwich or Smoothie
    5.0: 'Salad'
}

# Fill items
df['item'] = df['item'].replace(['UNKNOWN', 'ERROR'], np.nan)
df['item'] = df['item'].fillna(df['price_per_unit'].map(price_to_item))

In [18]:
df.isna().sum()

transaction_id         0
item                   6
quantity              38
price_per_unit        38
total_spent           40
payment_method      3178
location            3961
transaction_date     460
dtype: int64

In [19]:
df[df['item'].isnull()]

Unnamed: 0,transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
1761,TXN_3611851,,4.0,,,Credit Card,,2023-02-09
2289,TXN_7524977,,4.0,,,,,2023-12-09
3779,TXN_7376255,,,,25.0,,In-store,2023-05-27
4152,TXN_9646000,,2.0,,,,In-store,2023-12-14
7597,TXN_1082717,,,,9.0,Digital Wallet,In-store,2023-12-13
9819,TXN_1208561,,,,20.0,Credit Card,,2023-08-19


In [20]:
df.dropna(subset=['item'], inplace=True)

In [21]:
df.isna().sum()

transaction_id         0
item                   0
quantity              35
price_per_unit        32
total_spent           37
payment_method      3175
location            3958
transaction_date     460
dtype: int64

In [22]:
menu_prices = {
    'Coffee': 2.0,
    'Tea': 1.5,
    'Sandwich': 4.0,
    'Salad': 5.0,
    'Cake': 3.0,
    'Cookie': 1.0,
    'Smoothie': 4.0,
    'Juice': 3.0
}

df['price_per_unit'] = df['price_per_unit'].fillna(df['item'].map(menu_prices))

In [23]:
df.isna().sum()

transaction_id         0
item                   0
quantity              35
price_per_unit         0
total_spent           37
payment_method      3175
location            3958
transaction_date     460
dtype: int64

In [24]:
df[df['quantity'].isnull()].head(10)

Unnamed: 0,transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
236,TXN_8562645,Salad,,5.0,,,In-store,2023-05-18
278,TXN_3229409,Juice,,3.0,,Cash,Takeaway,2023-04-15
629,TXN_9289174,Cake,,3.0,12.0,Digital Wallet,In-store,2023-12-30
641,TXN_2962976,Juice,,3.0,,,,2023-03-17
738,TXN_8696094,Sandwich,,4.0,,,Takeaway,2023-05-14
912,TXN_1575608,Sandwich,,4.0,20.0,,Takeaway,2023-01-05
1008,TXN_7225428,Tea,,1.5,3.0,Credit Card,Takeaway,2023-03-07
1436,TXN_7590801,Tea,,1.5,6.0,Cash,Takeaway,
1482,TXN_3593060,Smoothie,,4.0,16.0,Cash,,2023-03-05
2330,TXN_3849488,Salad,,5.0,5.0,,In-store,2023-03-01


In [25]:
(df['quantity'].isnull() & df['total_spent'].isnull()).sum()

np.int64(20)

In [26]:
df.drop(
    df[df['quantity'].isnull() & df['total_spent'].isnull()].index,
    inplace=True
)

In [27]:
df.isna().sum()

transaction_id         0
item                   0
quantity              15
price_per_unit         0
total_spent           17
payment_method      3168
location            3952
transaction_date     460
dtype: int64

In [28]:
df['quantity'] = df['quantity'].fillna(df['total_spent'] / df['price_per_unit'])
df['total_spent'] = df['total_spent'].fillna(df['price_per_unit'] * df['quantity'])
df.isna().sum()

transaction_id         0
item                   0
quantity               0
price_per_unit         0
total_spent            0
payment_method      3168
location            3952
transaction_date     460
dtype: int64

In [29]:
# Step 1: Convert to datetime
df['transaction_date'] = pd.to_datetime(df['transaction_date'], errors='coerce')

# Step 2: Fill with forward fill, then backward fill
df['transaction_date'] = df['transaction_date'].ffill().bfill()

# Step 3: Verify no missing dates remain
print(f"Remaining missing dates: {df['transaction_date'].isna().sum()}")

Remaining missing dates: 0


In [30]:
df['payment_method'].value_counts(dropna=False)

payment_method
NaN               3168
Digital Wallet    2284
Credit Card       2268
Cash              2254
Name: count, dtype: int64

In [31]:
# Fill with the most common payment method (mode)
most_common_payment = df['payment_method'].mode()[0]
df['payment_method'] = df['payment_method'].fillna(most_common_payment)

df['payment_method'].value_counts()

payment_method
Digital Wallet    5452
Credit Card       2268
Cash              2254
Name: count, dtype: int64

In [32]:
# Fill with the most common location (mode)
most_common_location = df['location'].mode()[0]
df['location'] = df['location'].fillna(most_common_location)

In [33]:
df['location'].value_counts(dropna=False)

location
Takeaway    6968
In-store    3006
Name: count, dtype: int64

In [34]:
df.isna().sum()

transaction_id      0
item                0
quantity            0
price_per_unit      0
total_spent         0
payment_method      0
location            0
transaction_date    0
dtype: int64

In [35]:
df.to_csv('../clean_cafe_sales.csv', index=False)