In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the raw cafe sales export (path is relative to the working directory).
data = pd.read_csv('dirty_cafe_sales.csv')

# Per-column count of missing values. Printed explicitly: a bare expression
# that is not the last statement of a cell is evaluated but never displayed.
print(data.isna().sum())

# Rows whose 'Payment Method' is missing.
null_payment_rows = data[data['Payment Method'].isna()]
# The original name says "quantity" although the filter is on Payment Method;
# it is kept as an alias so any cell that still refers to it keeps working.
null_quantity_rows = null_payment_rows
print(null_quantity_rows)

     Transaction ID      Item Quantity Price Per Unit Total Spent  \
8       TXN_4717867       NaN        5            3.0        15.0   
9       TXN_2064365  Sandwich        5            4.0        20.0   
13      TXN_9437049    Cookie        5            1.0         5.0   
14      TXN_8915701     ERROR        2            1.5         3.0   
16      TXN_3765707  Sandwich        1            4.0         4.0   
...             ...       ...      ...            ...         ...   
9978    TXN_4302199       Tea        3            1.5         4.5   
9982    TXN_8567525    Cookie        2            1.0         2.0   
9985    TXN_3297457      Cake        2            3.0         6.0   
9994    TXN_7851634   UNKNOWN        4            4.0        16.0   
9995    TXN_7672686    Coffee        2            2.0         4.0   

     Payment Method  Location Transaction Date  
8               NaN  Takeaway       2023-07-28  
9               NaN  In-store       2023-12-31  
13              NaN  Tak

### Data Cleaning - Replace ERROR and UNKNOWN with NaN values


In [3]:
# The strings 'ERROR' and 'UNKNOWN' are swapped for NaN across the whole frame.
placeholder_values = ['ERROR', 'UNKNOWN']
data = data.replace(to_replace=placeholder_values, value=np.nan)

# Preview of the first ten rows after the replacement.
print("Dataset after replacing ERROR and UNKNOWN with NaN:")
preview_rows = data.head(10)
display(preview_rows)

# Column summary (non-null counts and dtypes) to confirm the change.
print("\nData information after cleaning:")
data.info()

# Missing values per column.
print("\nNull values count in each column:")
missing_per_column = data.isna().sum()
print(missing_per_column)

Dataset after replacing ERROR and UNKNOWN with NaN:


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,,,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,2023-03-31
6,TXN_4433211,,3,3.0,9.0,,Takeaway,2023-10-06
7,TXN_6699534,Sandwich,4,4.0,16.0,Cash,,2023-10-28
8,TXN_4717867,,5,3.0,15.0,,Takeaway,2023-07-28
9,TXN_2064365,Sandwich,5,4.0,20.0,,In-store,2023-12-31



Data information after cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    10000 non-null  object
 1   Item              9031 non-null   object
 2   Quantity          9521 non-null   object
 3   Price Per Unit    9467 non-null   object
 4   Total Spent       9498 non-null   object
 5   Payment Method    6822 non-null   object
 6   Location          6039 non-null   object
 7   Transaction Date  9540 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB

Null values count in each column:
Transaction ID         0
Item                 969
Quantity             479
Price Per Unit       533
Total Spent          502
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64


### Examining the Data


In [4]:
# Preview the first 10 rows of `data`.
data.head(10)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,,,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,2023-03-31
6,TXN_4433211,,3,3.0,9.0,,Takeaway,2023-10-06
7,TXN_6699534,Sandwich,4,4.0,16.0,Cash,,2023-10-28
8,TXN_4717867,,5,3.0,15.0,,Takeaway,2023-07-28
9,TXN_2064365,Sandwich,5,4.0,20.0,,In-store,2023-12-31


In [5]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    10000 non-null  object
 1   Item              9031 non-null   object
 2   Quantity          9521 non-null   object
 3   Price Per Unit    9467 non-null   object
 4   Total Spent       9498 non-null   object
 5   Payment Method    6822 non-null   object
 6   Location          6039 non-null   object
 7   Transaction Date  9540 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB


In [6]:
# Print the dtype of every column.
print(data.dtypes)

Transaction ID      object
Item                object
Quantity            object
Price Per Unit      object
Total Spent         object
Payment Method      object
Location            object
Transaction Date    object
dtype: object


In [7]:
# Count the missing values in each column.
data.isna().sum()

Transaction ID         0
Item                 969
Quantity             479
Price Per Unit       533
Total Spent          502
Payment Method      3178
Location            3961
Transaction Date     460
dtype: int64

In [8]:
# Select the rows flagged by DataFrame.duplicated() (default settings).
duplicate_rows = data.loc[data.duplicated()]

# Report the outcome; the duplicate rows are shown only when some exist.
if duplicate_rows.empty:
    print("No duplicate rows found.")
else:
    print("Duplicate rows found:")
    display(duplicate_rows)

No duplicate rows found.


## EDA

### Change data types
Convert Price Per Unit, Total Spent, and Quantity to float.

In [9]:
# Parse the three numeric columns with pd.to_numeric; errors='coerce' turns
# any value that cannot be parsed into NaN instead of raising.
for numeric_col in ('Price Per Unit', 'Total Spent', 'Quantity'):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')

### Handling Missing Values
The data contains many missing values, so we need to clean them up.

We will drop every row whose Item is NULL.

In [10]:
# Discard every row whose 'Item' is NaN.
required_columns = ['Item']
data = data.dropna(subset=required_columns)

# Preview the first ten remaining rows.
print("Dataset after dropping rows where Item is NaN:")
first_rows = data.head(10)
display(first_rows)

Dataset after dropping rows where Item is NaN:


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4.0,1.0,,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2.0,5.0,10.0,,,2023-04-27
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,2023-06-11
5,TXN_2602893,Smoothie,5.0,4.0,20.0,Credit Card,,2023-03-31
7,TXN_6699534,Sandwich,4.0,4.0,16.0,Cash,,2023-10-28
9,TXN_2064365,Sandwich,5.0,4.0,20.0,,In-store,2023-12-31
10,TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,2023-11-07
11,TXN_3051279,Sandwich,2.0,4.0,8.0,Credit Card,Takeaway,


We also impute missing values in the Quantity column by dividing Total Spent by Price Per Unit.

In [11]:
# A missing Quantity is recoverable only when Total Spent is known and
# Price Per Unit is known and strictly positive.
quantity_missing = data['Quantity'].isna()
total_known = data['Total Spent'].notna()
price_usable = data['Price Per Unit'].notna() & (data['Price Per Unit'] > 0)
mask = quantity_missing & total_known & price_usable

# Quantity = Total Spent / Price Per Unit, rounded to 0 decimal places.
recovered_quantity = data.loc[mask, 'Total Spent'].div(data.loc[mask, 'Price Per Unit']).round()
data.loc[mask, 'Quantity'] = recovered_quantity

# How many Quantity values are still missing after the fill.
remaining_nan = data['Quantity'].isna().sum()
print(f"NaN values in Quantity column after filling: {remaining_nan}")

# A few of the rows that were just filled in.
print("\nSample of rows where Quantity was calculated:")
display(data.loc[mask].head())

NaN values in Quantity column after filling: 35

Sample of rows where Quantity was calculated:


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
20,TXN_3522028,Smoothie,5.0,4.0,20.0,Cash,In-store,2023-04-04
55,TXN_5522862,Cookie,2.0,1.0,2.0,Credit Card,Takeaway,2023-03-19
57,TXN_2080895,Cake,1.0,3.0,3.0,Digital Wallet,In-store,2023-04-19
66,TXN_8501819,Juice,2.0,3.0,6.0,Cash,,2023-03-30
117,TXN_2148617,Juice,3.0,3.0,9.0,Digital Wallet,,2023-01-10


We also impute missing values in the Price Per Unit column by dividing Total Spent by Quantity.

In [12]:
# A missing Price Per Unit is recoverable only when Total Spent is known and
# Quantity is known and strictly positive.
price_missing = data['Price Per Unit'].isna()
total_known = data['Total Spent'].notna()
quantity_usable = data['Quantity'].notna() & (data['Quantity'] > 0)
mask2 = price_missing & total_known & quantity_usable

# Price Per Unit = Total Spent / Quantity, rounded to 2 decimal places.
recovered_price = data.loc[mask2, 'Total Spent'].div(data.loc[mask2, 'Quantity']).round(2)
data.loc[mask2, 'Price Per Unit'] = recovered_price

# How many Price Per Unit values are still missing after the fill.
remaining_nan = data['Price Per Unit'].isna().sum()
print(f"NaN values in Price Per Unit column after filling: {remaining_nan}")

# A few of the rows that were just filled in.
print("\nSample of rows where Price Per Unit was calculated:")
display(data.loc[mask2].head())

NaN values in Price Per Unit column after filling: 32

Sample of rows where Price Per Unit was calculated:


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
56,TXN_3578141,Cake,5.0,3.0,15.0,,Takeaway,2023-06-27
68,TXN_8427104,Salad,2.0,5.0,10.0,,In-store,2023-10-27
85,TXN_8035512,Tea,3.0,1.5,4.5,Cash,,2023-10-29
104,TXN_7447872,Juice,2.0,3.0,6.0,,,
140,TXN_2484241,Cake,3.0,3.0,9.0,Digital Wallet,,2023-07-19


We also impute missing values in the Total Spent column by multiplying Price Per Unit by Quantity.

In [13]:
# A missing Total Spent is recoverable when both factors are known.
total_missing = data['Total Spent'].isna()
factors_known = data['Price Per Unit'].notna() & data['Quantity'].notna()
mask3 = total_missing & factors_known

# Total Spent = Price Per Unit * Quantity, rounded to 2 decimal places.
recovered_total = data.loc[mask3, 'Price Per Unit'].mul(data.loc[mask3, 'Quantity']).round(2)
data.loc[mask3, 'Total Spent'] = recovered_total

# How many Total Spent values are still missing after the fill.
remaining_nan = data['Total Spent'].isna().sum()
print(f"NaN values in Total Spent column after filling: {remaining_nan}")

# A few of the rows that were just filled in.
print("\nSample of rows where Total Spent was calculated:")
display(data.loc[mask3].head())

NaN values in Total Spent column after filling: 37

Sample of rows where Total Spent was calculated:


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
2,TXN_4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,2023-07-19
25,TXN_7958992,Smoothie,3.0,4.0,12.0,,,2023-12-13
42,TXN_6650263,Tea,2.0,1.5,3.0,,Takeaway,2023-01-10
94,TXN_6289610,Juice,3.0,3.0,9.0,Cash,Takeaway,2023-08-07
143,TXN_8495063,Juice,1.0,3.0,3.0,Cash,,2023-05-31


Check for rows that still have NaN values: a row with two or more NaN values among Quantity, Price Per Unit, and Total Spent could not be filled in by the steps above.

In [14]:
# Flag every row that still has a NaN in at least one of the three numeric columns.
numeric_columns = ['Quantity', 'Price Per Unit', 'Total Spent']
has_missing_numeric = data[numeric_columns].isna().any(axis=1)
nan_rows = data.loc[has_missing_numeric]

# Show the affected rows.
print("Rows with NaN in Quantity, Price Per Unit, or Total Spent:")
display(nan_rows)

Rows with NaN in Quantity, Price Per Unit, or Total Spent:


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
65,TXN_4987129,Sandwich,3.0,,,,In-store,2023-10-20
236,TXN_8562645,Salad,,5.0,,,In-store,2023-05-18
278,TXN_3229409,Juice,,3.0,,Cash,Takeaway,2023-04-15
629,TXN_9289174,Cake,,,12.0,Digital Wallet,In-store,2023-12-30
641,TXN_2962976,Juice,,3.0,,,,2023-03-17
738,TXN_8696094,Sandwich,,4.0,,,Takeaway,2023-05-14
912,TXN_1575608,Sandwich,,,20.0,,Takeaway,2023-01-05
1008,TXN_7225428,Tea,,,3.0,Credit Card,Takeaway,2023-03-07
1436,TXN_7590801,Tea,,,6.0,Cash,Takeaway,
1482,TXN_3593060,Smoothie,,,16.0,Cash,,2023-03-05


We drop every row that has two or more NaN values among Quantity, Price Per Unit, and Total Spent.

In [15]:
# Per-row number of NaN values across the three numeric columns.
numeric_columns = ['Quantity', 'Price Per Unit', 'Total Spent']
nan_count = data[numeric_columns].isna().sum(axis=1)

# Boolean flag: True for rows with at least two of the three values missing.
rows_to_drop = nan_count >= 2

# Keep only the rows that are not flagged.
data_cleaned = data.loc[~rows_to_drop]

# Before/after shapes and the number of rows removed.
print(f"Original data shape: {data.shape}")
print(f"Cleaned data shape: {data_cleaned.shape}")
print(f"Number of rows dropped: {rows_to_drop.sum()}")


# The filtered frame becomes the working dataset from here on.
data = data_cleaned

Original data shape: (9031, 8)
Cleaned data shape: (8979, 8)
Number of rows dropped: 52


We check the remaining missing-value counts again.

In [17]:
# Count the missing values in each column after the numeric clean-up.
data.isnull().sum()

Transaction ID         0
Item                   0
Quantity               0
Price Per Unit         0
Total Spent            0
Payment Method      2860
Location            3565
Transaction Date     415
dtype: int64

Since the 'Payment Method' and 'Location' columns contain NULL values and are not considered essential for data processing, they will be marked as NULL or excluded from further analysis.

We will remove the rows whose Transaction Date is NULL.

In [20]:
# Drop rows where Transaction Date is null
data = data.dropna(subset=['Transaction Date'])

# Display the count after dropping
print(f"Number of rows after dropping rows with null Transaction Date: {len(data)}")

# Print a summary of the cleaned data (per-column non-null counts and dtypes)
data.info()

Number of rows after dropping rows with null Transaction Date: 8564
<class 'pandas.core.frame.DataFrame'>
Index: 8564 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction ID    8564 non-null   object 
 1   Item              8564 non-null   object 
 2   Quantity          8564 non-null   float64
 3   Price Per Unit    8564 non-null   float64
 4   Total Spent       8564 non-null   float64
 5   Payment Method    5851 non-null   object 
 6   Location          5157 non-null   object 
 7   Transaction Date  8564 non-null   object 
dtypes: float64(3), object(5)
memory usage: 602.2+ KB
