# Clean Data Checker

In [115]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [116]:
# Read in the dataset from a public repository (raw CSV file hosted on GitHub)
# NOTE(review): the URL points at the `main` branch, so the file contents may
# change between runs — confirm whether a pinned commit is wanted
url = "https://raw.githubusercontent.com/Natasa127/CSI4142-A2/main/dirty_cafe_sales.csv"
# Load the CSV into a DataFrame; later cells read from `sales`
sales = pd.read_csv(url)
# Preview the first rows as the cell output
sales.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [117]:
# Summarise the raw data: column names, non-null counts and dtypes
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    10000 non-null  object
 1   Item              9667 non-null   object
 2   Quantity          9862 non-null   object
 3   Price Per Unit    9821 non-null   object
 4   Total Spent       9827 non-null   object
 5   Payment Method    7421 non-null   object
 6   Location          6735 non-null   object
 7   Transaction Date  9841 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB


### 1) Data Type errors

This test checks the data type of an attribute whose entries should be integers.

References:

Converting to numeric: https://pandas.pydata.org/docs/reference/api/pandas.to_numeric.html

Setting the type: https://www.geeksforgeeks.org/python-pandas-dataframe-astype/

Selecting rows in one dataframe but not in another: https://discovery.cs.illinois.edu/guides/DataFrame-Row-Selection/dataframe-isin-selection/

In [None]:
# Parameters to be edited by the user
# NOTE(review): `attributes` is not read by the type check itself — presumably
# the list of columns this test may be applied to; confirm
attributes = ['Quantity', 'Price Per Unit', 'Total Spent']

# Column validated as an integer, and the datatype label passed to type_filter
test_attribute_int = 'Quantity'
datatype_int = 'int'
# Column/datatype pair intended for a float check
# NOTE(review): no visible cell uses these two values — confirm they are still needed
test_attribute_float = 'Total Spent'
datatype_float = 'float'


In [None]:
# Error check
def type_filter(df, col, datatype):
    """Return the rows of ``df`` whose ``col`` entry is a valid ``datatype`` value.

    The column is converted with ``pd.to_numeric(..., errors='coerce')``, so
    entries that cannot be parsed as a number (for example ``'ERROR'`` or
    ``'UNKNOWN'``) and missing entries are treated as invalid and dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is never modified; a new DataFrame is returned.
    col : str
        Name of the column to validate.
    datatype : str or type
        Requested type of the column. Any integer dtype specifier
        (``'int'``, ``int``, ``'int64'``, ``'Int64'``, ...) additionally
        rejects values with a fractional part and casts the column to that
        dtype. Any float dtype specifier (``'float'``, ``float``, ...) casts
        the numeric values to that dtype. Any other value only applies the
        numeric conversion.

    Returns
    -------
    pandas.DataFrame
        The valid rows, with their original index labels and with ``col``
        converted to a numeric dtype.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    """
    # Non-numeric and missing entries become NaN
    numeric = pd.to_numeric(df[col], errors='coerce')
    keep = numeric.notna()

    # The literal 'int' is checked explicitly so the historical call keeps
    # working; the dtype test extends the check to every other integer spelling
    wants_integer = datatype == 'int' or pd.api.types.is_integer_dtype(datatype)
    if wants_integer:
        # NaN % 1 is NaN, so this comparison is False for rows already rejected
        keep &= numeric % 1 == 0

    # Copy only the surviving rows so the original dataset is not modified
    df_filtered = df.loc[keep].copy()
    valid_values = numeric[keep]
    if wants_integer or pd.api.types.is_float_dtype(datatype):
        # Honour the requested dtype (to_numeric alone may yield int64 for a
        # float request, or float64 for an integer request)
        valid_values = valid_values.astype(datatype)
    df_filtered[col] = valid_values

    # Returns the filtered dataset
    return df_filtered

# Keep only the rows whose integer test attribute passes the type check
checked_sales = type_filter(sales, test_attribute_int, datatype_int)
# Show the remaining row count and the dtype of the converted column
checked_sales.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9521 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    9521 non-null   object
 1   Item              9206 non-null   object
 2   Quantity          9521 non-null   int32 
 3   Price Per Unit    9349 non-null   object
 4   Total Spent       9353 non-null   object
 5   Payment Method    7074 non-null   object
 6   Location          6412 non-null   object
 7   Transaction Date  9371 non-null   object
dtypes: int32(1), object(7)
memory usage: 632.3+ KB


In [120]:
# Rows rejected by the type check: remove every index label that survived the
# filter, which leaves the entries with an invalid datatype in the given column
invalid_type = sales.drop(index=checked_sales.index)
invalid_type

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
20,TXN_3522028,Smoothie,ERROR,4.0,20.0,Cash,In-store,2023-04-04
55,TXN_5522862,Cookie,ERROR,1.0,2.0,Credit Card,Takeaway,2023-03-19
57,TXN_2080895,Cake,UNKNOWN,3.0,3.0,Digital Wallet,In-store,2023-04-19
66,TXN_8501819,Juice,,3.0,6.0,Cash,,2023-03-30
117,TXN_2148617,Juice,ERROR,3.0,9.0,Digital Wallet,UNKNOWN,2023-01-10
...,...,...,...,...,...,...,...,...
9932,TXN_8502079,Tea,UNKNOWN,1.5,3.0,Cash,,2023-04-20
9935,TXN_9778251,Tea,ERROR,1.5,6.0,,Takeaway,2023-11-09
9944,TXN_7495283,Cake,UNKNOWN,3.0,15.0,Credit Card,Takeaway,2023-04-14
9957,TXN_6487003,Coffee,ERROR,2.0,8.0,Credit Card,Takeaway,2023-11-15


Results:

There are 479 rows with a quantity that is not an integer. This occurs when the value is missing, or when it has been replaced by a placeholder string such as 'UNKNOWN' or 'ERROR'. For example, see the two rows below:

<u>Index / Transaction ID / Item / Quantity / Price Per Unit / Total Spent / Payment Method / Location / Transaction Date</u>

20 / TXN_3522028	/ Smoothie	/ ERROR	/ 4.0	/ 20.0	/ Cash	/ In-store	/ 2023-04-04

55	/ TXN_5522862	/ Cookie	/ ERROR	/ 1.0	/ 2.0	/ Credit Card	/ Takeaway	/ 2023-03-19

### 2) Range errors

This test checks the range of a numerical variable, i.e. whether each value lies between the minimum and maximum acceptable values (both bounds included).

In [121]:
# Parameters to be edited by the user
# NOTE(review): `attributes` is not read by the range check — presumably the
# list of columns this test may be applied to; confirm
attributes = ['Quantity', 'Price Per Unit', 'Total Spent']

# Column whose values are range-checked
test_attribute = 'Quantity'

# Smallest acceptable value (values equal to it are accepted)
minimum = 1

# Largest acceptable value (values equal to it are accepted)
maximum = 5

In [122]:
# Error check

# Coerce the tested column to numeric first: only the column cleaned by the type
# check is numeric in checked_sales, the others are still stored as strings and
# comparing those with numbers raises a TypeError. Entries that cannot be parsed
# become NaN and are not reported here (they are type errors, not range errors).
range_values = pd.to_numeric(checked_sales[test_attribute], errors='coerce')

# Extract values that are either above the maximum acceptable value or below the minimum acceptable value
invalid_range = checked_sales[(range_values > maximum) | (range_values < minimum)]
invalid_range

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


Results:

There are no values for quantity that are outside of the acceptable range.

**TODO:** consider injecting some invalid entries (e.g. a quantity of -1) so that this check can be demonstrated on data that fails it.

### 3) Format errors

This test checks that dates are stored in the correct format, i.e. YYYY-MM-DD.

In [123]:
# Parameters to be edited by the user
# Column(s) holding the dates — presumably consumed by the format check; confirm
attributes = ['Transaction Date']

In [None]:
# Error Check

def format_filter(df, col):
    df_filtered = df.copy()

    