# Cafe Sales Analytics Projects

## Objectives
Clean the raw Cafe Sales dataset.

### SETUP for Cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw CSV into `df`
data_path = "../data/raw/dirty_cafe_sales.csv"
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print("ERROR : File Not Found")
    # Re-raise: without `df`, every later cell would fail with a confusing NameError
    raise
    

In [3]:
# List the column labels of the raw frame
raw_columns = df.columns
raw_columns

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date'],
      dtype='object')

In [4]:
# Work on a copy so the raw frame `df` is left untouched
clean_df = df.copy()
# Lift the row display limit. Use the full option name: pandas resolves partial
# names by pattern matching, which can break if a similarly named option is added.
pd.set_option("display.max_rows", None)
# Check all unique values in the Item column
clean_df["Item"].unique()

array(['Coffee', 'Cake', 'Cookie', 'Salad', 'Smoothie', 'UNKNOWN',
       'Sandwich', nan, 'ERROR', 'Juice', 'Tea'], dtype=object)

### Clean 'Item' column

In [5]:
# Cast Item to the pandas string dtype, trim surrounding whitespace and
# Title-case it (the original author noted this is not strictly needed here)
item_text = clean_df["Item"].astype('string')
clean_df["Item"] = item_text.str.strip().str.title()
# Check the result
clean_df["Item"].unique()

<StringArray>
[  'Coffee',     'Cake',   'Cookie',    'Salad', 'Smoothie',  'Unknown',
 'Sandwich',       <NA>,    'Error',    'Juice',      'Tea']
Length: 11, dtype: string

In [6]:
# Look up the Item column's dtype in the frame-level dtype table
clean_df.dtypes["Item"]

string[python]

In [7]:
# Placeholder tokens treated as missing data, in raw and Title-cased spellings.
# `prob` is reused by the cleaning cells for the other columns.
prob = [
    'UNKNOWN', 'ERROR', 'nan',
    'Unknown', 'Nan', 'Error', '<Na>',
]
# Swap every placeholder in Item for a proper missing value
item_col = clean_df["Item"]
clean_df["Item"] = item_col.replace(to_replace=prob, value=pd.NA)
# Check the result
clean_df["Item"].unique()

<StringArray>
[  'Coffee',     'Cake',   'Cookie',    'Salad', 'Smoothie',       <NA>,
 'Sandwich',    'Juice',      'Tea']
Length: 9, dtype: string

### Clean 'Quantity' column

In [8]:
# Inspect the distinct raw values in Quantity
quantity_col = clean_df['Quantity']
quantity_col.unique()

array(['2', '4', '5', '3', '1', 'ERROR', 'UNKNOWN', nan], dtype=object)

In [9]:
# Replace the placeholder tokens in Quantity with a missing value
quantity_raw = clean_df['Quantity'].replace(prob, np.nan)
# Parse the text to numbers the same way as the other numeric columns
# (anything unparseable becomes missing), then store as nullable Int32 so
# missing entries survive without forcing the column to float
clean_df['Quantity'] = pd.to_numeric(quantity_raw, errors="coerce").astype('Int32')
# Verify the changes
clean_df['Quantity'].unique()

<IntegerArray>
[2, 4, 5, 3, 1, <NA>]
Length: 6, dtype: Int32

### Clean 'Price Per Unit' Column

In [10]:
# Blank out the placeholder tokens in Price Per Unit
price_raw = clean_df['Price Per Unit'].replace(to_replace=prob, value=np.nan)
# Convert to numeric; anything still unparseable becomes NaN
clean_df['Price Per Unit'] = pd.to_numeric(price_raw, errors="coerce")
# Verify the resulting dtype
clean_df['Price Per Unit'].dtype

dtype('float64')

In [11]:
# Confirm only numeric prices (and NaN) remain
price_col = clean_df['Price Per Unit']
price_col.unique()

array([2. , 3. , 1. , 5. , 4. , 1.5, nan])

### Clean 'Total Spent' Column

In [12]:
# Inspect the distinct raw values in Total Spent
total_col = clean_df['Total Spent']
total_col.unique()

array(['4.0', '12.0', 'ERROR', '10.0', '20.0', '9.0', '16.0', '15.0',
       '25.0', '8.0', '5.0', '3.0', '6.0', nan, 'UNKNOWN', '2.0', '1.0',
       '7.5', '4.5', '1.5'], dtype=object)

In [13]:
# Blank out the placeholder tokens in Total Spent
total_raw = clean_df['Total Spent'].replace(to_replace=prob, value=np.nan)
# Convert to float while keeping missing entries as NaN
clean_df['Total Spent'] = pd.to_numeric(total_raw, errors="coerce")
# Check the resulting dtype
clean_df['Total Spent'].dtype

dtype('float64')

In [14]:
# Re-inspect Total Spent after the conversion
total_clean = clean_df['Total Spent']
total_clean.unique()

array([ 4. , 12. ,  nan, 10. , 20. ,  9. , 16. , 15. , 25. ,  8. ,  5. ,
        3. ,  6. ,  2. ,  1. ,  7.5,  4.5,  1.5])

In [15]:
# Re-list the column labels of the raw frame for quick reference
column_labels = df.columns
column_labels

Index(['Transaction ID', 'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
       'Payment Method', 'Location', 'Transaction Date'],
      dtype='object')

### Clean 'Payment Method' Column

In [16]:
# Inspect the distinct raw values in Payment Method
payment_col = clean_df['Payment Method']
payment_col.unique()

array(['Credit Card', 'Cash', 'UNKNOWN', 'Digital Wallet', 'ERROR', nan],
      dtype=object)

In [17]:
# Cast Payment Method to the pandas string dtype, trim surrounding whitespace and
# Title-case it (the original author noted this is not strictly needed here)
payment_text = clean_df['Payment Method'].astype("string")
clean_df['Payment Method'] = payment_text.str.strip().str.title()

In [18]:
# Swap the placeholder tokens in Payment Method for a proper missing value
payment_col = clean_df['Payment Method']
clean_df['Payment Method'] = payment_col.replace(to_replace=prob, value=pd.NA)
# Check the remaining values and the dtype
clean_df['Payment Method'].unique()

<StringArray>
['Credit Card', 'Cash', <NA>, 'Digital Wallet']
Length: 4, dtype: string

### Clean 'Location' Column

In [19]:
# Inspect the distinct raw values in Location
location_col = clean_df['Location']
location_col.unique()

array(['Takeaway', 'In-store', 'UNKNOWN', nan, 'ERROR'], dtype=object)

In [20]:
# Cast Location to the pandas string dtype, then swap the placeholder tokens
# for a proper missing value
location_text = clean_df['Location'].astype("string")
clean_df['Location'] = location_text.replace(to_replace=prob, value=pd.NA)

In [21]:
# Confirm only valid locations (and missing values) remain
location_clean = clean_df['Location']
location_clean.unique()

<StringArray>
['Takeaway', 'In-store', <NA>]
Length: 3, dtype: string

### Clean 'Transaction Date' Column

In [22]:
# Swap the placeholder tokens in Transaction Date for a proper missing value
date_raw = clean_df['Transaction Date']
clean_df['Transaction Date'] = date_raw.replace(to_replace=prob, value=pd.NA)

In [23]:
# Convert to datetime. errors="coerce" turns any value that cannot be parsed
# into NaT instead of aborting the cell, matching the errors="coerce" handling
# used for the numeric columns.
clean_df['Transaction Date'] = pd.to_datetime(clean_df['Transaction Date'], errors="coerce")
# Check the resulting dtype
print(clean_df['Transaction Date'].dtype)

datetime64[ns]


### SAVE the Cleaned dataset to a new CSV

In [24]:
# Write the cleaned frame (missing values left unfilled) to CSV without the index column
output_path = "../data/processed/cleaned_cafe_sales_without_filling.csv"
clean_df.to_csv(output_path, index=False)