In [23]:
# Imports and Variables used for testing
import pandas as pd                                                                                 # This creates the tables
import os                                                                                           # For OS related paths 

csv = "chesterfield_25-08-2021_09-00-00.csv"                                                        # Input CSV file name (a relative path); passed to every reader function below
df = ""                                                                                             # Placeholder; extract_sanitise_csv() rebinds this global to a DataFrame, and the save helpers read it
count = ""                                                                                          # Placeholder; the two count_* functions rebind this global to their result
column = "payment_type"                                                                             # Column name passed to the filter/count functions in the test cells
value = "CARD"                                                                                      # Value looked up in `column` by the filter/count test cells
columns = ['date_time', 'Location', 'full_name', 'order', 'amount', 'payment_type', 'card_number']  # Header names applied when the CSV is read (it is read with header=None)
sanitise_these_columns = ['full_name', 'card_number']                                               # Columns that extract_sanitise_csv() drops from the frame

In [24]:
# Remove given Columns
def extract_sanitise_csv(csv, sanitise_these_columns):
    """Load a headerless CSV and drop the columns that must not be kept.

    The file is read with the notebook-level ``columns`` list as header
    names, then every column listed in ``sanitise_these_columns`` is removed.
    On success the result is also published in the notebook-level ``df`` so
    that later cells (for example the save helpers) can use it.

    Args:
        csv: Path of the CSV file to read.
        sanitise_these_columns: Names of the columns to drop.

    Returns:
        The sanitised DataFrame, or ``None`` when the file does not exist
        (a message is printed and the global ``df`` is left untouched).

    Raises:
        KeyError: If a column to drop is not present (from ``DataFrame.drop``).
    """
    global df  # published so other cells can display / save the result

    try:
        raw = pd.read_csv(csv, header=None, names=columns)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
        # Never hand back whatever an earlier run left in ``df``.
        return None

    # Bind the global only after the drop succeeded, so an unsanitised frame
    # can never end up in ``df`` if ``drop`` raises.
    df = raw.drop(columns=sanitise_these_columns)
    return df

In [25]:
# Exercise extract_sanitise_csv, then preview the first five rows of the global frame
extract_sanitise_csv(csv, sanitise_these_columns)
df.head(n=5)

Unnamed: 0,date_time,Location,order,amount,payment_type
0,25/08/2021 09:00,Chesterfield,Regular Flavoured iced latte - Hazelnut - 2.75...,5.2,CARD
1,25/08/2021 09:02,Chesterfield,"Large Flavoured iced latte - Caramel - 3.25, R...",17.3,CARD
2,25/08/2021 09:04,Chesterfield,"Large Flat white - 2.45, Regular Latte - 2.15",4.6,CARD
3,25/08/2021 09:06,Chesterfield,"Regular Flavoured latte - Hazelnut - 2.55, Lar...",5.0,CARD
4,25/08/2021 09:08,Chesterfield,"Regular Latte - 2.15, Large Latte - 2.45",4.6,CASH


In [26]:
# Returns the row in which a column contains a given value
def filter_by_column_value(csv, column, value):
    """Return the rows whose ``column`` text contains ``value``.

    The CSV is read without a header row, using the notebook-level
    ``columns`` list as header names. On success the result is also published
    in the notebook-level ``contain_values``.

    Args:
        csv: Path of the CSV file to read.
        column: Name of the (text) column to search.
        value: Pattern to look for; ``Series.str.contains`` treats it as a
            regular expression by default.

    Returns:
        A DataFrame with the matching rows, or ``None`` when the file does
        not exist (a message is printed and ``contain_values`` is untouched).
    """
    global contain_values  # published so other cells can display the result

    try:
        frame = pd.read_csv(csv, header=None, names=columns)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
        # Previously this path returned a stale result or raised NameError.
        return None

    # na=False: a missing cell counts as "no match", so the mask is always
    # strictly boolean and can be used for indexing.
    matches = frame[column].str.contains(value, na=False)
    contain_values = frame[matches]
    return contain_values

In [27]:
# Exercise filter_by_column_value, then preview the first ten rows of the global result
filter_by_column_value(csv, column, value)
contain_values.head(n=10)

Unnamed: 0,date_time,Location,full_name,order,amount,payment_type,card_number
0,25/08/2021 09:00,Chesterfield,Richard Copeland,Regular Flavoured iced latte - Hazelnut - 2.75...,5.2,CARD,5494174000000000.0
1,25/08/2021 09:02,Chesterfield,Scott Owens,"Large Flavoured iced latte - Caramel - 3.25, R...",17.3,CARD,6844802000000000.0
2,25/08/2021 09:04,Chesterfield,Francis Strayhorn,"Large Flat white - 2.45, Regular Latte - 2.15",4.6,CARD,9557104000000000.0
3,25/08/2021 09:06,Chesterfield,Rodney Drake,"Regular Flavoured latte - Hazelnut - 2.55, Lar...",5.0,CARD,2978328000000000.0
6,25/08/2021 09:12,Chesterfield,James Petrick,"Large Flavoured latte - Hazelnut - 2.85, Regul...",17.4,CARD,5309043000000000.0
7,25/08/2021 09:14,Chesterfield,Ronald Hagy,"Regular Flavoured iced latte - Vanilla - 2.75,...",13.55,CARD,1166596000000000.0
8,25/08/2021 09:17,Chesterfield,Lashawn Fant,"Regular Flavoured iced latte - Caramel - 2.75,...",12.55,CARD,8790558000000000.0
9,25/08/2021 09:19,Chesterfield,James Folsom,"Large Flat white - 2.45, Large Flavoured latte...",5.3,CARD,8722920000000000.0
11,25/08/2021 09:23,Chesterfield,Robyn Baker,Regular Latte - 2.15,2.15,CARD,4284401000000000.0
12,25/08/2021 09:25,Chesterfield,Leonard Saari,"Large Flat white - 2.45, Large Latte - 2.45, L...",10.3,CARD,4819037000000000.0


In [28]:
# Returns the count of the different values contained in a given column
def count_number_of_different_values(csv, column):
    """Count how often each distinct value occurs in ``column``.

    The CSV is read without a header row, using the notebook-level
    ``columns`` list as header names. On success the result is also published
    in the notebook-level ``count``.

    Args:
        csv: Path of the CSV file to read.
        column: Name of the column whose values are counted.

    Returns:
        A Series of counts indexed by value, sorted from least to most
        frequent, or ``None`` when the file does not exist (a message is
        printed and the global ``count`` is left untouched).
    """
    global count  # published so other cells can print the result

    try:
        frame = pd.read_csv(csv, header=None, names=columns)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
        # Never hand back whatever an earlier call left in ``count``.
        return None

    count = frame[column].value_counts(ascending=True)
    return count

In [29]:
# Exercise count_number_of_different_values and print the global result
count_number_of_different_values(csv, column)
print(count)

payment_type
CASH     85
CARD    183
Name: count, dtype: int64


In [30]:
# Counts the number of times a value is repeated on a given column
def count_number_of_times_a_value_is_repeated(csv, column, value):
    """Count how many rows have exactly ``value`` in ``column``.

    The CSV is read without a header row, using the notebook-level
    ``columns`` list as header names. On success the result is also published
    in the notebook-level ``count``.

    Args:
        csv: Path of the CSV file to read.
        column: Name of the column to inspect.
        value: Value whose occurrences are counted.

    Returns:
        The number of occurrences (``0`` when the value never appears), or
        ``None`` when the file does not exist (a message is printed and the
        global ``count`` is left untouched).
    """
    global count  # published so other cells can print the result

    try:
        frame = pd.read_csv(csv, header=None, names=columns)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
        # Never hand back whatever an earlier call left in ``count``.
        return None

    # .get(..., 0): an absent value is a count of zero, not a KeyError.
    count = frame[column].value_counts().get(value, 0)
    return count

In [31]:
# Exercise count_number_of_times_a_value_is_repeated and report the global result
count_number_of_times_a_value_is_repeated(csv, column, value)
print(f"the value: {value} was found {count} times.")

the value: CARD was found 183 times.


In [32]:
# Saves Data frame to file as CSV

def save_df_to_csv(path, newfile):
    """Write the notebook-level ``df`` to ``path``/``newfile`` as CSV.

    The target directory is created when it does not exist. Saving is
    best-effort: any failure is reported with a printed message instead of
    being raised.

    Args:
        path: Directory to write into, with or without a trailing separator.
            An empty string means the current working directory.
        newfile: Name of the CSV file to create.

    Returns:
        None.
    """
    try:
        if path:  # os.makedirs('') raises, and there is nothing to create
            os.makedirs(path, exist_ok=True)
        # os.path.join inserts the separator when ``path`` lacks a trailing
        # one, so the file always lands inside the directory created above.
        df.to_csv(os.path.join(path, newfile))
    except Exception as error:  # not a bare except: Ctrl-C must still work
        print(f'Saving operation could not be completed: {error}')


In [33]:
# Exercise save_df_to_csv with a relative output folder
path = "results/"  # extend to folder/subfolder/ if needed
newfile = "newfile.csv"

save_df_to_csv(path=path, newfile=newfile)

In [34]:
# Saves Data frame to file as JSON

def save_df_to_json(path, newfile):
    """Write the notebook-level ``df`` to ``path``/``newfile`` as JSON Lines.

    Each row becomes one JSON object on its own line
    (``orient='records', lines=True``). The target directory is created when
    it does not exist. Saving is best-effort: any failure is reported with a
    printed message instead of being raised.

    Args:
        path: Directory to write into, with or without a trailing separator.
            An empty string means the current working directory.
        newfile: Name of the JSON file to create.

    Returns:
        None.
    """
    try:
        # Serialise first so a failure here cannot leave a truncated file.
        payload = df.to_json(orient='records', lines=True)
        if path:  # os.makedirs('') raises, and there is nothing to create
            os.makedirs(path, exist_ok=True)
        with open(os.path.join(path, newfile), 'w') as f:
            f.write(payload)
    except Exception as error:  # not a bare except: Ctrl-C must still work
        print(f'Saving operation could not be completed: {error}')
    


In [35]:
# Exercise save_df_to_json with a relative output folder
path = "results/"  # extend to folder/subfolder/ if needed
newfile = "newfile.json"

save_df_to_json(path=path, newfile=newfile)