In [None]:
# Imports and Variables used for testing
import pandas as pd                                                                                 # Loads Pandas package 
import os                                                                                           # For OS related paths 

# Variable List (with their default value [Used in Testing])

# Csv file on the raw_data subfolder
csv = os.path.abspath("../raw_data/chesterfield_25-08-2021_09-00-00.csv")

# Placeholders for the globals that the functions below overwrite
df = ""     # Global for the DataFrame
count = ""  # Global for count functions

# Default arguments for the functions
column = "payment_type"
value = "CARD"

# Headers for the Chesterfield csv file
columns = [
    'date_time',
    'Location',
    'full_name',
    'order',
    'amount',
    'payment_type',
    'card_number',
]

# Columns list to be sanitised
sanitise_these_columns = ['full_name', 'card_number']

# Output location; extend `path` with folder/subfolder/ if needed
path = "results/"
newfile = "newfile.csv"  # File to be created

In [None]:
# Remove given Columns
def extract_sanitise_csv(csv, sanitise_these_columns, column_names=None):
    """Load a header-less CSV file and drop the columns that must be sanitised.

    Args:
        csv: Path of the CSV file to read. It is read with ``header=None``,
            so its first line is treated as data, not as a header.
        sanitise_these_columns: Column labels to remove from the loaded data.
        column_names: Labels to assign to the CSV columns. When omitted, the
            notebook-level ``columns`` list is used.

    Returns:
        The DataFrame without the sanitised columns, or ``None`` when the
        file does not exist (a message is printed in that case).

    Raises:
        KeyError: Propagated from ``DataFrame.drop`` when a label in
            ``sanitise_these_columns`` is not one of the assigned names.

    Side effects:
        On success the result is also stored in the global ``df`` so it can
        be inspected from other cells. On failure ``df`` is left untouched.
    """
    global df  # as Global to be able to print it outside the function
    names = columns if column_names is None else column_names

    try:
        raw = pd.read_csv(csv, header=None, names=names)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
        # Do not hand back whatever an earlier call left in the global
        return None

    # Publish to the global only after the drop succeeded, so the global
    # never holds the frame that still contains the sensitive columns.
    df = raw.drop(columns=sanitise_these_columns)
    return df

In [None]:
# Smoke test for extract_sanitise_csv: sanitise one raw file and preview it

# Choose the input by leaving exactly one assignment active (Chesterfield or Leeds)
# csv = os.path.abspath("../raw_data/chesterfield_25-08-2021_09-00-00.csv")
csv = os.path.abspath("../raw_data/leeds_01-01-2020_09-00-00.csv")

# Columns removed before the data is shown
sanitise_these_columns = ['full_name', 'card_number']

# Preview straight from the return value (the call also updates the global df)
extract_sanitise_csv(csv, sanitise_these_columns).head(5)

In [46]:
# Returns the rows in which a column contains a given value
def filter_by_column_value(csv, column, value, column_names=None):
    """Return the rows of a header-less CSV whose ``column`` contains ``value``.

    Args:
        csv: Path of the CSV file to read (read with ``header=None``).
        column: Label of the text column to search.
        value: Pattern passed to ``Series.str.contains`` with its default
            options, i.e. it is interpreted as a regular expression.
        column_names: Labels to assign to the CSV columns. When omitted, the
            notebook-level ``columns`` list is used.

    Returns:
        A DataFrame with the matching rows, or ``None`` when the file does
        not exist (a message is printed in that case).

    Side effects:
        On success the result is also stored in the global ``contain_values``.
        On failure that global is left untouched.
    """
    global contain_values  # as Global to be able to print it outside the function
    names = columns if column_names is None else column_names

    try:
        frame = pd.read_csv(csv, header=None, names=names)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
        # contain_values may not exist yet, so never fall through to it
        return None

    # na=False: rows with a missing value in `column` count as "no match"
    # instead of producing a NaN mask that cannot be used for indexing.
    matches = frame[column].str.contains(value, na=False)
    contain_values = frame[matches]
    return contain_values

In [47]:
# Smoke test for filter_by_column_value: show the first matching rows

# Choose the input by leaving exactly one assignment active (Chesterfield or Leeds)
# csv = os.path.abspath("../raw_data/chesterfield_25-08-2021_09-00-00.csv")
csv = os.path.abspath("../raw_data/leeds_01-01-2020_09-00-00.csv")

# Column labels applied to the file and the text to look for
columns = [
    'date_time',
    'Location',
    'full_name',
    'order',
    'amount',
    'payment_type',
    'card_number',
]
value = "CARD"

# Preview straight from the return value (the call also updates contain_values)
filter_by_column_value(csv, column, value).head(10)

Unnamed: 0,date_time,Location,full_name,order,amount,payment_type,card_number
1,01/01/2020 09:01,Leeds,Matthew Palmer,"Large Chai latte - 2.60, Regular Filter coffee...",4.1,CARD,5933193000000000.0
2,01/01/2020 09:03,Leeds,Mack Cendejas,Large Speciality Tea - English breakfast - 1.6...,2.9,CARD,2728143000000000.0
3,01/01/2020 09:04,Leeds,Thomas Williams,"Large Chai latte - 2.60, Large Iced americano ...",7.6,CARD,2992561000000000.0
4,01/01/2020 09:06,Leeds,Diane Ferree,Regular Hot Chocolate - 1.40,1.4,CARD,8040887000000000.0
5,01/01/2020 09:08,Leeds,Bruce Watts,"Regular Chai latte - 2.30, Regular Filter coff...",3.8,CARD,2242609000000000.0
6,01/01/2020 09:09,Leeds,Sergio Vanmeter,"Regular Iced americano - 2.15, Regular Hot Cho...",10.75,CARD,1548663000000000.0
7,01/01/2020 09:11,Leeds,Rosemary Wertman,"Regular Filter coffee - 1.50, Regular Hot Choc...",8.4,CARD,6706720000000000.0
8,01/01/2020 09:12,Leeds,Marcia Mason,"Large Iced americano - 2.50, Regular Chai latt...",4.8,CARD,2590703000000000.0
12,01/01/2020 09:19,Leeds,Richard Redding,"Regular Hot Chocolate - 1.40, Large Filter cof...",6.85,CARD,4480989000000000.0
13,01/01/2020 09:21,Leeds,Jean Wontor,Large Speciality Tea - English breakfast - 1.6...,4.1,CARD,3200300000000000.0


In [None]:
# Returns the count of the different values contained in a given column
def count_number_of_different_values(csv, column, column_names=None):
    """Count how often each distinct value occurs in one column of a CSV file.

    Args:
        csv: Path of the CSV file to read (read with ``header=None``).
        column: Label of the column whose values are counted.
        column_names: Labels to assign to the CSV columns. When omitted, the
            notebook-level ``columns`` list is used.

    Returns:
        A Series indexed by the distinct values, holding their counts sorted
        in ascending order, or ``None`` when the file does not exist (a
        message is printed in that case).

    Side effects:
        On success the result is also stored in the global ``count``.
        On failure that global is left untouched.
    """
    global count  # as Global to be able to print it outside the function
    names = columns if column_names is None else column_names

    try:
        frame = pd.read_csv(csv, header=None, names=names)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
        # Do not hand back whatever an earlier call left in the global
        return None

    count = frame[column].value_counts(ascending=True)
    return count

In [None]:
# Smoke test for count_number_of_different_values: print the per-value counts

# Choose the input by leaving exactly one assignment active (Chesterfield or Leeds)
# csv = os.path.abspath("../raw_data/chesterfield_25-08-2021_09-00-00.csv")
csv = os.path.abspath("../raw_data/leeds_01-01-2020_09-00-00.csv")

# Column whose distinct values are counted
column = "payment_type"

# Print the return value directly (the call also updates the global count)
print(count_number_of_different_values(csv, column))

In [None]:
# Counts the number of times a value is repeated on a given column
def count_number_of_times_a_value_is_repeated(csv, column, value, column_names=None):
    """Count the rows of a CSV file whose ``column`` equals ``value``.

    Args:
        csv: Path of the CSV file to read (read with ``header=None``).
        column: Label of the column to inspect.
        value: Value to look for; compared for equality with each cell.
        column_names: Labels to assign to the CSV columns. When omitted, the
            notebook-level ``columns`` list is used.

    Returns:
        The number of matching rows as an ``int`` (``0`` when the value never
        occurs), or ``None`` when the file does not exist (a message is
        printed in that case).

    Side effects:
        On success the result is also stored in the global ``count``.
        On failure that global is left untouched.
    """
    global count  # as Global to be able to print it outside the function
    names = columns if column_names is None else column_names

    try:
        frame = pd.read_csv(csv, header=None, names=names)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
        # Do not hand back whatever an earlier call left in the global
        return None

    # An equality mask yields 0 for an absent value, whereas indexing
    # value_counts() by a missing label raises KeyError.
    count = int((frame[column] == value).sum())
    return count

In [None]:
# Smoke test for count_number_of_times_a_value_is_repeated: report one value's count

# Choose the input by leaving exactly one assignment active (Chesterfield or Leeds)
# csv = os.path.abspath("../raw_data/chesterfield_25-08-2021_09-00-00.csv")
csv = os.path.abspath("../raw_data/leeds_01-01-2020_09-00-00.csv")

# Column to inspect and the value to count in it
column, value = "payment_type", "CARD"

# Build the message from the return value (the call also updates the global count)
print(
    "the value: " + value + " was found "
    + str(count_number_of_times_a_value_is_repeated(csv, column, value))
    + " times."
)

In [44]:
# Saves Data frame to file as CSV

def save_df_to_csv(path, newfile, frame=None):
    """Write a DataFrame to ``newfile`` inside the directory ``path`` as CSV.

    Args:
        path: Target directory; created (with parents) when it does not
            exist. An empty string means the current working directory.
        newfile: Name of the CSV file to create inside ``path``.
        frame: DataFrame to save. When omitted, the global ``df`` is used.

    Returns:
        None. Any failure is reported with a printed message instead of
        being raised.
    """
    try:
        data = df if frame is None else frame
        if path:
            # os.makedirs rejects an empty path, so only call it when needed
            os.makedirs(path, exist_ok=True)
        # join() inserts the separator when `path` has no trailing slash
        data.to_csv(os.path.join(path, newfile))
    except Exception as exc:
        # Keep the best-effort behaviour, but say why the save failed
        print(f'Saving operation could not be completed: {exc}')


In [45]:
# Smoke test for save_df_to_csv
# `path` can be extended to folder/subfolder/ if needed
path, newfile = "results/", "newfile.csv"

save_df_to_csv(path, newfile)

In [None]:
# Saves Data frame to file as JSON

def save_df_to_json(path, newfile, frame=None):
    """Write a DataFrame to ``newfile`` inside ``path`` as line-delimited JSON.

    Each row becomes one JSON object on its own line
    (``orient='records', lines=True``).

    Args:
        path: Target directory; created (with parents) when it does not
            exist. An empty string means the current working directory.
        newfile: Name of the JSON file to create inside ``path``.
        frame: DataFrame to save. When omitted, the global ``df`` is used.

    Returns:
        None. Any failure is reported with a printed message instead of
        being raised.
    """
    try:
        data = df if frame is None else frame
        # Serialise before opening the file, so a failure here does not
        # leave an empty or truncated file behind.
        payload = data.to_json(orient='records', lines=True)
        if path:
            # os.makedirs rejects an empty path, so only call it when needed
            os.makedirs(path, exist_ok=True)
        # join() inserts the separator when `path` has no trailing slash
        with open(os.path.join(path, newfile), 'w') as f:
            f.write(payload)
    except Exception as exc:
        # Keep the best-effort behaviour, but say why the save failed
        print(f'Saving operation could not be completed: {exc}')
    


In [None]:
# Smoke test for save_df_to_json
# `path` can be extended to folder/subfolder/ if needed
path, newfile = "results/", "newfile.json"

save_df_to_json(path, newfile)