In [None]:
# Imports and Variables used for testing
import pandas as pd                                                                                 # Loads Pandas package 
import os                                                                                           # For OS related paths 

# Variable list with the default values used while testing the functions below.
# Several of these names are module-level globals that the functions overwrite.

# Name of the csv file inside the raw_data subfolder.
raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
# File path handed to the reading functions; overwritten by absolute_path_for_raw_data.
# NOTE: csv could also be a URL if needed.
csv = ""
# Global holding the working DataFrame; overwritten by extract_sanitise_csv.
df = ""
# Global holding the result of the count functions.
count = ""
# Default column argument for the filter/count functions.
column = "payment_type"
# Default value argument for the filter/count functions.
value = "CARD"
# Headers applied to the orders csv files when they are read.
columns = ['date_time', 'Location', 'full_name', 'order', 'amount', 'payment_type', 'card_number']
# Columns to be removed when sanitising.
sanitise_these_columns = ['full_name', 'card_number']
# Output folder for the save functions (add folder/subfolder/ if needed).
path = "results/"
# File name created by the save-to-CSV/JSON functions.
newfile = "newfile.csv"

In [None]:
#FUNCTIONS INDEX
#
# 1. absolute_path_for_raw_data
#               Takes: raw_data_file (the name of the csv file in the raw_data folder)
#               Returns: csv variable containing the absolute path to the raw_data subfolder + filename
#  
# 2. extract_sanitise_csv
#               Takes: csv (absolute path variable), sanitise_these_columns (list of columns to be sanitised); the csv headers come from the global columns list
#               Returns: the sanitised DataFrame, also stored in the global df
#
# 3. filter_by_column_value
#               Takes: csv (absolute path variable) column (the column to check) value (the value to check in the column)
#               Returns: contain_values variable which is a new DF which includes the rows in which a column contains the given value
#
# 4. count_number_of_different_values
#               Takes: csv (absolute path variable) column (the column to check)
#               Returns: A count of the different values contained in the given column
#
# 5. count_number_of_times_a_value_is_repeated
#               Takes: csv (absolute path variable) column (the column to check) value (the value to check in the column)
#               Returns: A Count of the number of times the given value is repeated on the given column
#
# 6. save_df_to_csv
#               Takes: path (name of folder), newfile (name and extension of the file to be created)
#               Returns: nothing; saves the contents of the global df as a CSV file under the given path folder
#
# 7. save_df_to_json
#               Takes: path (name of folder), newfile (name and extension of the file to be created)
#               Returns: nothing; saves the contents of the global df as a JSON file under the given path folder

In [None]:
# 1. - Absolute path for csv files in the raw_data folder
def absolute_path_for_raw_data(raw_data_file):
    """Return the absolute path of a file inside the ``../raw_data`` folder.

    The folder is resolved relative to the current working directory, so the
    result depends on where the notebook is run from.

    Args:
        raw_data_file: File name (with extension) inside the raw_data folder.

    Returns:
        The absolute path as a string. The same value is also stored in the
        module-level ``csv`` global so later cells can use it directly.
    """
    global csv
    raw_data_dir = os.path.abspath("../raw_data")
    # os.path.join uses the separator of the running OS; a hard-coded "\\"
    # only produces a valid path on Windows.
    csv = os.path.join(raw_data_dir, raw_data_file)
    return csv

In [None]:
# 2. Read a csv into a DataFrame and remove the given columns
def extract_sanitise_csv(csv,sanitise_these_columns):
    """Load a headerless csv and drop the columns that must not be kept.

    The file is read with ``header=None`` and the module-level ``columns``
    list as column names.

    Args:
        csv: Path (or anything ``pd.read_csv`` accepts) of the file to read.
        sanitise_these_columns: List of column names to remove.

    Returns:
        The sanitised DataFrame, which is also stored in the global ``df``.
        If the file is not found a message is printed and the current value
        of the global ``df`` is returned unchanged.

    Raises:
        KeyError: If a column to sanitise does not exist (raised by
            ``DataFrame.drop``).
    """
    global df # as Global to be able to print it outside the function
    try:
        raw_frame = pd.read_csv(csv, header=None, names=columns)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
    else:
        # Publish to the global only after the drop has succeeded, so a failed
        # drop can never leave the unsanitised data exposed in ``df``.
        df = raw_frame.drop(columns=sanitise_these_columns)

    return df

In [None]:
# Test cell for the extract_sanitise_csv function

# Input file: Chesterfield or Leeds (leave exactly one uncommented)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

# Stores the absolute path of the chosen file in the global csv variable
absolute_path_for_raw_data(raw_data_file)

# Columns to remove from the loaded data
sanitise_these_columns = ['full_name', 'card_number']

# Loads the file into the global df without the columns above, then shows the first 5 rows
extract_sanitise_csv(csv,sanitise_these_columns)
df.head(5)

In [None]:
# 3. Returns the rows in which a column contains a given value
def filter_by_column_value(csv,column,value):
    """Select the rows of the global ``df`` whose column contains a value.

    The match is done with ``Series.str.contains``, so ``value`` is treated
    as a regular expression pattern (the pandas default) and matches
    anywhere inside the cell text.

    Args:
        csv: Not used unless the commented-out re-read below is enabled.
        column: Name of the (text) column to search.
        value: Text or pattern to look for.

    Returns:
        A DataFrame with the matching rows, also stored in the global
        ``contain_values``.
    """
    global contain_values # as Global to be able to print it outside the function
    try:
        # df = pd.read_csv(csv, header=None, names=columns) #Uncomment if you want to use the Unsanatised version of the DF
        # na=False treats missing cells as "no match"; without it they become
        # NaN in the mask and the boolean indexing raises a ValueError.
        contain_values = df[df[column].str.contains(value, na=False)]
    except FileNotFoundError as fnfe:
        # Only reachable when the re-read above is enabled.
        print(f'File not found: {fnfe}')

    return contain_values

In [None]:
# Test cell for the filter_by_column_value function
# The function reads the global df, so the extract_sanitise_csv cell must have been run first.

# Input file: Chesterfield or Leeds (leave exactly one uncommented)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

# Headers of the orders csv files (only needed if the function re-reads the file)
columns = ['date_time', 'Location', 'full_name', 'order', 'amount', 'payment_type', 'card_number']
# Value to look for; the column argument keeps whatever was assigned to it earlier
value = "CARD"

# Stores the matching rows in the global contain_values, then shows the first 10
filter_by_column_value(csv,column,value)
contain_values.head(10)

In [None]:
# 4. Returns the count of the different values contained in a given column
def count_number_of_different_values(csv,column):
    """Count how often each distinct value appears in a column of the global ``df``.

    Args:
        csv: Not used unless the commented-out re-read below is enabled.
        column: Name of the column to summarise.

    Returns:
        The result of ``value_counts(ascending=True)`` for that column, which
        is also stored in the global ``count``.
    """
    global count  # as Global to be able to print it outside the function
    try:
        # df = pd.read_csv(csv, header=None, names=columns) #Uncomment if you want to use the Unsanatised version of the DF
        selected_column = df[column]
        count = selected_column.value_counts(ascending=True)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
    return count

In [None]:
# Test cell for the count_number_of_different_values function
# The function reads the global df, so the extract_sanitise_csv cell must have been run first.

# Input file: Chesterfield or Leeds (leave exactly one uncommented)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

# Stores the absolute path of the chosen file in the global csv variable
absolute_path_for_raw_data(raw_data_file)

# Column whose values are counted
column = "payment_type"

# Stores the per-value counts in the global count, then prints them
count_number_of_different_values(csv,column)
print(str(count))

In [None]:
# 5. Counts the number of times a value is repeated on a given column
def count_number_of_times_a_value_is_repeated(csv,column,value):
    """Count the rows of a csv file whose column equals the given value.

    The file is read fresh (unsanitised) with ``header=None`` and the
    module-level ``columns`` list as column names; the global ``df`` is not
    touched.

    Args:
        csv: Path (or anything ``pd.read_csv`` accepts) of the file to read.
        column: Name of the column to inspect.
        value: Exact value to count.

    Returns:
        The number of occurrences, or 0 when the value never appears. The
        result is also stored in the global ``count``. If the file is not
        found a message is printed and the current value of the global
        ``count`` is returned unchanged.
    """
    global count # as Global to be able to print it outside the function
    try:
        frame = pd.read_csv(csv, header=None, names=columns)
        # .get(..., 0) instead of [value]: a value that never occurs should
        # count as zero rather than raise a KeyError.
        count = frame[column].value_counts().get(value, 0)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
    return count

In [None]:
# Test cell for the count_number_of_times_a_value_is_repeated function

# Input file: Chesterfield or Leeds (leave exactly one uncommented)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

# Stores the absolute path of the chosen file in the global csv variable
absolute_path_for_raw_data(raw_data_file)

# Column to inspect and the value to count in it
column = "payment_type"
value = "CARD"

# Stores the number of occurrences in the global count, then reports it
count_number_of_times_a_value_is_repeated(csv,column,value)
print("the value: " + value + " was found " + str(count) + " times.")

In [None]:
# 6. Saves Data frame to file as CSV

def save_df_to_csv(path,newfile):
    """Write the global ``df`` to a CSV file inside the given folder.

    The folder is created if it does not exist. The file is written without
    a header row (``header=False``).

    Args:
        path: Folder to write into, with or without a trailing separator.
            An empty string writes into the current working directory.
        newfile: Name (with extension) of the file to create.

    Returns:
        None. On failure a message is printed instead of raising.
    """
    try:
        if path:
            os.makedirs(path, exist_ok=True)
        # os.path.join keeps the file inside the folder even when ``path``
        # has no trailing separator (plain concatenation would not).
        df.to_csv(os.path.join(path, newfile), header=False)

    except Exception:
        # Best-effort save, but a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        print('Saving operation could not be completed')


In [None]:
# Test cell for the save_df_to_csv function
# The function writes the global df, so the extract_sanitise_csv cell must have been run first.

# Output folder (add folder/subfolder/ if needed)
path = "results/"
# Name of the csv file to create
newfile = "newfile.csv"

save_df_to_csv(path,newfile)

In [None]:
# 7. Saves Data frame to file as JSON

def save_df_to_json(path,newfile):
    """Write the global ``df`` to a JSON file inside the given folder.

    The folder is created if it does not exist, matching save_df_to_csv.
    The output has one JSON record per line (``orient='records'``,
    ``lines=True``).

    Args:
        path: Folder to write into, with or without a trailing separator.
            An empty string writes into the current working directory.
        newfile: Name (with extension) of the file to create.

    Returns:
        None. On failure a message is printed instead of raising.
    """
    try:
        # Serialise first so a failure here cannot truncate an existing file.
        json_lines = df.to_json(orient='records', lines=True)
        if path:
            os.makedirs(path, exist_ok=True)
        # os.path.join keeps the file inside the folder even when ``path``
        # has no trailing separator (plain concatenation would not).
        with open(os.path.join(path, newfile), 'w') as f:
            f.write(json_lines)

    except Exception:
        # Best-effort save, but a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        print('Saving operation could not be completed')
    


In [None]:
# Test cell for the save_df_to_json function
# The function writes the global df, so the extract_sanitise_csv cell must have been run first.

# Output folder (add folder/subfolder/ if needed)
path = "results/"
# Name of the json file to create
newfile = "newfile.json"

save_df_to_json(path,newfile)