In [None]:
# Imports and Variables used for testing
import pandas as pd                                                                                 # Loads Pandas package
import json                                                                                         # Loads JSON package 
import os                                                                                           # For OS related paths 

In [None]:
# 1. ABSOLUTE PATH FOR RAW DATA   

# takes: raw_data_file, the file name of one of the csv files in the raw_data folder
# sets: the global variable csv to the absolute path of the raw_data folder plus raw_data_file


def absolute_path_for_raw_data(raw_data_file):
    """Build the absolute path of a file inside the ``../raw_data`` folder.

    The path is stored in the global variable ``csv`` (the cells below read
    it from there) and is also returned.

    Args:
        raw_data_file: File name of one of the raw CSV files.

    Returns:
        The absolute path as a string.
    """
    global csv
    # os.path.join uses the separator of the running platform; the previous
    # hard-coded "\\" only produced a usable path on Windows.
    csv = os.path.join(os.path.abspath("../raw_data"), raw_data_file)
    return csv


In [None]:
# 1. [TEST RUN] ABSOLUTE PATH FOR RAW DATA

# Current Files are Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

# The call stores the path in the global variable csv, which is printed below.
absolute_path_for_raw_data(raw_data_file)
print(csv) # Prints the newly formed absolute path with the raw_data_file name given.


In [None]:
# 2. EXTRACT SANITISE CSV (Remove given Columns) 

# takes: csv (absolute path to filename), and a list of columns to be dropped in a variable called sanitise_these_columns
# returns: the sanitised dataframe as df

def extract_sanitise_csv(csv, sanitise_these_columns, column_names=None):
    """Read a header-less CSV and drop the columns that must be sanitised.

    Args:
        csv: Path of the CSV file to read.
        sanitise_these_columns: Column names to remove from the frame.
        column_names: Names to assign to the CSV columns. When omitted, the
            notebook-level ``columns`` list is used, as before.

    Returns:
        The sanitised DataFrame, or None when the file does not exist. On
        success the frame is also stored in the global ``df``; on failure the
        global is left untouched.
    """
    global df  # later cells read the sanitised frame from this global
    names = columns if column_names is None else column_names
    try:
        raw = pd.read_csv(csv, header=None, names=names)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
        # Previously this fell through to `return df`, which handed back a
        # stale frame from an earlier call (or raised NameError on first use).
        return None

    # Bind the global only once the drop has succeeded, so it never holds a
    # frame that still contains the columns meant to be removed.
    df = raw.drop(columns=sanitise_these_columns)
    return df

In [None]:
# 2. [TEST RUN] EXTRACT SANITISE CSV 

# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

columns = ['date_time', 'Location', 'full_name', 'order', 'amount', 'payment_type', 'card_number']  # Headers for the orders csv files; read as a global by the functions that call pd.read_csv

absolute_path_for_raw_data(raw_data_file)  # sets the global csv variable to the absolute path
 
sanitise_these_columns = ['full_name', 'card_number']

# The call stores the sanitised frame in the global df, which is previewed below.
extract_sanitise_csv(csv,sanitise_these_columns)
df.head(5)

In [None]:
# 3. FILTER BY COLUMN VALUE 

# Takes: csv(absolute path to filename), column variable with the column header id, value to check
# Returns: the rows in which a column contains a given value AS contain_values

def filter_by_column_value(csv, column, value):
    """Return the rows of the global ``df`` whose ``column`` contains ``value``.

    Args:
        csv: Unused. Kept so existing calls keep working; the rows come from
            the global ``df``, not from this path.
        column: Header of the column to search.
        value: Text to look for. It is upper-cased before matching and handed
            to ``Series.str.contains``, which interprets it as a regular
            expression by default.

    Returns:
        A DataFrame with the matching rows. It is also stored in the global
        ``contain_values``.
    """
    global contain_values  # as Global to be able to print it outside the function
    # na=False counts missing cells as "no match". Without it the mask holds
    # NaN for those rows and the boolean indexing below raises.
    mask = df[column].str.contains(value.upper(), na=False)
    contain_values = df[mask]
    # The former `except FileNotFoundError` was removed: nothing here touches
    # the file system, so that handler could never run.
    return contain_values

In [None]:
# 3. [TEST RUN] FILTER BY COLUMN VALUE

# Current Files Chesterfield or Leeds (comment one out)
# NOTE(review): raw_data_file is assigned here but nothing in this cell uses it:
# the path in csv is not rebuilt and df is not reloaded, so the filter below runs
# on whatever df an earlier cell left in the kernel. Confirm whether this cell
# was meant to load the Chesterfield file first.
raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
# raw_data_file = "leeds_01-01-2020_09-00-00.csv"

column = "payment_type"  
value = "card"

# The call stores the matching rows in the global contain_values, previewed below.
filter_by_column_value(csv,column,value)
contain_values.head(10)

In [None]:
# 4. COUNT NUMBER OF DIFFERENT VALUES IN COLUMN 

# Takes: csv(absolute path to filename), column variable with the column header id,
# Returns: the count of the different values contained in a given column

def count_number_of_different_values(csv, column):
    """Count how often each distinct value occurs in ``column`` of the global ``df``.

    Args:
        csv: Unused. Kept so existing calls keep working; the data comes from
            the global ``df``, not from this path.
        column: Header of the column to summarise.

    Returns:
        A Series indexed by value, sorted from the least to the most frequent.
        It is also stored in the global ``count``.
    """
    global count  # as Global to be able to print it outside the function
    # No try/except: nothing here reads a file, so the former
    # `except FileNotFoundError` branch was unreachable.
    count = df[column].value_counts(ascending=True)
    return count

In [None]:
# 4. [TEST RUN] COUNT NUMBER OF DIFFERENT VALUES IN COLUMN 

# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file) # sets the global csv variable to the absolute path

column = "payment_type" 

# NOTE(review): count_number_of_different_values() reads the global df and does not
# open csv, so the counts describe the frame loaded by an earlier cell.
count_number_of_different_values(csv,column)
print(str(count))

In [None]:
# 5. COUNT NUMBER OF TIMES A VALUE IS REPEATED
# 
# Takes: csv(absolute path to filename), column variable with the column header id, value to check
# Returns: the number of times a single value is repeated in a given column AS count

def count_number_of_times_a_value_is_repeated(csv, column, value, column_names=None):
    """Count the rows of a header-less CSV whose ``column`` equals ``value``.

    Args:
        csv: Path of the CSV file to read.
        column: Header of the column to inspect.
        value: Exact value to count.
        column_names: Names to assign to the CSV columns. When omitted, the
            notebook-level ``columns`` list is used, as before.

    Returns:
        The number of occurrences as an int (0 when the value never appears),
        or None when the file does not exist. On success the number is also
        stored in the global ``count``; on failure the global is left
        untouched.
    """
    global count  # as Global to be able to print it outside the function
    names = columns if column_names is None else column_names
    try:
        frame = pd.read_csv(csv, header=None, names=names)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')
        # Previously this fell through to `return count`, i.e. a stale number
        # from an earlier call or a NameError on first use.
        return None

    # .get() turns "value not present" into 0; indexing with [value] raised
    # KeyError in that case.
    count = int(frame[column].value_counts().get(value, 0))
    return count

In [None]:
# 5. [TEST RUN] COUNT NUMBER OF TIMES A VALUE IS REPEATED

# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file) # sets the global csv variable to the absolute path

column = "payment_type"
value = "CARD" 

# The call stores the number of matches in the global count, reported below.
count_number_of_times_a_value_is_repeated(csv,column,value)
print("the value: " + value + " was found " + str(count) + " times.")

In [None]:
# 6. SAVE DATA FRAME TO FILE AS CSV

# Takes: path for the desired save path, newfile for the desired new file name (.csv extension added at runtime)
# Returns: nothing; writes a csv file containing the DF to the desired location

def save_df_to_csv(path, newfile):
    """Write the global ``df`` to ``<path>/<newfile>.csv`` without a header row.

    The target folder is created when it does not exist. A failure is
    reported on stdout, together with its cause, instead of being raised.

    Args:
        path: Folder that receives the file. A trailing separator is optional;
            an empty string means the current working directory.
        newfile: File name without the ``.csv`` extension.
    """
    # os.path.join inserts the separator only when needed, so "results" and
    # "results/" now both end up at results/<newfile>.csv.
    target = os.path.join(path, newfile + ".csv")
    try:
        if path:
            # os.makedirs("") raises, so skip it when saving to the current folder.
            os.makedirs(path, exist_ok=True)
        df.to_csv(target, header=False)
    except Exception as err:
        # Still best-effort, but say why the save failed instead of hiding it.
        print(f'Saving operation could not be completed: {err}')


In [None]:
# 6. [TEST RUN] SAVE DATA FRAME TO FILE AS CSV
# save_df_to_csv() writes the global df, so a cell that loads df must have run first.
path = "results/" # to add folder/subfolder/ if needed
newfile = "newfile"  # file name without extension; ".csv" is appended by the function

save_df_to_csv(path,newfile)

In [None]:
# 7. SAVE DATAFRAME TO FILE AS JSON

# Takes: path for the desired save path, newfile for the desired new file name (.json extension added at runtime)
# Returns: nothing; writes a JSON file (one record per line) containing the DF to the desired location

def save_df_to_json(path, newfile):
    """Write the global ``df`` to ``<path>/<newfile>.json``, one JSON record per line.

    The target folder is created when it does not exist, matching
    ``save_df_to_csv``. A failure is reported on stdout, together with its
    cause, instead of being raised.

    Args:
        path: Folder that receives the file. A trailing separator is optional;
            an empty string means the current working directory.
        newfile: File name without the ``.json`` extension.
    """
    target = os.path.join(path, newfile + ".json")
    try:
        if path:
            # Previously the folder was never created here, so the save only
            # worked when another step had already made it.
            os.makedirs(path, exist_ok=True)
        with open(target, 'w') as f:
            f.write(df.to_json(orient='records', lines=True))
    except Exception as err:
        # Still best-effort, but say why the save failed instead of hiding it.
        print(f'Saving operation could not be completed: {err}')
    


In [None]:
# 7. [TEST RUN] SAVE DATAFRAME TO FILE AS JSON
# save_df_to_json() writes the global df, so a cell that loads df must have run first.

path = "results/" # to add folder/subfolder/ if needed
newfile = "newfile"  # file name without extension; ".json" is appended by the function

save_df_to_json(path,newfile)

In [None]:

# GET UNIQUE PRODUCTS IN ORDERS
# Returns a Series with every distinct product entry found in the 'order' column


def get_unique_products_in_orders():
    """Collect the distinct comma-separated entries of the global ``df['order']``.

    ``df`` must already be loaded by an earlier cell. Each order cell is split
    on commas, the pieces are stripped of surrounding whitespace, and every
    distinct piece is kept once, in order of first appearance.

    Returns:
        A Series of unique entries (each still in the form the CSV stores it,
        e.g. "Large Latte - 2.45"). It is also stored in the global ``result``.
    """
    global result
    # One row per product across ALL positions of the order. The previous loop
    # overwrote `result` on every pass, so only the last split column survived.
    entries = df['order'].str.split(',').explode().str.strip()
    # Drop missing cells and empty pieces (e.g. from a trailing comma).
    entries = entries[entries.notna() & (entries != "")]
    result = entries.drop_duplicates().reset_index(drop=True)
    return result




In [None]:
# [TEST RUN] GET UNIQUE PRODUCTS IN ORDERS


# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

# NOTE(review): get_unique_products_in_orders() reads the global df and does not
# open csv, so the path built here is not used by the call below.
absolute_path_for_raw_data(raw_data_file) # sets the global csv variable to the absolute path
result = ""  # pre-set so the name exists; the function below assigns the global result


get_unique_products_in_orders()
result

In [None]:
# SEPARATE PRODUCTS IN ORDERS (string)


# The order_products string has the same format as one row in df['order']
# order_products = "Regular Flavoured iced latte - Hazelnut - 2.75, Large Latte - 2.45"
order_products = "Large Hot Chocolate - 1.70, Regular Hot Chocolate - 1.40, Large Chai latte - 2.60, Regular Chai latte - 2.30, Regular Speciality Tea - English breakfast - 1.30"

# LOGIC...

# A segment (named chunk in the program) is a part of the string delimited by commas
# (one product of the order together with its price).
# The product name may itself contain dashes, so the price is whatever follows the
# LAST dash and everything before that dash is the product name.


def split_product_and_price(segment):
    """Split one "<product name> - <price>" segment into (name, price).

    The split happens at the last dash, so names with any number of dashes
    are handled. Both parts are returned without surrounding whitespace. A
    segment without a dash yields the whole text as the name and an empty
    price.
    """
    name, dash, price = segment.rpartition('-')
    if not dash:
        # No dash at all: rpartition puts the whole text in the last slot.
        return segment.strip(), ""
    return name.strip(), price.strip()


chunks = order_products.split(',')

for chunk in chunks:
    if not chunk.strip():
        continue  # ignore empty pieces, e.g. from a trailing comma
    product, price = split_product_and_price(chunk)
    print(product)
    print("Price: " + price)


In [None]:
# ROW COUNT OF THE DATA FRAME
# The expression below and df.shape[0] both give the number of rows.
# df[df.columns[0]].count() is a related option, but it counts only the
# non-missing cells of the first column.

# df.shape[0]
# df[df.columns[0]].count()
len(df.index)