In [25]:
# Imports and Variables used for testing
import pandas as pd                                                                                 # Loads Pandas package 
import os                                                                                           # For OS related paths 

# Variable List (with their default value [Used in Testing])
raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"                                              # Csv filename on the raw_data subfolder
csv = ""                                                                                            # filepath to be read by functions, used after joint with raw_data_file NOTE: csv variable could be an url if needed
df = ""                                                                                             # Global for the DataFrame
count = ""                                                                                          # Global for count functions
column = "payment_type"                                                                             # Argument for functions
value = "CARD"                                                                                      # Argument for functions
columns = ['date_time', 'Location', 'full_name', 'order', 'transaction_total', 'payment_type', 'card_number']  # Headers for the orders csv files
sanitise_these_columns = ['full_name', 'card_number']                                               # Columns list to be sanitised
path = "results/"                                                                                   # to add folder/subfolder/ if needed
newfile = "newfile.csv"                                                                             # Filename to be created by the Create CSV/JSON function

In [26]:
# 1. ABSOLUTE PATH FOR RAW DATA   

# takes: raw_data_name that should be the name of one of the csv's files 
# returns: absolute path to the raw_data subfolder folder plus raw_data_name AS a variable named csv 


def absolute_path_for_raw_data(raw_data_file):
    abspath = os.path.abspath("../raw_data")
    global csv
    csv = abspath + "\\" + raw_data_file


In [110]:
# 1. [TEST RUN] ABSOLUTE PATH FOR RAW DATA

# Current Files are Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file)
print(csv) # Prints the newly formed absolute path with the raw_data_file name given.


c:\Users\jzzz\Python\final-project\brewed-awakening-final-project\raw_data\leeds_01-01-2020_09-00-00.csv


In [27]:
# 2. EXTRACT SANITISE CSV (Remove given Columns) 

# takes: csv(absolute path to filename), and a list of columns to be dropped on a variable called sanatise_these_columns
# returns: the sanitised dataframe as df

def extract_sanitise_csv(csv,sanitise_these_columns):
    try:
        global df # as Global to be able to print it outside the function
        df = pd.read_csv(csv, header=None, names=columns)
        df = df.drop(columns=sanitise_these_columns)
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')

    return df

In [None]:
# 2. [TEST RUN] EXTRACT SANITISE CSV 

# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file)  # returns csv variable with the absolute path
 
sanitise_these_columns = ['full_name', 'card_number']

extract_sanitise_csv(csv,sanitise_these_columns)
df.head(5)

In [117]:
# 3. FILTER BY COLUMN VALUE 

# Takes: csv(absolute path to filename), column variable with the column header id, value to check
# Returns: the rows in which a column contains a given value AS contain_values

def filter_by_column_value(csv,column,value):
    try:
        global contain_values # as Global to be able to print it outside the function
        contain_values = df[df[column].str.contains(value.upper())]
    except FileNotFoundError as fnfe:
        print(f'File not found: {fnfe}')

    return contain_values

In [118]:
# 3. [TEST RUN] FILTER BY COLUMN VALUE

# Current Files Chesterfield or Leeds (comment one out)
raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
# raw_data_file = "leeds_01-01-2020_09-00-00.csv"

column = "payment_type"  
value = "card"

filter_by_column_value(csv,column,value)
contain_values.head(10)

Unnamed: 0,date_time,Location,order,transaction_total,payment_type
1,01/01/2020 09:01,Leeds,"Large Chai latte - 2.60, Regular Filter coffee...",4.1,CARD
2,01/01/2020 09:03,Leeds,Large Speciality Tea - English breakfast - 1.6...,2.9,CARD
3,01/01/2020 09:04,Leeds,"Large Chai latte - 2.60, Large Iced americano ...",7.6,CARD
4,01/01/2020 09:06,Leeds,Regular Hot Chocolate - 1.40,1.4,CARD
5,01/01/2020 09:08,Leeds,"Regular Chai latte - 2.30, Regular Filter coff...",3.8,CARD
6,01/01/2020 09:09,Leeds,"Regular Iced americano - 2.15, Regular Hot Cho...",10.75,CARD
7,01/01/2020 09:11,Leeds,"Regular Filter coffee - 1.50, Regular Hot Choc...",8.4,CARD
8,01/01/2020 09:12,Leeds,"Large Iced americano - 2.50, Regular Chai latt...",4.8,CARD
12,01/01/2020 09:19,Leeds,"Regular Hot Chocolate - 1.40, Large Filter cof...",6.85,CARD
13,01/01/2020 09:21,Leeds,Large Speciality Tea - English breakfast - 1.6...,4.1,CARD


In [119]:
# 4. COUNT NUMBER OF DIFFERENT VALUES IN COLUMN 

# Takes: csv(absolute path to filename), column variable with the column header id,
# Returns: the count of the different values contained in a given column

def count_number_of_different_values(csv,column):
    try:
        global count  # as Global to be able to print it outside the function
        count = df[column].value_counts(ascending=True)
    except FileNotFoundError as fnfe:
         print(f'File not found: {fnfe}')
    return count

In [120]:
# 4. [TEST RUN] COUNT NUMBER OF DIFFERENT VALUES IN COLUMN 

# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file) # returns csv variable with the absolut path

column = "payment_type" 

count_number_of_different_values(csv,column)
print(str(count))

payment_type
CASH    107
CARD    275
Name: count, dtype: int64


In [121]:
# 5. COUNT NUMBER OF TIMES A VALUE IS REPEATED
# 
# Takes: csv(absolute path to filename), column variable with the column header id, value to check
# Retuns: the count of the number of times a single value is repeated on a given column AS count

def count_number_of_times_a_value_is_repeated(csv,column,value):
    try:
        df = pd.read_csv(csv, header=None, names=columns)
        global count # as Global to be able to print it outside the function
        count = df[column].value_counts()[value]
    except FileNotFoundError as fnfe:
         print(f'File not found: {fnfe}')
    return count

In [122]:
# 5. [TEST RUN] COUNT NUMBER OF TIMES A VALUE IS REPEATED

# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file) # returns csv variable with the absolut path

column = "payment_type"
value = "CARD" 

count_number_of_times_a_value_is_repeated(csv,column,value)
print("the value: " + value + " was found " + str(count) + " times.")

the value: CARD was found 275 times.


In [123]:
# 6. SAVE DATA FRAME TO FILE AS CSV

# Takes: path for the desired save path, newfile for the desired new file name (.csv extension added at runtime)
# Retuns: a csv file saved to the desired location that contains the DF

def save_df_to_csv(path,newfile):
    try:
        os.makedirs(path, exist_ok=True)  
        df.to_csv(path + newfile +".csv", header=False)

    except:
        print(f'Saving operation could not be completed')


In [124]:
# 6. [TEST RUN] SAVE DATA FRAME TO FILE AS CSV
path = "results/" # to add folder/subfolder/ if needed
newfile = "newfile"

save_df_to_csv(path,newfile)

In [125]:
# 7. SAVE DATAFRAME TO FILE AS JSON

# Takes: path for the desired save path, newfile for the desired new file name (.json extension added at runtime)
# Retuns: a csv file saved to the desired location that contains the DF

def save_df_to_json(path,newfile):
    try:
        with open(path + newfile +".json", 'w') as f:
            f.write(df.to_json(orient='records', lines=True))  # add compression='gzip' to get a zip file (and change newfile extension)

    except:
        print(f'Saving operation could not be completed')
    


In [126]:
# 7. [TEST RUN] SAVE DATAFRAME TO FILE AS JSON

path = "results/" # to add folder/subfolder/ if needed
newfile = "newfile"

save_df_to_json(path,newfile)

In [106]:

# GET UNIQUE PRODUCTS IN ORDERS [WORK IN PROGRESS]


def get_unique_products_in_orders():
    df = pd.read_csv(csv, header=None, names=columns)
    orders = df['order'].str.split(',', expand = True)
        
    for item in orders:
        global result
        result = orders[item].drop_duplicates()

    return result

# TODO: we need to separate prices from the values
# but we can't just use a split('-') because some products like the [Large Flavoured latte - Hazelnut - 2.85] will break the name - price norm. 




In [107]:
# [TEST RUN] GET UNIQUE PRODUCTS IN ORDERS [WORK IN PROGRESS]

# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file) # returns csv variable with the absolut path
result = ""


get_unique_products_in_orders()
result

0                                                   None
50                         Regular Iced americano - 2.15
55                               Large Chai latte - 2.60
57      Regular Speciality Tea - English breakfast - ...
122      Large Speciality Tea - English breakfast - 1.60
142                          Large Iced americano - 2.50
154                            Regular Chai latte - 2.30
237                         Regular Filter coffee - 1.50
Name: 5, dtype: object

In [109]:
# SHOWS PRICE ONLY [TESTING]
df['order'].str.split(' ').str[-1]

0      1.30
1      1.50
2      1.30
3      2.50
4      1.40
5      1.50
6      2.30
7      1.50
8      2.30
9      2.30
10     2.15
11     2.30
12     2.15
13     2.50
14     2.50
15     1.70
16     1.70
17     1.70
18     2.15
19     2.60
20     2.50
21     1.80
22     1.50
23     1.50
24     1.80
25     1.70
26     2.15
27     1.50
28     1.80
29     1.30
30     2.15
31     2.15
32     2.30
33     1.40
34     1.30
35     1.80
36     2.50
37     1.70
38     1.30
39     1.50
40     1.80
41     2.50
42     2.15
43     1.40
44     1.60
45     1.80
46     1.80
47     1.30
48     1.30
49     1.80
50     2.15
51     1.50
52     2.50
53     2.50
54     1.70
55     2.60
56     2.15
57     1.30
58     2.15
59     1.40
60     1.40
61     1.30
62     1.30
63     1.80
64     1.50
65     1.30
66     2.50
67     1.30
68     2.60
69     1.40
70     1.50
71     2.30
72     1.50
73     1.80
74     1.80
75     1.30
76     1.80
77     2.15
78     2.15
79     1.70
80     1.70
81     2.60
82     1.60
83  