# Python Pandas Functions

Please run the imports cell first. Keep in mind that some blocks need the `csv` variable (the absolute path to a raw-data CSV file) to be set and the DataFrame `df` to be loaded before they will work.

In [None]:
# Imports
import pandas as pd                                                                                 # Loads Pandas package
import os                                                                                           # For OS related paths 

#### 1. ABSOLUTE PATH FOR RAW DATA   

##### Takes: raw_data_file, which should be the name of one of the CSV files in the raw_data folder
##### Returns: the absolute path to the raw_data folder plus the file name given in raw_data_file, stored in a global variable named csv

In [None]:
# 1. ABSOLUTE PATH FOR RAW DATA   

def absolute_path_for_raw_data(raw_data_file):
    """Build the absolute path of a file inside the ``../raw_data`` folder.

    The folder is resolved relative to the current working directory and the
    file name is appended with the platform's own path separator, so the
    result is valid on Windows, Linux and macOS.

    Args:
        raw_data_file: Name of the file, e.g. "leeds_01-01-2020_09-00-00.csv".

    Returns:
        The absolute path as a string. The same value is also stored in the
        global variable ``csv``, which the other cells read.
    """
    # NOTE: the global keeps the name ``csv`` because the rest of the notebook
    # reads it; it would shadow the stdlib ``csv`` module if that were imported.
    global csv
    csv = os.path.join(os.path.abspath("../raw_data"), raw_data_file)
    return csv


In [None]:
# 1. [TEST RUN] ABSOLUTE PATH FOR RAW DATA

# Pick the raw data file to use: Chesterfield or Leeds (comment one out).
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file)  # Stores the absolute path in the global variable csv.
print(csv) # Prints the newly formed absolute path with the raw_data_file name given.


### #LOADS (EXTRACT) DATAFRAME

#### Loads the df with headers .... just put outside any function

In [None]:
#LOADS (EXTRACT) DATAFRAME
# Requires the global csv path, which absolute_path_for_raw_data sets.
columns = ['date_time', 'Location', 'full_name', 'order', 'amount', 'payment_type', 'card_number']  # Headers for the orders csv files
# header=None: the first line of the file is read as data; the column labels come from `columns`.
df = pd.read_csv(csv, header=None, names=columns)

### 2. SANITISE CSV (Remove given Columns) 

#### Takes: csv (absolute path to the file), and a list of the columns to be dropped in a variable called sanitise_these_columns
#### Returns: the sanitised dataframe as df


In [None]:
# 2. SANITISE CSV (Remove given Columns) 

def sanitise_csv(csv,sanitise_these_columns):
    """Remove the given columns from the loaded DataFrame.

    Args:
        csv: Absolute path of the source file. Kept for interface
            compatibility; the function works on the global ``df`` that is
            already loaded and does not read this path.
        sanitise_these_columns: List of column labels to remove.

    Returns:
        The sanitised DataFrame. The global ``df`` is rebound to it as well.

    Raises:
        KeyError: If any requested column is not present in ``df``.
    """
    global df  # rebound so the sanitised frame is visible outside the function
    # drop() returns a new frame, so it has to be assigned back; otherwise the
    # columns that were meant to be removed are still in the returned data.
    df = df.drop(columns=sanitise_these_columns)
    return df

In [None]:
# 2. [TEST RUN] SANITISE CSV 

# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

columns = ['date_time', 'Location', 'full_name', 'order', 'amount', 'payment_type', 'card_number']  # Headers for the orders csv files

absolute_path_for_raw_data(raw_data_file)  # sets the global csv variable with the absolute path

sanitise_these_columns = ['full_name', 'card_number']

# NOTE(review): df is not reloaded in this cell; sanitise_csv works on the df that is already loaded.
result = sanitise_csv(csv,sanitise_these_columns)
result.head(5)

### 3. FILTER BY COLUMN VALUE 

#### Takes: csv(absolute path to filename), column variable with the column header id, value to check
#### Returns: the rows in which a column contains a given value AS contain_values


In [None]:
# 3. FILTER BY COLUMN VALUE 

def filter_by_column_value(csv,column,value):
    """Return the rows of the loaded DataFrame whose column contains a value.

    The search term is upper-cased before matching and is compared as plain
    text, not as a regular expression. Cells with missing values never match.

    Args:
        csv: Absolute path of the source file. Kept for interface
            compatibility; the function works on the global ``df``.
        column: Label of the (text) column to search.
        value: Text to look for; it is upper-cased before the search.

    Returns:
        A DataFrame with the matching rows. It is also stored in the global
        ``contain_values``.
    """
    global contain_values  # global so it can be inspected outside the function
    # regex=False: characters such as "." or "(" in the value are matched literally.
    # na=False: missing cells count as "no match" instead of breaking the boolean mask.
    matches = df[column].str.contains(value.upper(), regex=False, na=False)
    contain_values = df[matches]
    return contain_values

In [None]:
# 3. [TEST RUN] FILTER BY COLUMN VALUE

# Current Files Chesterfield or Leeds (comment one out)
# NOTE(review): raw_data_file is assigned here but nothing in this cell uses it:
# absolute_path_for_raw_data is not called and df is not reloaded, so the filter
# runs on whichever df is currently loaded.
raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
# raw_data_file = "leeds_01-01-2020_09-00-00.csv"

column = "payment_type"  
value = "card"

filter_by_column_value(csv,column,value)  # also stores the matching rows in the global contain_values
contain_values.head(10)

### 4. COUNT NUMBER OF DIFFERENT VALUES IN COLUMN 

#### Takes: csv(absolute path to filename), column variable with the column header id,
#### Returns: the count of the different values contained in a given column

In [None]:
# 4. COUNT NUMBER OF DIFFERENT VALUES IN COLUMN 

def count_number_of_different_values(csv,column):
    """Count how many times each distinct value appears in a column.

    Args:
        csv: Absolute path of the source file. Kept for interface
            compatibility; the function works on the global ``df``.
        column: Label of the column to analyse.

    Returns:
        A Series indexed by the distinct values, holding their counts in
        ascending order.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    # No file is opened here, so the former FileNotFoundError handler could
    # never run (and would have left the result undefined if it had).
    return df[column].value_counts(ascending=True)

In [None]:
# 4. [TEST RUN] COUNT NUMBER OF DIFFERENT VALUES IN COLUMN 

# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file) # sets the global csv variable with the absolute path

column = "payment_type" 

# NOTE(review): df is not reloaded in this cell; the counts come from the df that is already loaded.
result = count_number_of_different_values(csv,column)
print(str(result))

### 5. COUNT NUMBER OF TIMES A VALUE IS REPEATED
# 
#### Takes: csv(absolute path to filename), column variable with the column header id, value to check
#### Returns: a message with the number of times a single value is repeated in a given column


In [None]:
# 5. COUNT NUMBER OF TIMES A VALUE IS REPEATED

def count_number_of_times_a_value_is_repeated(csv,column,value):
    """Report how often a value appears in one column of a csv file.

    The file is read without a header row, using the global ``columns`` list
    as column labels.

    Args:
        csv: Path of the csv file to read.
        column: Label of the column to inspect.
        value: Value whose occurrences are counted.

    Returns:
        A message string: the count when the value is found, or an
        explanation when the file is missing or the lookup raises KeyError.
    """
    try:
        orders = pd.read_csv(csv, header=None, names=columns)
        frequencies = orders[column].value_counts()
        count = frequencies[value]
    except FileNotFoundError as fnfe:
        return f'File not found: {fnfe}'
    except KeyError:
        # Raised by either lookup above: the column label or the value is absent.
        return f'No ocurrences of {value} in {column}'

    return f"the value: {value} was found {str(count)} times."

In [None]:
# 5. [TEST RUN] COUNT NUMBER OF TIMES A VALUE IS REPEATED

# Current Files Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file) # sets the global csv variable with the absolute path

column = "payment_type"
value = "CASH" 

# The function reads the file at csv itself, using the global `columns` list as headers.
result = count_number_of_times_a_value_is_repeated(csv,column,value)

print(result)



### 6. SAVE DATA FRAME TO FILE AS CSV

#### Takes: path for the desired save path, newfile for the desired new file name (.csv extension added at runtime)
#### Returns: nothing; it writes a csv file containing the DF to the desired location

In [None]:
# 6. SAVE DATA FRAME TO FILE AS CSV

def save_df_to_csv(path,newfile):
    """Write the global ``df`` to ``<path>/<newfile>.csv`` without a header row.

    The target folder is created when it does not exist. File-system errors
    are reported on stdout instead of being raised.

    Args:
        path: Destination folder, with or without a trailing separator. An
            empty string means the current working directory.
        newfile: File name without extension; ".csv" is appended.
    """
    # os.path.join copes with a missing trailing slash; plain concatenation
    # would turn "results" + "newfile" into "resultsnewfile.csv".
    target = os.path.join(path, newfile + ".csv")
    try:
        if path:  # makedirs("") raises, and the current directory already exists
            os.makedirs(path, exist_ok=True)
        df.to_csv(target, header=False)
    except OSError as err:
        print(f'Saving operation could not be completed: {err}')


In [None]:
# 6. [TEST RUN] SAVE DATA FRAME TO FILE AS CSV
# Saves whichever df is currently loaded.
path = "results/" # to add folder/subfolder/ if needed (keep the trailing slash)
newfile = "newfile" # file name without extension; ".csv" is appended by the function

save_df_to_csv(path,newfile)

### 7. SAVE DATAFRAME TO FILE AS JSON

#### Takes: path for the desired save path, newfile for the desired new file name (.json extension added at runtime)
#### Returns: nothing; it writes a JSON file containing the DF to the desired location

In [None]:
# 7. SAVE DATAFRAME TO FILE AS JSON

def save_df_to_json(path,newfile):
    """Write the global ``df`` to ``<path>/<newfile>.json`` as JSON lines.

    Each row becomes one JSON object on its own line. The target folder is
    created when it does not exist, matching ``save_df_to_csv``. File-system
    errors are reported on stdout instead of being raised.

    Args:
        path: Destination folder, with or without a trailing separator. An
            empty string means the current working directory.
        newfile: File name without extension; ".json" is appended.
    """
    target = os.path.join(path, newfile + ".json")
    # Serialise first, so a failure here cannot leave a truncated file behind.
    payload = df.to_json(orient='records', lines=True)  # add compression='gzip' to get a zip file (and change newfile extension)
    try:
        if path:  # makedirs("") raises, and the current directory already exists
            os.makedirs(path, exist_ok=True)
        with open(target, 'w', encoding='utf-8') as f:
            f.write(payload)
    except OSError as err:
        print(f'Saving operation could not be completed: {err}')
    


In [None]:
# 7. [TEST RUN] SAVE DATAFRAME TO FILE AS JSON
# Saves whichever df is currently loaded.

path = "results/" # to add folder/subfolder/ if needed (keep the trailing slash)
newfile = "newfile" # file name without extension; ".json" is appended by the function

save_df_to_json(path,newfile)

### GET UNIQUE PRODUCTS IN ORDERS
#### Returns the unique products found across the orders (as a pandas Series)

In [None]:

# GET UNIQUE PRODUCTS IN ORDERS

def get_unique_products_in_orders():
    """Collect the distinct products that appear in the ``order`` column.

    Every order is a comma-separated list of products. All orders of the
    global ``df`` are split, the surrounding blanks are removed and each
    product is kept once, in order of first appearance.

    Returns:
        A Series with one entry per distinct product. It is also stored in
        the global ``result``.

    Raises:
        KeyError: If ``df`` has no ``order`` column.
    """
    global result  # global so it can be inspected outside the function

    # explode() puts every product of every order into a single Series, so
    # products in any position of an order are considered, not just the last.
    products = df['order'].str.split(',').explode().str.strip()
    products = products.dropna()          # rows whose order was missing
    products = products[products != ""]   # e.g. caused by a trailing comma

    result = products.drop_duplicates().reset_index(drop=True)
    return result


In [None]:
# [TEST RUN] GET UNIQUE PRODUCTS IN ORDERS


# The Current Raw csv Files are: Chesterfield or Leeds (comment one out)
raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
# raw_data_file = "leeds_01-01-2020_09-00-00.csv"


absolute_path_for_raw_data(raw_data_file)                                                           # Sets the global variable csv with the absolute path for the raw_data_file name [Run this block if fails]
columns = ['date_time', 'Location', 'full_name', 'order', 'amount', 'payment_type', 'card_number']  # Headers for the DF
df = pd.read_csv(csv, header=None, names=columns)                                                   # Creates the DF

result = ""  # Placeholder; get_unique_products_in_orders assigns the global result.


get_unique_products_in_orders()
result

In [None]:
#SEPARATE PRODUCTS IN ORDERS (string)


# Sample input: one value as found in a row of df['order'].
# order_products = "Regular Flavoured iced latte - Hazelnut - 2.75, Large Latte - 2.45"
order_products = "Large Hot Chocolate - 1.70, Regular Hot Chocolate - 1.40, Large Chai latte - 2.60, Regular Chai latte - 2.30, Regular Speciality Tea - English breakfast - 1.30"

# LOGIC
# Each comma-separated chunk describes one product; its fields are separated by dashes.
# - Exactly one dash: the text before it is the product name, the text after it is the price.
# - Otherwise: the first two fields form the product name and the third field is the price.

chunks = order_products.split(',')

for dashes in chunks:
    fields = dashes.split('-')

    if len(fields) == 2:
        name, cost = fields
        print(name)
        print("Price: " + cost)
        continue

    # The name itself contains a dash, so glue its two halves back together.
    stripped = fields[0] + "-" + fields[1]
    print(stripped)
    print("Price: " + fields[2])


# We could use this to easily separate flavors from the drinks as well  they will be in: dashes.split('-')[1] assuming the Tea Types as flavors... #
# but that's another table to relate and probably won't be worth the trouble.  

In [None]:
# SEPARATE 

#SEPARATE PRODUCTS IN ORDERS WITH FLAVORS (string)


# Sample input: one value as found in a row of df['order'].
order_products = "Regular Flavoured iced latte - Hazelnut - 2.75, Large Latte - 2.45"
# order_products = "Large Hot Chocolate - 1.70, Regular Hot Chocolate - 1.40, Large Chai latte - 2.60, Regular Chai latte - 2.30, Regular Speciality Tea - English breakfast - 1.30"

# LOGIC
# Each comma-separated chunk describes one product; its fields are separated by dashes.
# - Two fields:   product name, price.
# - More fields:  product name, flavor (everything in between), price.
# The price is always the last field and the product name the first one.

chunks = order_products.split(',')

for segment in chunks:
    # Strip every field so the blanks left around commas and dashes are removed.
    fields = [field.strip() for field in segment.split('-')]

    if len(fields) < 2:
        # No dash at all: there is no price to separate from the name.
        print(f"Could not parse: {segment.strip()}")
        continue

    if len(fields) == 2:
        product, price = fields

        print(f"Product: {product}")
        print(f"Price: {price}")

    else:
        product = fields[0]
        flavor = " - ".join(fields[1:-1])
        price = fields[-1]

        print(f"Product: {product}")
        print(f"Flavor: {flavor}")
        print(f"Price: {price}")

# With this one we can have better statistics and also count the number of flavors.
# Product, flavor and price no longer carry the blanks that followed commas and dashes.

### GET NUMBER OF ROWS OF THE DATA FRAME
#### We can use any of the following methods to get the number of rows in a data frame

In [None]:
#GET NUMBER OF ROWS OF THE DATA FRAME

# Alternatives (note that count() only counts non-missing values, so it can be lower than the row count):
# len(df.index)
# df[df.columns[0]].count()
df.shape[0]  # first element of (rows, columns)

### Check if a file exist

In [None]:
#CHECKS IF A FILE EXIST 

# csv is the global path set by absolute_path_for_raw_data.
file_exists = os.path.exists(csv)
print(file_exists)

### SORTING THE DATE FOR POSTGRE'S [YYYY-MM-DD H:MM:SS] FORMAT

In [None]:

# SORTING THE DATE FOR POSTGRE'S YYYY-MM-DD H:MM:SS FORMAT

# Current Files are Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"


def sort_time_to_postgre_format():
    """Convert the ``date_time`` column of the global ``df`` to datetimes.

    The values are parsed day-first. Despite the name, the rows are not
    reordered; only the column's values are converted, in place.

    Returns:
        The same global ``df``, with its ``date_time`` column converted.
    """
    parsed = pd.to_datetime(df['date_time'], dayfirst=True)
    df['date_time'] = parsed
    return df



In [None]:
# [TEST RUN] SORTING THE DATE FOR POSTGRE'S YYYY-MM-DD H:MM:SS FORMAT

# Current Files are Chesterfield or Leeds (comment one out)
# raw_data_file = "chesterfield_25-08-2021_09-00-00.csv"
raw_data_file = "leeds_01-01-2020_09-00-00.csv"

absolute_path_for_raw_data(raw_data_file)  # Sets the global csv variable with the absolute path
columns = ['date_time', 'Location', 'full_name', 'order', 'amount', 'payment_type', 'card_number']  # Headers for the DF
# NOTE(review): the reload below is commented out, so the conversion is applied to
# whichever df is already loaded, not necessarily the file selected above.
# df = pd.read_csv(csv, header=None, names=columns)                                                   # Creates the DF

sort_time_to_postgre_format()  # Converts df['date_time'] in place
df 