# Consumer Behaviour data cleaning

## Install dependencies

In [None]:
! pip3 install pandas
! pip3 install matplotlib.pyplot
! pip3 install plotly.express
! pip3 install openpyxl
! pip3 install petl

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import petl as etl
import glob as glob
import os

## Import files from un-zipped folder

### csv files
- Using glob, we collect all the files in the 'data' folder ending in '.csv'
- We loop over each file:
    - read it into a frame
    - create a new 'store' column containing the file's basename with the '.csv' extension removed, to differentiate which store each row belongs to
    - append the frame to the csv_data list
- We use concatenation to bring all the files together
- Finally, we export the result to a csv file

In [None]:
# csv_data_import = glob.glob('data/*.csv')
# csv_data = []
# for csv in csv_data_import:
#     frame = pd.read_csv(csv)
#     frame['store'] = os.path.basename(csv).split(".")[0]
#     csv_data.append(frame)

# csv_final = pd.concat(csv_data, ignore_index=True)

# csv_final.to_csv('index_output/csv_final.csv')

### json files
- Using glob, we collect all the files in the 'data' folder ending in '.json'
- We loop over each file:
    - read it into a frame
    - create a new 'store' column containing the file's basename with the '.json' extension removed, to differentiate which store each row belongs to
    - append the frame to the json_data list
- We use concatenation to bring all the files together
- Finally, we export the result to a csv file

In [None]:
# json_data_import = glob.glob('data/*.json')
# print(json_data_import)

# json_data = []
# for json in json_data_import:
#     frame = pd.read_json(json)
#     frame['store'] = os.path.basename(json).split(".")[0]
#     json_data.append(frame)

# json_final = pd.concat(json_data)
# json_final.to_csv('index_output/json_final.csv')

## Import csv files into etl tables

In [None]:
# csv_table = etl.fromcsv('csv_final.csv')
# json_table = etl.fromcsv('json_final.csv')

## Table Transformation

### csv table
- As some of the columns have slightly different names but hold the same meaningful data, we use the etl.addfield function to create 'new_quantity' and 'new_item' columns, combining the values from the variously named columns into them
- With the new columns in place, we removed all the 'old' columns using etl.cutout
- In order to match the other tables and successfully join them, we need the column names and branch values to be the same.
    - we did this by converting all cells within the 'store' column: splitting at '_', dropping the first part and replacing the remaining '_' with spaces. ('2010-2020_bradford_branch' --> 'bradford branch')

In [None]:
# csv_table =etl.addfield(csv_table, 'new_quantity', lambda cell: cell['quantity'] + cell ['total_quantity'] + cell ['quantity_purchased'] + cell ['total_quantity_purchased'])
# csv_table =etl.addfield(csv_table, 'new_item', lambda cell: cell['item'] + cell ['product'] + cell ['sku'] )

In [None]:

# csv_table = etl.cutout(csv_table, 'quantity')
# csv_table = etl.cutout(csv_table, 'total_quantity')
# csv_table = etl.cutout(csv_table, 'quantity_purchased')
# csv_table = etl.cutout(csv_table, 'total_quantity_purchased')
# csv_table = etl.cutout(csv_table, 'item')
# csv_table = etl.cutout(csv_table, 'product')
# csv_table = etl.cutout(csv_table, 'sku')

# csv_table = etl.convert(csv_table, 'store', lambda cell: "_".join(cell.split('_')[1:]).replace("_", " ").strip())

### json table
- As some of the columns have slightly different names but hold the same meaningful data, we use the etl.addfield function to create 'new_quantity' and 'new_item' columns, combining the values from the variously named columns into them
- With the new columns in place, we removed all the 'old' columns using etl.cutout
- In order to match the other tables and successfully join them, we need the column names and branch values to be the same.
    - we did this by converting all cells within the 'store' column: splitting at '_', dropping the first part and replacing the remaining '_' with spaces. ('2010-2020_bradford_branch' --> 'bradford branch')

In [None]:
# json_table =etl.addfield(json_table, 'new_quantity', lambda cell: cell['quantity'] + cell ['total_quantity'] + cell ['quantity_purchased'] + cell ['total_quantity_purchased'])
# json_table =etl.addfield(json_table, 'new_item', lambda cell: cell['item'] + cell ['product'] + cell ['sku'] )

In [None]:
# json_table = etl.cutout(json_table, 'quantity')
# json_table = etl.cutout(json_table, 'total_quantity')
# json_table = etl.cutout(json_table, 'quantity_purchased')
# json_table = etl.cutout(json_table, 'total_quantity_purchased')
# json_table = etl.cutout(json_table, 'item')
# json_table = etl.cutout(json_table, 'product')
# json_table = etl.cutout(json_table, 'sku')

# json_table = etl.convert(json_table, 'store', lambda cell: "_".join(cell.split('_')[1:]).replace("_", " ").strip())

### We renamed 'new_item', 'new_quantity' & 'store' -> 'product', 'quantity' & 'branch_name' in both tables

In [None]:
# csv_table = etl.rename(csv_table, {'new_quantity': 'quantity','new_item':'product', 'store':'branch_name'})
# json_table = etl.rename(json_table, {'new_quantity': 'quantity','new_item':'product', 'store':'branch_name'})

### Imported store data from xlsx & product list from csv

In [None]:
# store_data_table = etl.fromxlsx('data/branch_list.xlsx')
# product_list = etl.fromcsv('data/products_list.csv')

### Join the store data table to both the csv and json tables using the 'branch_name'

In [None]:
# csv_table_with_store_data = etl.leftjoin(csv_table,store_data_table, key='branch_name')
# json_table_with_store_data = etl.leftjoin(json_table,store_data_table, key='branch_name')

### Using etl.cat, we concatenated the csv and json tables into 'data_transformation_df'

In [None]:
# data_transformation_df = etl.cat(csv_table_with_store_data,json_table_with_store_data)

### Join the product list table to the df using the 'product' key

In [None]:
# data_transformation_df = etl.leftjoin(data_transformation_df,product_list, key='product')


### Once joined, we exported the table to a csv so that we could work on it more quickly

In [None]:
# etl.tocsv(data_transformation_df, 'index_output/consumer_behaviour_df.csv')

### We import the csv from the step above and add some final touches
- We converted year, month, day, hour, established_on and amount_in_gbp from str to float so that we can keep working with them as numbers
- We converted quantity into an int
- We had two versions of the manufacturer, category and price columns (which we recently found were obsolete, as they contained the wrong data)
    - because of this, we used etl.cutout to remove the manufacturer, category and both price columns from the table


In [None]:
# consumer_behaviour_df = etl.fromcsv('consumer_behaviour_df.csv')

In [None]:
# consumer_behaviour_df= etl.convert(consumer_behaviour_df, 'year', float)
# consumer_behaviour_df= etl.convert(consumer_behaviour_df, 'month', float)
# consumer_behaviour_df= etl.convert(consumer_behaviour_df, 'day', float)
# consumer_behaviour_df= etl.convert(consumer_behaviour_df, 'hour', float)
# consumer_behaviour_df= etl.convert(consumer_behaviour_df, 'established_on', float)
# consumer_behaviour_df= etl.convert(consumer_behaviour_df, 'amount_in_gbp', float)
# consumer_behaviour_df= etl.convert(consumer_behaviour_df, 'quantity', int)



# consumer_behaviour_df = etl.cutout(consumer_behaviour_df, 'manufactuter')
# consumer_behaviour_df = etl.cutout(consumer_behaviour_df, 'category')
# consumer_behaviour_df = etl.cutout(consumer_behaviour_df, 'price')
# consumer_behaviour_df = etl.cutout(consumer_behaviour_df, 'price')

### Using the .look(), we can check that the table is how we want it before final export


In [None]:
# consumer_behaviour_df.look()

### We export the final table to csv, ready to be used in the dash app


In [None]:
# etl.tocsv(consumer_behaviour_df, 'index_output/final_consumer_behaviour_df.csv')

### import branch_expenses.xlsx and save as a csv

In [None]:
# Open the branch expenses workbook as a petl table; the export cell that
# follows reuses this name.
branch_expenses = etl.fromxlsx('data/branch_expenses.xlsx')
# Last expression of the cell, so the notebook renders the table preview.
etl.look(branch_expenses)

In [None]:
# Make sure the output folder exists so the export also works on a fresh
# checkout; writing into a missing directory would otherwise fail.
os.makedirs('index_output', exist_ok=True)
# Save the branch expenses table as csv for use outside this notebook.
etl.tocsv(branch_expenses, 'index_output/branch_expenses.csv')

### In order to use the exported files, reduce the file size and check that the code is plotting the right data, I continued my cleaning in './testing_out.ipynb'