In [None]:
# Report the directory this notebook is running from, followed by a blank line.
import os
print("Project Workspace:", os.getcwd(), end = "\n\n")

# Import all necessary packages for the project.
import pandas as pan
import statistics as myStats
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.cluster import DBSCAN as sklearnDBSCAN
import seaborn as sb
import glob
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from matplotlib import pyplot as plt

# Remove scientific notation: pandas displays every float in fixed-point
# form with four decimal places (the '%.4f' format below).
pan.set_option('display.float_format', lambda x: '%.4f' % x)

In [None]:
# From the "Sale Date" data, we get the day, month and year columns for each transaction.
def fetchMonthAndYearColumns(dataFile):
    """Replace the "Sale_Date" column with "Day", "Month" and "Year" columns.

    Each "Sale_Date" value is split on "-" and the pieces are read as
    year, month, day (in that order), i.e. values are assumed to look like
    "YYYY-MM-DD" -- TODO confirm against the raw CSV files. The pieces are
    kept as strings (e.g. month "01"), not converted to numbers.

    Parameters:
        dataFile: DataFrame with a string column "Sale_Date".

    Returns:
        A new DataFrame with "Day", "Month" and "Year" appended (in that
        order) and "Sale_Date" removed. The frame passed in is not modified.
        Missing dates, or dates with fewer than three pieces, give missing
        values in the affected columns.
    """
    # Split once and reuse the pieces instead of re-splitting per column.
    dateParts = dataFile["Sale_Date"].str.split("-")
    dataFile = dataFile.assign(
        Day = dateParts.str[2],
        Month = dateParts.str[1],
        Year = dateParts.str[0],
    )
    # drop() returns a new frame; its result has to be kept for the column
    # to actually disappear.
    return dataFile.drop(columns = ["Sale_Date"])

# Aggregate the data for each year by "UniSA Customer Number" to get the
# average monthly "No of Trips", "Total Sale Amount Inclusive of GST" and
# "Quantity Sold" for each customer.
def aggregateYearData(dataFile, minMonthlyTrips = 4, maxMonthlyTrips = 31):
    """Aggregate line-item sales into one row per (year, customer).

    The data is rolled up in stages: receipt -> day -> month -> year.
    The shape of the intermediate dataset is printed after the day, month
    and year stages.

    Parameters:
        dataFile: DataFrame with the columns "Day", "Month", "Year",
            "UniSA_Receipt_No1", "UniSA_Customer_No",
            "Total_Sale_Amount_InclusiveGST" and "Quantity_Sold".
        minMonthlyTrips: customer-months with fewer trips than this are
            discarded before the yearly average is taken.
        maxMonthlyTrips: customer-months with more trips than this are
            discarded before the yearly average is taken.

    Returns:
        DataFrame with the columns "Year", "UniSA_Customer_No",
        "Average_Monthly_Trips", "Average_Sale_Amount" and
        "Average_Quantity_Sold". The trips and quantity averages are rounded
        to whole numbers. The frame passed in is not modified.
    """
    # Drop all rows with "NA" for "UniSA Customer Number".
    dataFile = dataFile.dropna(subset = ["UniSA_Customer_No"])
    # Ensure that the minimum value for "Quantity Sold" is "1". clip() builds
    # a new column, so this does not rely on chained assignment (which warns
    # and is a no-op when pandas Copy-on-Write is enabled).
    dataFile = dataFile.assign(Quantity_Sold = dataFile["Quantity_Sold"].clip(lower = 1))
    # First aggregate over a day. Per receipt, the sale amount is averaged
    # (presumably the receipt total is repeated on every line item -- verify
    # against the raw data) and the quantities are summed.
    dataFile = dataFile.groupby(
        by = ["Day", "Month", "Year", "UniSA_Receipt_No1", "UniSA_Customer_No"],
        as_index = False,
    ).agg({"Total_Sale_Amount_InclusiveGST": "mean", "Quantity_Sold": "sum"})
    # Each receipt counts as one trip; receipts are counted per customer-day.
    dataFile = dataFile.groupby(
        by = ["Day", "Month", "Year", "UniSA_Customer_No"],
        as_index = False,
    ).agg({
        "UniSA_Receipt_No1": "count",
        "Total_Sale_Amount_InclusiveGST": "sum",
        "Quantity_Sold": "sum",
    }).rename(columns = {
        "UniSA_Receipt_No1": "No_of_Trips",
        "Total_Sale_Amount_InclusiveGST": "Total_Sale_Amount",
    })
    print("Size of Dataset:", dataFile.shape)
    # Then aggregate over a month.
    dataFile = dataFile.groupby(
        by = ["Month", "Year", "UniSA_Customer_No"],
        as_index = False,
    ).agg({
        "No_of_Trips": "sum",
        "Total_Sale_Amount": "sum",
        "Quantity_Sold": "sum",
    }).rename(columns = {
        "No_of_Trips": "Monthly_Total_Trips",
        "Quantity_Sold": "Total_Quantity_Sold",
    })
    print("Size of Dataset:", dataFile.shape)
    # Keep only customer-months whose trip count lies in the accepted range
    # (both bounds inclusive).
    dataFile = dataFile[dataFile["Monthly_Total_Trips"].between(minMonthlyTrips, maxMonthlyTrips)]
    # And finally aggregate over a year
    dataFile = dataFile.groupby(
        by = ["Year", "UniSA_Customer_No"],
        as_index = False,
    ).agg({
        "Monthly_Total_Trips": "mean",
        "Total_Sale_Amount": "mean",
        "Total_Quantity_Sold": "mean",
    }).rename(columns = {
        "Monthly_Total_Trips": "Average_Monthly_Trips",
        "Total_Sale_Amount": "Average_Sale_Amount",
        "Total_Quantity_Sold": "Average_Quantity_Sold",
    })
    dataFile["Average_Monthly_Trips"] = dataFile["Average_Monthly_Trips"].round()
    dataFile["Average_Quantity_Sold"] = dataFile["Average_Quantity_Sold"].round()
    print("Size of Dataset:", dataFile.shape)
    return dataFile

# Aggregate the data for each month by "UniSA Customer Number" and "Month of Sale"
# to get the daily averages of "No of Trips", "Quantity Sold" and the
# "Total Sale Amount Inclusive of GST".
def aggregateMonthData(dataFile, maxDailyTrips = 5):
    """Aggregate line-item sales into one row per (month, year, customer).

    The data is rolled up in stages: receipt -> day -> month. The monthly
    figures are averages over the days on which the customer made a purchase.

    Parameters:
        dataFile: DataFrame with the columns "Day", "Month", "Year",
            "UniSA_Receipt_No1", "UniSA_Customer_No",
            "Total_Sale_Amount_InclusiveGST" and "Quantity_Sold".
        maxDailyTrips: customer-days with more trips than this are discarded
            before the monthly average is taken. The default of 5 keeps the
            original cut-off (fewer than 6 trips in a day).

    Returns:
        DataFrame with the columns "Month", "Year", "UniSA_Customer_No",
        "Daily_Average_Trips", "Average_Sale_Amount" and
        "Average_Quantity_Sold". The trips and quantity averages are rounded
        to whole numbers. The frame passed in is not modified.
    """
    # Drop all rows with "NA" for "UniSA Customer Number".
    dataFile = dataFile.dropna(subset = ["UniSA_Customer_No"])
    # Ensure that the minimum value for "Quantity Sold" is "1". clip() builds
    # a new column, so this does not rely on chained assignment (which warns
    # and is a no-op when pandas Copy-on-Write is enabled).
    dataFile = dataFile.assign(Quantity_Sold = dataFile["Quantity_Sold"].clip(lower = 1))
    # First aggregate over a day. Per receipt, the sale amount is averaged
    # (presumably the receipt total is repeated on every line item -- verify
    # against the raw data) and the quantities are summed.
    dataFile = dataFile.groupby(
        by = ["Day", "Month", "Year", "UniSA_Receipt_No1", "UniSA_Customer_No"],
        as_index = False,
    ).agg({"Total_Sale_Amount_InclusiveGST": "mean", "Quantity_Sold": "sum"})
    # Each receipt counts as one trip; receipts are counted per customer-day.
    dataFile = dataFile.groupby(
        by = ["Day", "Month", "Year", "UniSA_Customer_No"],
        as_index = False,
    ).agg({
        "UniSA_Receipt_No1": "count",
        "Total_Sale_Amount_InclusiveGST": "sum",
        "Quantity_Sold": "sum",
    }).rename(columns = {
        "UniSA_Receipt_No1": "No_of_Trips",
        "Total_Sale_Amount_InclusiveGST": "Total_Sale_Amount",
    })
    # Eliminate all customer-days with more than "maxDailyTrips" trips.
    # NOTE(review): an earlier comment mentioned 7 trips while the code kept
    # fewer than 6; the code's behaviour is preserved -- confirm the intent.
    dataFile = dataFile[dataFile["No_of_Trips"] <= maxDailyTrips]
    # Then aggregate over a month.
    dataFile = dataFile.groupby(
        by = ["Month", "Year", "UniSA_Customer_No"],
        as_index = False,
    ).agg({
        "No_of_Trips": "mean",
        "Total_Sale_Amount": "mean",
        "Quantity_Sold": "mean",
    }).rename(columns = {
        "No_of_Trips": "Daily_Average_Trips",
        "Total_Sale_Amount": "Average_Sale_Amount",
        "Quantity_Sold": "Average_Quantity_Sold",
    })
    dataFile["Daily_Average_Trips"] = dataFile["Daily_Average_Trips"].round()
    dataFile["Average_Quantity_Sold"] = dataFile["Average_Quantity_Sold"].round()
    return dataFile

# Import the data from every CSV file of a year and collate it. For this, we
# create a function "importAndCollate()".
def importAndCollate(dataDirectory = "<insert_year>"):
    """Read every "*.csv" file in a directory into one DataFrame.

    Parameters:
        dataDirectory: directory holding the CSV files of one year. The
            default is the notebook's placeholder and has to be replaced by
            (or overridden with) a real path.

    Returns:
        A single DataFrame with the rows of all files, read in sorted
        file-name order and renumbered with a fresh 0..n-1 index so that
        row labels are unique across files. The shape is also printed.

    Raises:
        ValueError: if no CSV file is found in "dataDirectory".
    """
    # Sort the matches: glob returns them in arbitrary order, which would
    # make the row order differ between runs and machines.
    fileNames = sorted(glob.glob(dataDirectory + "/*.csv", recursive = True))
    if not fileNames:
        raise ValueError("No CSV files found in '%s'" % dataDirectory)
    dataFileList = [pan.read_csv(fileName) for fileName in fileNames]
    yearDataFile = pan.concat(dataFileList, ignore_index = True)
    print("Size of Dataset:", yearDataFile.shape)
    return yearDataFile

In [None]:
# Import all data for a year and collate it in one dataframe.
# importAndCollate() also prints the shape of the collated dataset.
yearDataFile = importAndCollate()

In [None]:
# Split the "Sale_Date" column into three separate columns, one each
# for the "Day", "Month" and "Year" of the sale.
yearDataFile = fetchMonthAndYearColumns(yearDataFile)

In [None]:
# Aggregate the data per customer and month to get the "Daily_Average_Trips",
# "Average_Sale_Amount" and "Average_Quantity_Sold" for each
# "UniSA Customer Number", then preview the first 10 rows.
aggregatedMonthlyData = aggregateMonthData(yearDataFile)
aggregatedMonthlyData.head(10)

In [None]:
# Let's explore the different customer segments for a single month
# using different clustering techniques. For starters, we consider
# only one month's data. In this case January (Month == "01").
# Each "UniSA Customer Number" is a point in three dimensions:
# "Average Number of Trips made Daily", "Average Sale Amount", and
# "Average Quantity Sold".

# Before we proceed, let's plot our data to get an idea of the different
# customer segments that may exist. Our data is three dimensional, but
# we need to plot it in two dimensions. For this we'll use Principal
# Component Analysis. Before that we normalize our data.

# reset_index() returns a new frame numbered 0..n-1, so later column
# assignments do not write into a slice of "aggregatedMonthlyData".
aggregatedJanuaryData = aggregatedMonthlyData[aggregatedMonthlyData["Month"] == "01"].reset_index(drop = True)
aggregatedJanuaryDataForPCA = aggregatedJanuaryData[['Daily_Average_Trips', 'Average_Sale_Amount', 'Average_Quantity_Sold']]
# Standardize every feature column at once: subtract the column mean and
# divide by the sample standard deviation (pandas' std() uses n - 1, the
# same definition as statistics.stdev).
aggregatedJanuaryDataForPCA = (aggregatedJanuaryDataForPCA - aggregatedJanuaryDataForPCA.mean()) / aggregatedJanuaryDataForPCA.std()

# We create an object for Principal Component Analysis (PCA) and project
# the standardized features onto the first two components.
pCA = sklearnPCA(n_components = 2)
pCAJanuaryData = pan.DataFrame(pCA.fit_transform(aggregatedJanuaryDataForPCA))
sb.scatterplot(x = pCAJanuaryData[0], y = pCAJanuaryData[1])

In [None]:
# The PCA scatterplot shows distinct groups of straight lines, of
# which one group might be outliers. To deal with such data, we use the
# DBSCAN Clustering method for customer segmentation.

# Fit on the three feature columns only, so that re-running this cell does
# not feed the "Cluster" column added below back into the clustering.
dbscanClustering = sklearnDBSCAN(eps = 3, min_samples = 10).fit(
    aggregatedJanuaryDataForPCA[['Daily_Average_Trips', 'Average_Sale_Amount', 'Average_Quantity_Sold']]
)
# Shift the labels by one: DBSCAN marks noise points with -1, so after the
# shift cluster 0 holds the outliers and real clusters start at 1.
clusterNumbers = (dbscanClustering.labels_ + 1).tolist()
# assign() returns new frames, which avoids writing into a slice of another
# dataframe and keeps the cell re-runnable.
aggregatedJanuaryDataForPCA = aggregatedJanuaryDataForPCA.assign(Cluster = clusterNumbers)
# Give the two principal components string names (no-op if already renamed).
pCAJanuaryData = pCAJanuaryData.rename(columns = {0: "0", 1: "1"})
pCAJanuaryData = pCAJanuaryData.assign(Cluster = clusterNumbers)
aggregatedJanuaryData = aggregatedJanuaryData.assign(Cluster = clusterNumbers)
scatterPlot = sb.scatterplot(x = "0", y = "1", hue = "Cluster", data = pCAJanuaryData)

In [None]:
# The DBSCAN run for January produced 4 clusters, one of which (cluster 0)
# only collects the outliers. Those records are removed from all three
# January dataframes before any further analysis.
outlierRecordIndices = pCAJanuaryData.index[pCAJanuaryData["Cluster"] == 0].tolist()
pCAJanuaryData = pCAJanuaryData.drop(index = outlierRecordIndices)
aggregatedJanuaryDataForPCA = aggregatedJanuaryDataForPCA.drop(index = outlierRecordIndices)
aggregatedJanuaryData = aggregatedJanuaryData.drop(index = outlierRecordIndices)
print("Number of records for PCA plot of January Data:", aggregatedJanuaryDataForPCA.shape[0])
print("Number of records for January Data:", aggregatedJanuaryData.shape[0])

In [None]:
# Now that we have our three clusters, let's check the distributions of the
# following variables in each of these clusters: "Average Number of Trips
# made per day", "Average Sale Amount per day" and "Average Quantity
# Sold to the Customer per day". Each plot is drawn on its own figure,
# with one box per value of the "Cluster" column.

# Let's create box plots for "Average Number of Trips made per day".
plt.figure()
boxPlotAverageNoOfTripsJanuary = sb.boxplot(x = "Cluster", y = "Daily_Average_Trips", data = aggregatedJanuaryData)
boxPlotAverageNoOfTripsJanuary.set(xlabel = "Segment No.", ylabel = "Average No. of Trips per Day")

# Let's create box plots for "Average Sale Amount".
plt.figure()
boxPlotAverageSaleAmountJanuary = sb.boxplot(x = "Cluster", y = "Average_Sale_Amount", data = aggregatedJanuaryData)
boxPlotAverageSaleAmountJanuary.set(xlabel = "Segment No.", ylabel = "Average Spending per Day")

# Let's create box plots for "Average Quantity Sold to the Customer".
plt.figure()
boxPlotAverageQuantitySoldJanuary = sb.boxplot(x = "Cluster", y = "Average_Quantity_Sold", data = aggregatedJanuaryData)
boxPlotAverageQuantitySoldJanuary.set(xlabel = "Segment No.", ylabel = "Average No. of Products Purchased per Day")

In [None]:
# Aggregate the data per customer and year to get the "Average_Monthly_Trips",
# "Average_Sale_Amount" and "Average_Quantity_Sold" for each
# "UniSA Customer Number", then preview the first 10 rows.
# aggregateYearData() prints the dataset shape after each aggregation stage.
aggregatedYearlyData = aggregateYearData(yearDataFile)
aggregatedYearlyData.head(10)

In [None]:
# Let's explore the different customer segments for the entire year
# using different clustering techniques. Each "UniSA Customer Number"
# is a point in three dimensions: "Average Number of Trips made in a Month",
# "Average Sale Amount", and "Average Quantity Sold".

# Before we proceed, let's plot our data to get an idea of the different
# customer segments that may exist. Our data is three dimensional, but
# we need to plot it in two dimensions. For this we'll use Principal
# Component Analysis. Before that we normalize our data.
aggregatedYearlyDataForPCA = aggregatedYearlyData[["Average_Monthly_Trips", "Average_Sale_Amount", "Average_Quantity_Sold"]]
# Standardize every feature column at once: subtract the column mean and
# divide by the sample standard deviation (pandas' std() uses n - 1, the
# same definition as statistics.stdev). The arithmetic builds a new frame,
# so nothing is written into a slice of "aggregatedYearlyData".
aggregatedYearlyDataForPCA = (aggregatedYearlyDataForPCA - aggregatedYearlyDataForPCA.mean()) / aggregatedYearlyDataForPCA.std()

# We create an object for Principal Component Analysis (PCA) and project
# the standardized features onto the first two components.
pCA = sklearnPCA(n_components = 2)
pCAYearData = pan.DataFrame(pCA.fit_transform(aggregatedYearlyDataForPCA))
pCAYearData.columns = ["0", "1"]
# Label the rows of all three frames with the customer number.
# NOTE(review): these labels are only unique if every customer appears in a
# single year -- confirm that the input covers one year.
pCAYearData.index = aggregatedYearlyData["UniSA_Customer_No"]
aggregatedYearlyDataForPCA.index = aggregatedYearlyData["UniSA_Customer_No"]
aggregatedYearlyData.index = aggregatedYearlyData["UniSA_Customer_No"]
sb.scatterplot(x = pCAYearData["0"], y = pCAYearData["1"])

In [None]:
# The PCA scatterplot shows distinct groups of straight lines, of
# which one group might be outliers. To deal with such data, we use the
# DBSCAN Clustering method for customer segmentation.

# Fit on the three feature columns only, so that re-running this cell does
# not feed the "Cluster" column added below back into the clustering.
dbscanClustering = sklearnDBSCAN(eps = 3, min_samples = 10).fit(
    aggregatedYearlyDataForPCA[["Average_Monthly_Trips", "Average_Sale_Amount", "Average_Quantity_Sold"]]
)
# Shift the labels by one: DBSCAN marks noise points with -1, so after the
# shift cluster 0 holds the outliers and real clusters start at 1.
clusterNumbers = (dbscanClustering.labels_ + 1).tolist()
# assign() returns new frames, which avoids writing into a slice of another
# dataframe and keeps the cell re-runnable. The PCA columns were already
# named "0" and "1" when "pCAYearData" was created, so no rename is needed.
aggregatedYearlyDataForPCA = aggregatedYearlyDataForPCA.assign(Cluster = clusterNumbers)
pCAYearData = pCAYearData.assign(Cluster = clusterNumbers)
aggregatedYearlyData = aggregatedYearlyData.assign(Cluster = clusterNumbers)
scatterPlot = sb.scatterplot(x = "0", y = "1", hue = "Cluster", data = pCAYearData)

In [None]:
# It is evident that there is one outlier which will cause problems while
# clustering our data. For this reason, we remove the outlier(s) belonging
# to cluster "0" from all three yearly dataframes.

# Filter by row position instead of dropping by label: the frames are
# labelled by customer number, and dropping a label would remove every row
# carrying it, including non-outlier rows of the same customer. The three
# frames hold the same rows in the same order, so one mask fits all.
outlierMask = (pCAYearData["Cluster"] == 0).to_numpy()
outlierRecordIndices = pCAYearData.index[outlierMask].tolist()
# Show which customers are removed (a bare expression in the middle of a
# cell is silently discarded).
print("Outlier customers removed:", outlierRecordIndices)
pCAYearData = pCAYearData[~outlierMask].copy()
aggregatedYearlyDataForPCA = aggregatedYearlyDataForPCA[~outlierMask].copy()
aggregatedYearlyData = aggregatedYearlyData[~outlierMask].copy()
scatterPlot = sb.scatterplot(x = "0", y = "1", hue = "Cluster", data = pCAYearData)

In [None]:
# We can see that there are no distinct groups in the data. Let's
# try using Hierarchical Clustering methods.

# Cluster on the two principal components only. "pCAYearData" also carries
# the DBSCAN "Cluster" label, which is not a feature and must not enter the
# Ward distance computation.
linked = linkage(pCAYearData[["0", "1"]], method = "ward")

# One dendrogram leaf per row, labelled with the row's index value.
labelList = pCAYearData.index.tolist()

plt.figure(figsize = (10, 8))
# Branches that merge below a distance of 55 share a colour.
heirarchicalClustering = dendrogram(linked, orientation = "left", labels = labelList, distance_sort = "descending", show_leaf_counts = True, color_threshold = 55)
plt.show()

In [None]:
# We see that there are three possible segments of customers for the year.
# We need to extract the segment number for each customer by cutting the
# tree at the same distance (55) that was used to colour the dendrogram.
# "heirarchicalClustering" is rebound here from the dendrogram output to the
# array of segment numbers.
# NOTE(review): per the SciPy docs "depth" seems to apply only to the
# "inconsistent" criterion, so it is presumably ignored here -- confirm.
heirarchicalClustering = fcluster(linked, t = 55, depth = 4, criterion = "distance")

# Next, we add these segment details to our aggregated data.
aggregatedYearlyData["Segment_No"] = heirarchicalClustering

# Let's check the distribution of "UniSA Customer Numbers" based on "Average Number
# of Trips made per Month", "Average Sale Amount for the Month" and "Average Quantity
# Sold." Each plot is drawn on its own figure, with one box per segment.

# Let's create box plots for "Average Number of Trips made in the month".
plt.figure()
boxPlotAverageNoOfTrips = sb.boxplot(x = "Segment_No", y = "Average_Monthly_Trips", data = aggregatedYearlyData)
boxPlotAverageNoOfTrips.set(xlabel = "Segment No.", ylabel = "Average No. of Trips per Month")

# Let's create box plots for "Average Sale Amount".
plt.figure()
boxPlotAverageSaleAmount = sb.boxplot(x = "Segment_No", y = "Average_Sale_Amount", data = aggregatedYearlyData)
boxPlotAverageSaleAmount.set(xlabel = "Segment No.", ylabel = "Average Spending per Month")

# Let's create box plots for "Average Quantity Sold to the Customer".
plt.figure()
boxPlotAverageQuantitySold = sb.boxplot(x = "Segment_No", y = "Average_Quantity_Sold", data = aggregatedYearlyData)
boxPlotAverageQuantitySold.set(xlabel = "Segment No.", ylabel = "Average No. of Products Purchased per Month")