In [7]:
# import the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
import os
import sys 

In [8]:
# import transactions dataset
df = pd.read_csv('data/transactions.csv')
print(df.head())

# import auxiliary datasets (customer list, demographics, addresses)
df_customer_list = pd.read_csv('data/new_customer_list.csv')
df_demographics = pd.read_csv('data/customer_demographics.csv')
df_addresses = pd.read_csv('data/customer_addresses.csv')

   transaction_id  product_id  customer_id transaction_date online_order  \
0               1           2         2950       25/02/2017        False   
1               2           3         3120       21/05/2017         True   
2               3          37          402       16/10/2017        False   
3               4          88         3135       31/08/2017        False   
4               5          78          787       01/10/2017         True   

  order_status           brand product_line product_class product_size  \
0     Approved           Solex     Standard        medium       medium   
1     Approved   Trek Bicycles     Standard        medium        large   
2     Approved      OHM Cycles     Standard           low       medium   
3     Approved  Norco Bicycles     Standard        medium       medium   
4     Approved  Giant Bicycles     Standard        medium        large   

   list_price standard_cost  product_first_sold_date  
0       71.49        $53.62                

# Part 1 - Descriptive Statistics
- Calculate mean, median, and mode of transaction amounts.
- Integrate Probability Density Function (PDF) to model the probability distribution.
- Compute the expected value of transaction amounts.

For transaction amounts, we have two parameters: 
- Standard Cost (SC) and
- List Price (LP).

List Price is the price at which a product or service is advertised or listed for sale. It's also sometimes referred to as the "sticker price" or the "retail price." The list price may not always reflect the actual price paid by customers, as discounts or promotions may apply.

Standard cost is the predetermined cost of manufacturing a product or providing a service, based on factors such as materials, labor, and overhead. It represents the expected cost under normal conditions and is used for planning, budgeting, and evaluating performance. Standard cost can serve as a benchmark against which actual costs are compared to assess efficiency and variance.

In [9]:
# Calculate mean, median, and mode of transaction amounts.

# list_price
mean = df['list_price'].mean()
median = df['list_price'].median()
mode = df.mode()['list_price']

print(f"Mean: {mean}, Median: {median}, Mode: {mode}")

# standard_cost

# remove $ sign and convert to float
df['standard_cost'] = df['standard_cost'].replace('[\$,]', '', regex=True).astype(float) 

mean_std_cost = df['standard_cost'].mean()
median_std_cost = df['standard_cost'].median()
mode_std_cost = df['standard_cost'].mode()

print(f"Mean: {mean_std_cost}, Median: {median_std_cost}, Mode: {mode_std_cost}")

Mean: 1107.8294489999998, Median: 1163.89, Mode: 0        2091.47
1            NaN
2            NaN
3            NaN
4            NaN
          ...   
19995        NaN
19996        NaN
19997        NaN
19998        NaN
19999        NaN
Name: list_price, Length: 20000, dtype: float64
Mean: 556.0469512209312, Median: 507.58, Mode: 0    388.92
Name: standard_cost, dtype: float64


In [12]:
mode = df.mode(axis=0)
mode.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,0.0,1068.0,14/02/2017,True,Approved,Solex,Standard,medium,medium,2091.47,388.92,33879.0
1,2,,2183.0,18/08/2017,,,,,,,,,
2,3,,2476.0,,,,,,,,,,
3,4,,,,,,,,,,,,
4,5,,,,,,,,,,,,
