#### Import Packages

In [59]:
# dataset retrieved from https://archive.ics.uci.edu/dataset/352/online+retail
import pandas as pd
import numpy as np
import plotly.express as px

#### Import Data

In [60]:
# Path to the raw Online Retail CSV, relative to the notebook's working directory.
file_path = "data/Online_Retail.csv"

# Reader settings kept together so they can be tweaked in one place:
#   header      - row 0 of the file supplies the column names
#   parse_dates - InvoiceDate is converted to datetimes while reading
#   date_format - account for DD/MM/YY HH:MM
read_options = {
    "header": 0,
    "parse_dates": ["InvoiceDate"],
    "date_format": "%d/%m/%y %H:%M",
    # "nrows": 5,  # enable to load only a few rows while debugging
}

df = pd.read_csv(file_path, **read_options)

# print(df.info())
# print(df.head())

#### Account for null values
- 1,454 null values in `Description` (these rows are dropped)
- 135,080 null values in `CustomerID` (rows kept; likely guest customers)

540,455 rows left after dropping NA values in description

In [61]:
# Count the missing values per column once, then reuse the result for both
# the per-column report and the overall total.
na_per_column = df.isna().sum()

print("Number of NA values in each column:")
print(na_per_column)

total_na = na_per_column.sum()
print(f"Total number of NA values in the DataFrame: {total_na}")

# Keep only the rows that have a description.
df = df[df['Description'].notna()]

# Re-check the NA counts after the drop.
print(df.isna().sum())
# print(len(df))

Number of NA values in each column:
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64
Total number of NA values in the DataFrame: 136534
InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     133626
Country             0
dtype: int64


#### Split Dataset
- valid_sales (531167 rows)
- cancellations (9288 rows)


In [62]:
# Separate valid sales from cancellations: cancellation invoices are the ones
# whose InvoiceNo starts with 'C'. The flag is computed once and reused;
# on a boolean Series `~` is an element-wise logical NOT.
is_cancellation = df['InvoiceNo'].str.startswith('C')

# .copy() makes each subset an independent DataFrame rather than a slice of df,
# so writing columns to them later in the notebook (e.g. overwriting
# cancellations['Quantity']) does not raise pandas' SettingWithCopyWarning.
valid_sales = df[~is_cancellation].copy()
cancellations = df[is_cancellation].copy()

# print(len(valid_sales))
# print(len(cancellations))

#### Valid Sales

In [63]:
# print(cancellations[cancellations['StockCode'].str.len() < 5])
# print(valid_sales[valid_sales['StockCode'].str.len() < 5])

#### Cleaning valid_sales
- Rows with a quantity of 0 or less removed (474 rows)
- Rows whose StockCode is shorter than 5 characters, i.e. codes that describe postage and similar charges rather than items (e.g. POST, M), removed (2,312 rows)
- Rows with an invalid price (£0 or less) removed (578 rows)

527,803 rows left

In [80]:
# print(valid_sales.info())
rows_before = len(valid_sales)
print(rows_before, "rows before cleaning")

# One boolean flag per validity rule; a sale is kept only if it passes all three.
has_valid_quantity = valid_sales['Quantity'] >= 1            # quantity of at least 1
has_item_stock_code = valid_sales['StockCode'].str.len() >= 5  # stockcodes for valid items
has_valid_price = valid_sales['UnitPrice'] > 0               # price above £0

# Apply the three rules in a single pass; the surviving rows are the same as
# when the filters are applied one after another.
valid_sales = valid_sales[has_valid_quantity & has_item_stock_code & has_valid_price]

rows_after = len(valid_sales)
print(rows_after, "rows after cleaning")

527803 rows before cleaning
527803 rows after cleaning


#### Cleaning cancellations

In [81]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
cancellations.info()
print(cancellations.head())
print(len(cancellations), "rows before cleaning")

# Convert all quantities to positive values. .assign() returns a new DataFrame
# instead of writing into a slice of df, which avoids pandas'
# SettingWithCopyWarning.
cancellations = cancellations.assign(Quantity=cancellations['Quantity'].abs())
print(len(cancellations))  # row count is unchanged by this step

# Keep stockcodes for valid items (at least 5 characters long).
cancellations = cancellations[cancellations['StockCode'].str.len() >= 5]
print(len(cancellations))

# Remove any items with a price of £0 or less.
cancellations = cancellations[cancellations['UnitPrice'] > 0]
print(len(cancellations))

<class 'pandas.core.frame.DataFrame'>
Index: 8761 entries, 154 to 541717
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   InvoiceNo    8761 non-null   object        
 1   StockCode    8761 non-null   object        
 2   Description  8761 non-null   object        
 3   Quantity     8761 non-null   int64         
 4   InvoiceDate  8761 non-null   datetime64[ns]
 5   UnitPrice    8761 non-null   float64       
 6   CustomerID   8539 non-null   float64       
 7   Country      8761 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 616.0+ KB
None
    InvoiceNo StockCode                        Description  Quantity  \
154   C536383    35004C    SET OF 3 COLOURED  FLYING DUCKS         1   
235   C536391     22556     PLASTERS IN TIN CIRCUS PARADE         12   
236   C536391     21984   PACK OF 12 PINK PAISLEY TISSUES         24   
237   C536391     21983   PAC

##### Sales Analysis
Revenue Analysis (By Month)

In [65]:
# Add the two derived columns with .assign(), which returns a new DataFrame
# instead of writing into the filtered valid_sales slice (direct column
# assignment on that slice triggers pandas' SettingWithCopyWarning).
#   Revenue   - revenue earned from each sale (Quantity * UnitPrice)
#   YearMonth - monthly bucket of InvoiceDate, in format YYYY-MM
valid_sales = valid_sales.assign(
    Revenue=valid_sales['Quantity'] * valid_sales['UnitPrice'],
    YearMonth=valid_sales['InvoiceDate'].dt.to_period('M'),
)

# Sum up the revenue within each YearMonth bucket.
monthly_sales = valid_sales.groupby('YearMonth')['Revenue'].sum().reset_index()
monthly_sales['YearMonth'] = monthly_sales['YearMonth'].astype(str)  # convert to str for plotly

# Identify the best/worst performing month.
# NOTE: despite the "_index" suffix, these variables hold the full row
# (YearMonth and Revenue) of the month with the highest / lowest revenue.
highest_revenue_index = monthly_sales.loc[monthly_sales['Revenue'].idxmax()]
lowest_revenue_index = monthly_sales.loc[monthly_sales['Revenue'].idxmin()]
print(highest_revenue_index)
print(lowest_revenue_index)

print("Average revenue:", monthly_sales['Revenue'].mean())

fig_monthly_sales = px.line(
    monthly_sales,
    x='YearMonth', y='Revenue',
    labels={'YearMonth': 'Date', 'Revenue': 'Revenue (Pounds Sterling)'},
    markers=True,
)

# Set the title once, centred above the plot.
fig_monthly_sales.update_layout(
    title={
        'text': "Total Revenue by Month",
        'x': 0.5,                   # x position of title (0 is left, 1 is right)
        'xanchor': 'center'         # Center of title aligned with x=0.5
    }
)

fig_monthly_sales.show()

YearMonth       2011-11
Revenue      1457775.53
Name: 11, dtype: object
YearMonth      2011-02
Revenue      508952.87
Name: 2, dtype: object
Average revenue: 791234.2277692307


Revenue Analysis (By Country)

In [None]:

# print(valid_sales['Country'].unique())
# print(valid_sales[valid_sales['Country'] == "Unspecified"]) # 446 items aren't associated with any country
# Per-country summary: total revenue, total units sold and number of distinct
# invoices. Requires the Revenue column added to valid_sales in the monthly
# revenue cell.
country_analysis = (
    valid_sales
    .groupby('Country')
    .agg(
        TotalRevenue=('Revenue', 'sum'),
        TotalQuantity=('Quantity', 'sum'),
        TotalInvoices=('InvoiceNo', 'nunique'),
    )
    .reset_index()
)

# Calculate Average Order Value (AOV) i.e. average revenue per transaction
country_analysis['AOV'] = country_analysis['TotalRevenue'] / country_analysis['TotalInvoices']

# Sort countries by revenue (desc)
country_analysis = country_analysis.sort_values(by='TotalRevenue', ascending=False)

# Top 10 countries by revenue
top_ten_revenue = country_analysis.head(10)

# print(top_ten_revenue)

top_ten_revenue_fig = px.bar(
    top_ten_revenue,
    x='Country', y='TotalRevenue',
    labels={'Country': 'Country', 'TotalRevenue': 'Revenue (Pounds Sterling)'},
    color='Country',
    text_auto=True,
)

# Give the chart a centred title so the figure can be read on its own.
top_ten_revenue_fig.update_layout(
    title={
        'text': "Top 10 Countries by Total Revenue",
        'x': 0.5,
        'xanchor': 'center'
    }
)

top_ten_revenue_fig.show()

           Country  TotalRevenue  TotalQuantity  TotalInvoices          AOV
36  United Kingdom   8762173.561        4654402          17914   489.124347
24     Netherlands    283889.340         200258             93  3052.573548
10            EIRE    271164.300         147062            282   961.575532
14         Germany    205569.890         118139            443   464.040384
13          France    184582.740         111269            382   483.200890
0        Australia    138171.310          83900             56  2467.344821
31           Spain     55725.110          27731             88   633.239886
33     Switzerland     53087.900          30527             50  1061.758000
20           Japan     37416.370          26016             19  1969.282632
3          Belgium     36927.340          22962             98   376.809592


Top Average Order Values (Top 10)

In [74]:
# Re-rank the per-country summary by Average Order Value (highest first).
country_analysis = country_analysis.sort_values(by='AOV', ascending=False)

# Top 10 countries by AOV
# NOTE(review): AOV is TotalRevenue / TotalInvoices, so countries with only a
# handful of invoices can rank highly here - consider a minimum invoice count.
top_ten_AOV = country_analysis.head(10)

# print(top_ten_AOV)

top_ten_aov_fig = px.bar(
    top_ten_AOV,
    x='Country', y='AOV',
    # "Pounds Sterling" matches the wording of the revenue charts' axis labels.
    labels={'Country': 'Country', 'AOV': 'Average Order Value (Pounds Sterling)'},
    color='Country',
    text_auto=True,
)

# Give the chart a centred title so the figure can be read on its own.
top_ten_aov_fig.update_layout(
    title={
        'text': "Top 10 Countries by Average Order Value",
        'x': 0.5,
        'xanchor': 'center'
    }
)

top_ten_aov_fig.show()