#### Import Packages

In [3]:
# dataset retrieved from https://archive.ics.uci.edu/dataset/352/online+retail
import pandas as pd
import numpy as np
import plotly.express as px

#### Import Data

In [31]:
file_path = "data/Online_Retail.csv"

# Load the raw transactions into `df`.
# InvoiceNo and StockCode are identifiers, not numbers: they mix digit-only
# values with alphanumeric ones (e.g. a 'C'-prefixed invoice, stock code
# '85123A'). Pinning them to str stops pandas from inferring a numeric or
# mixed-type column (for example when only a small sample is loaded via
# `nrows`), which would break the later `.str` operations on these columns.
df = pd.read_csv(
    file_path,
    header=0,
    # nrows=5,  # uncomment to load only a small sample while debugging
    dtype={"InvoiceNo": str, "StockCode": str},
    parse_dates=["InvoiceDate"],
    date_format="%d/%m/%y %H:%M",  # timestamps are stored as DD/MM/YY HH:MM
)

# Quick inspection helpers (disabled):
# print(df.info())
# print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB
None
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    

#### Account for null values
- 1,454 missing values in Description (these rows are dropped)
- 135,080 missing values in CustomerID, likely guest purchases (rows kept; 133,626 remain after the Description drop)

540,455 rows left after dropping NA values in description

In [32]:
# Tally the missing values per column once and reuse the tally for both the
# per-column report and the grand total.
na_per_column = df.isna().sum()

print("Number of NA values in each column:")
print(na_per_column)

total_na = na_per_column.sum()
print(f"Total number of NA values in the DataFrame: {total_na}")

# Discard every row that has no product description.
df = df.dropna(subset=["Description"])

# Report the per-column gaps that remain after the drop.
remaining_na = df.isna().sum()
print(remaining_na)
# print(len(df))

Number of NA values in each column:
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64
Total number of NA values in the DataFrame: 136534
InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     133626
Country             0
dtype: int64


#### Split Dataset
- valid_sales (531167 rows)
- cancellations (9288 rows)


In [33]:
# Separate cancellations from valid sales: a cancelled invoice is identified
# by an InvoiceNo that starts with 'C'.
# The mask is built once and reused for both halves so they are guaranteed to
# be complementary. Casting to str first keeps the check working even if
# InvoiceNo was read as a numeric or mixed-type column.
is_cancellation = df['InvoiceNo'].astype(str).str.startswith('C')

valid_sales = df[~is_cancellation]  # ~ inverts the boolean mask
cancellations = df[is_cancellation]

# print(len(valid_sales))
# print(len(cancellations))

#### Valid Sales

In [55]:
# Exploratory check (disabled): list the rows in each split whose StockCode
# has fewer than 5 characters.
# print(cancellations[cancellations['StockCode'].str.len() < 5])
# print(valid_sales[valid_sales['StockCode'].str.len() < 5])

#### Cleaning valid_sales
- Rows with a quantity of 0 or less removed / 474 removed
- Rows whose StockCode is shorter than 5 characters (non-product codes such as POST, M, etc.) removed / 2,312 removed
- Rows with an invalid price (£0 or less) removed / 578 removed

527,803 rows left

In [34]:
# print(valid_sales.info())
rows_before = len(valid_sales)
print(rows_before, "rows before cleaning")  # 531,167 rows initially

# One boolean mask per cleaning rule. Applied one after another in the
# original run, the rules removed 474, 2,312 and 578 rows respectively.
has_valid_quantity = valid_sales['Quantity'] >= 1       # at least one unit sold
# NOTE(review): the length test is a heuristic for "real product" codes;
# confirm that no longer non-product codes slip through it.
has_item_stockcode = valid_sales['StockCode'].str.len() >= 5
has_valid_price = valid_sales['UnitPrice'] > 0          # price above £0

# Keep only the rows that satisfy all three rules.
valid_sales = valid_sales[has_valid_quantity & has_item_stockcode & has_valid_price]

rows_after = len(valid_sales)
print(rows_after, "rows after cleaning")  # 527,803 rows left

531167 rows before cleaning
527803 rows after cleaning


##### Sales Analysis
Revenue Analysis (By Month)

In [39]:
# Add two derived columns to valid_sales:
#   Revenue   - revenue of each invoice line (Quantity * UnitPrice)
#   YearMonth - monthly period bucket of the invoice date (YYYY-MM)
# `.assign` returns a new frame instead of writing columns into a filtered
# slice, which avoids pandas' SettingWithCopyWarning and keeps this cell safe
# to re-run.
valid_sales = valid_sales.assign(
    Revenue=valid_sales['Quantity'] * valid_sales['UnitPrice'],
    YearMonth=valid_sales['InvoiceDate'].dt.to_period('M'),
)

# print(valid_sales.head())
# Total revenue per monthly bucket.
monthly_sales = valid_sales.groupby('YearMonth')['Revenue'].sum().reset_index()
monthly_sales['YearMonth'] = monthly_sales['YearMonth'].astype(str)  # convert to str for plotly

# NOTE(review): the last bucket may cover only part of a month; confirm the
# dataset's end date before comparing it with the full months.
print(monthly_sales)

# Define the title once so the figure cannot end up with two diverging texts.
chart_title = "Total Revenue by Month"

fig_monthly_sales = px.line(
    monthly_sales,
    x='YearMonth',
    y='Revenue',
    title=chart_title,
    labels={'YearMonth': 'Date', 'Revenue': 'Revenue (Pounds Sterling)'},
    markers=True,
)

# Centre the title: x=0.5 is the horizontal midpoint (0 is left, 1 is right)
# and the 'center' anchor aligns the middle of the title with that point.
fig_monthly_sales.update_layout(title_x=0.5, title_xanchor='center')

fig_monthly_sales.show()

   YearMonth      Revenue
0    2010-12   791564.690
1    2011-01   672007.410
2    2011-02   508952.870
3    2011-03   691485.700
4    2011-04   516324.790
5    2011-05   741290.740
6    2011-06   738876.880
7    2011-07   689397.740
8    2011-08   725605.160
9    2011-09  1030530.361
10   2011-10  1106716.490
11   2011-11  1457775.530
12   2011-12   615516.600
