In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface; the bare
# `matplotlib` package does not expose plot()/subplots()/show().
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sqlalchemy import create_engine

In [2]:
# Load the online-retail CSV export into a DataFrame named `df`
df = pd.read_csv(filepath_or_buffer='../data/online_retail.csv')

In [3]:
# Quick look at the raw data: first rows, summary statistics, schema and
# column names, printed in that order.
# df.info() writes its report directly to stdout and returns None, so it is
# called as a plain statement; interpolating it into a string would emit the
# report out of order and leave a literal "None" in the printed text.
print(df.head())
print(df.describe())
df.info()
print(df.columns)
      

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLL

In [4]:
# Clean the transactions in a single chained expression. Every step returns a
# new frame and the final .copy() gives `df` its own data, so column
# assignments on `df` afterwards do not raise SettingWithCopyWarning.
df = (
    df
    # Drop rows with a missing invoice number or customer ID
    .dropna(subset=['InvoiceNo', 'CustomerID'])
    # Convert invoice date to datetime format
    .assign(InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']))
    # Keep only strictly positive quantities and unit prices
    # (zeros are removed as well as negatives)
    .loc[lambda frame: (frame['Quantity'] > 0) & (frame['UnitPrice'] > 0)]
    .copy()
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 397884 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    397884 non-null  object        
 1   StockCode    397884 non-null  object        
 2   Description  397884 non-null  object        
 3   Quantity     397884 non-null  int64         
 4   InvoiceDate  397884 non-null  datetime64[ns]
 5   UnitPrice    397884 non-null  float64       
 6   CustomerID   397884 non-null  float64       
 7   Country      397884 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.3+ MB


In [5]:
# Add a per-row revenue column (unit price x quantity).
# .assign() returns a new frame rather than writing into `df` in place, which
# avoids SettingWithCopyWarning when `df` is the result of a filtered
# selection and keeps this cell safe to re-run.
df = df.assign(Revenue=df['UnitPrice'] * df['Quantity'])

# Total revenue over all rows currently in `df`
df['Revenue'].sum()

np.float64(8911407.904)

In [6]:
# Total revenue per product description, sorted from highest to lowest
product_revenue = (
    df
    .groupby('Description')['Revenue']
    .sum()
    .sort_values(ascending=False)
)

# Show the 20 highest-revenue descriptions
product_revenue.head(20)

Description
PAPER CRAFT , LITTLE BIRDIE           168469.60
REGENCY CAKESTAND 3 TIER              142592.95
WHITE HANGING HEART T-LIGHT HOLDER    100448.15
JUMBO BAG RED RETROSPOT                85220.78
MEDIUM CERAMIC TOP STORAGE JAR         81416.73
POSTAGE                                77803.96
PARTY BUNTING                          68844.33
ASSORTED COLOUR BIRD ORNAMENT          56580.34
Manual                                 53779.93
RABBIT NIGHT LIGHT                     51346.20
CHILLI LIGHTS                          46286.51
PAPER CHAIN KIT 50'S CHRISTMAS         42660.83
PICNIC BASKET WICKER 60 PIECES         39619.50
BLACK RECORD COVER FRAME               39064.55
JUMBO BAG PINK POLKADOT                37289.59
DOORMAT KEEP CALM AND COME IN          35913.85
SPOTTY BUNTING                         35539.25
WOOD BLACK BOARD ANT WHITE FINISH      34478.01
SET OF 3 CAKE TINS PANTRY DESIGN       33347.80
JAM MAKING SET WITH JARS               32662.97
Name: Revenue, dtype: float64

In [7]:
# Total spending per customer ID, sorted from highest to lowest
customer_spendings = (
    df
    .groupby('CustomerID')['Revenue']
    .sum()
    .sort_values(ascending=False)
)

# Show the 20 highest-spending customers
customer_spendings.head(20)

CustomerID
14646.0    280206.02
18102.0    259657.30
17450.0    194550.79
16446.0    168472.50
14911.0    143825.06
12415.0    124914.53
14156.0    117379.63
17511.0     91062.38
16029.0     81024.84
12346.0     77183.60
16684.0     66653.56
14096.0     65164.79
13694.0     65039.62
15311.0     60767.90
13089.0     58825.83
17949.0     58510.48
15769.0     56252.72
15061.0     54534.14
14298.0     51527.30
14088.0     50491.81
Name: Revenue, dtype: float64

In [8]:
# Interactive plotly bar chart of the 20 highest-revenue products.
# Revenue is rounded to the nearest thousand and drives both the bar height
# and the bar colour.
top_products = product_revenue.head(20)
rounded_revenue = top_products.round(-3).values

fig = px.bar(
    x=top_products.index,
    y=rounded_revenue,
    color=rounded_revenue,
    labels={'x': 'Product', 'y': 'Revenue'},
    title='Top Selling Products',
)
# Tick labels on the x-axis are switched off; update_layout returns the
# figure, so it is rendered as the cell output.
fig.update_layout(xaxis=dict(showticklabels=False))

In [9]:
# Interactive plotly bar chart of the 20 highest-spending customers.
top_customers = customer_spendings.head(20)
rounded_spending = top_customers.round().values

fig = px.bar(
    # Customer IDs are identifiers, not quantities; show them as whole numbers
    x=top_customers.index.astype(int),
    y=rounded_spending,
    # Colour encodes the same customer spending figures as the bar heights
    color=rounded_spending,
    labels={'x': 'CustomerID', 'y': 'Spending'},
    title='Top Customers',
)
# Force a categorical x-axis so each customer gets one evenly spaced bar in
# spending order, instead of bars positioned at their numeric ID values.
# update_layout returns the figure, so it is rendered as the cell output.
fig.update_layout(xaxis=dict(type='category', showticklabels=False))