In [2]:
# import packages
import pandas as pd
from pathlib import Path
from IPython.display import display

# Locate the raw data file. The path is relative, so it resolves against the
# current working directory of the kernel running this notebook.
RAW_DIR = Path("../data/raw")
RAW_FILE = RAW_DIR / "online_retail_II.csv"

# Load the raw transactions into a DataFrame.
# - encoding: the file is read as ISO-8859-1 (presumably it contains
#   non-UTF-8 characters -- TODO confirm against the source file).
# - dtype: Invoice and StockCode are identifiers, not quantities. StockCode
#   holds alphanumeric codes (e.g. "79323P", "POST"). Pinning both columns to
#   str keeps every value a string instead of leaving the type to chunk-wise
#   inference, which can otherwise yield an object column that mixes ints and
#   strings on a large file.
retail = pd.read_csv(
    RAW_FILE,
    encoding="ISO-8859-1",
    dtype={"Invoice": str, "StockCode": str},
)

In [3]:
# -------------------------------------------------------------
# DATA OVERVIEW -- first look at the raw `retail` table
# -------------------------------------------------------------

# Row previews: the first ten rows, the last five, and five rows drawn
# at random with a fixed seed so the sample is the same on every run.
display(
    retail.head(10),
    retail.tail(5),
    retail.sample(5, random_state=42),
)

# Table dimensions, followed by the list of column names.
print("Shape (rows, cols):", retail.shape)
display(retail.columns.to_frame(name="column_name"))

# Per-column dtype, then per-column null counts (most missing first).
display(
    retail.dtypes.to_frame(name="dtype"),
    retail.isnull().sum().sort_values(ascending=False).to_frame(name="missing_count"),
)

# Descriptive statistics of the numeric columns, one row per column.
display(retail.describe().transpose())

# Number of distinct values in each column (highest cardinality first).
display(retail.nunique().sort_values(ascending=False).to_frame(name="n_unique"))

# Ten most frequent values of Country and of Description.
display(
    retail["Country"].value_counts().iloc[:10].to_frame(name="count"),
    retail["Description"].value_counts().iloc[:10].to_frame(name="count"),
)


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
5,489434,22064,PINK DOUGHNUT TRINKET POT,24,2009-12-01 07:45:00,1.65,13085.0,United Kingdom
6,489434,21871,SAVE THE PLANET MUG,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
7,489434,21523,FANCY FONT HOME SWEET HOME DOORMAT,10,2009-12-01 07:45:00,5.95,13085.0,United Kingdom
8,489435,22350,CAT BOWL,12,2009-12-01 07:46:00,2.55,13085.0,United Kingdom
9,489435,22349,"DOG BOWL , CHASING BALL DESIGN",12,2009-12-01 07:46:00,3.75,13085.0,United Kingdom


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France
1067370,581587,POST,POSTAGE,1,2011-12-09 12:50:00,18.0,12680.0,France


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
455941,532657,21314,SMALL GLASS HEART TRINKET POT,12,2010-11-14 11:10:00,2.1,14562.0,United Kingdom
826291,563214,22383,LUNCH BAG SUKI DESIGN,2,2011-08-14 12:56:00,1.65,16370.0,United Kingdom
191636,507597,22561,WOODEN SCHOOL COLOURING SET,12,2010-05-10 13:21:00,1.65,17700.0,United Kingdom
25864,491634,21588,RETRO SPOT GIANT TUBE MATCHES,1,2009-12-11 15:40:00,2.55,17841.0,United Kingdom
73233,496007,85232B,SET/3 RUSSIAN DOLL STACKING TINS,3,2010-01-28 12:32:00,4.95,15203.0,United Kingdom


Shape (rows, cols): (1067371, 8)


Unnamed: 0,column_name
Invoice,Invoice
StockCode,StockCode
Description,Description
Quantity,Quantity
InvoiceDate,InvoiceDate
Price,Price
Customer ID,Customer ID
Country,Country


Unnamed: 0,dtype
Invoice,object
StockCode,object
Description,object
Quantity,int64
InvoiceDate,object
Price,float64
Customer ID,float64
Country,object


Unnamed: 0,missing_count
Customer ID,243007
Description,4382
Invoice,0
StockCode,0
Quantity,0
InvoiceDate,0
Price,0
Country,0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,1067371.0,9.938898,172.705794,-80995.0,1.0,3.0,10.0,80995.0
Price,1067371.0,4.649388,123.553059,-53594.36,1.25,2.1,4.15,38970.0
Customer ID,824364.0,15324.638504,1697.46445,12346.0,13975.0,15255.0,16797.0,18287.0


Unnamed: 0,n_unique
Invoice,53628
InvoiceDate,47635
Customer ID,5942
Description,5698
StockCode,5305
Price,2807
Quantity,1057
Country,43


Unnamed: 0_level_0,count
Country,Unnamed: 1_level_1
United Kingdom,981330
EIRE,17866
Germany,17624
France,14330
Netherlands,5140
Spain,3811
Switzerland,3189
Belgium,3123
Portugal,2620
Australia,1913


Unnamed: 0_level_0,count
Description,Unnamed: 1_level_1
WHITE HANGING HEART T-LIGHT HOLDER,5918
REGENCY CAKESTAND 3 TIER,4412
JUMBO BAG RED RETROSPOT,3469
ASSORTED COLOUR BIRD ORNAMENT,2958
PARTY BUNTING,2765
STRAWBERRY CERAMIC TRINKET BOX,2613
LUNCH BAG BLACK SKULL.,2529
JUMBO STORAGE BAG SUKI,2434
HEART OF WICKER SMALL,2319
JUMBO SHOPPER VINTAGE RED PAISLEY,2297


## Missing Customer IDs

`Customer ID` is missing for 243,007 of the 1,067,371 rows (about 22.8%).
Possible explanations:
- Guest checkout
- Incomplete transaction records

The decision on how to handle missing `Customer ID` values will be made
in the data cleaning stage, based on the analysis goals.
