In [1]:
#Data retrieved from: https://www.kaggle.com/datasets/carrie1/ecommerce-data

In [93]:
#Import Dependencies
import pandas as pd
import numpy as np

In [94]:
#Load the raw e-commerce transactions from the local csv file into 'data'
data = pd.read_csv(filepath_or_buffer='ecommerce_data.csv')

## Examine Data

In [95]:
#Preview the first five rows to see the layout of the dataframe
data.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [96]:
#Count the missing (null/NaN) values in each column
data.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [97]:
#Count the non-null entries in each column
#(a column with missing values shows fewer entries than the total number of rows)
data.count(axis=0)

InvoiceNo      541909
StockCode      541909
Description    540455
Quantity       541909
InvoiceDate    541909
UnitPrice      541909
CustomerID     406829
Country        541909
dtype: int64

In [98]:
#Inspect the data type pandas assigned to each column when reading the csv
#('InvoiceDate' is listed as 'object' here, i.e. it has not been parsed as a datetime yet)
data.dtypes

InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID     float64
Country         object
dtype: object

In [99]:
#Report how many distinct values the key identifying columns contain
#(nunique() ignores missing values, so null CustomerIDs are not counted)
print(f'There are {data["InvoiceNo"].nunique()} unique invoice numbers\n')
print(f'There are {data["InvoiceDate"].nunique()} unique invoice dates/times\n')
print(f'There are {data["CustomerID"].nunique()} unique customer IDs\n')
print(f'There are {data["Country"].nunique()} unique countries')

There are 25900 unique invoice numbers

There are 23260 unique invoice dates/times

There are 4372 unique customer IDs

There are 38 unique countries


## Cleaning & Reformatting

In [105]:
#Copy the dataframe in order to clean and save it
#A plain assignment (data_clean = data) would only create a second name for the
#same dataframe, so every cleaning step would also modify the original 'data'.
#.copy() gives an independent dataframe and leaves 'data' untouched.
data_clean = data.copy()

In [106]:
#Convert 'InvoiceDate' from text (e.g. '12/1/2010 8:26') to a datetime column,
#using an explicit month/day/year hour:minute format, then re-check the dtypes
data_clean['InvoiceDate'] = pd.to_datetime(
    data_clean['InvoiceDate'],
    format='%m/%d/%Y %H:%M',
)
data_clean.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

## Write to a CSV File to Access the Data Outside of This File

In [109]:
#Save the cleaned dataframe to a new csv file
#index=False leaves out the dataframe's row index, which would otherwise be
#written as an extra, unneeded column
data_clean.to_csv(path_or_buf='ecommerce_data_clean.csv', index=False)