In [1]:
import pandas as pd
# NOTE(review): `plt` is bound to the top-level `matplotlib` package here, not
# to `matplotlib.pyplot` (the usual target of this alias). Attributes such as
# `plt.style` resolve, but pyplot functions (e.g. `plt.figure`) would not --
# confirm which one the rest of the notebook expects.
import matplotlib as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 

In [2]:
# Render floats with a thousands separator and two decimal places
pd.set_option("display.float_format", "{:,.2f}".format)
# Apply the 'fivethirtyeight' style sheet to all plots. `plt` is the top-level
# matplotlib package in this notebook, so this resolves to matplotlib.style.use.
plt.style.use("fivethirtyeight")

In [3]:
# Load the four raw CSV exports from ./data; each `start_*` frame is the
# untouched starting point for that dataset.
start_orders = pd.read_csv("./data/orders.csv")
start_order_process = pd.read_csv("./data/order_process_data.csv")
start_intern_study = pd.read_csv("./data/interndata_study.csv")
start_campaign = pd.read_csv("./data/campaign_data.csv")

In [4]:
# Persist the four raw DataFrames with IPython's %store magic so they can be
# restored in another notebook/kernel via `%store -r <name>`.
%store start_orders
%store start_order_process
%store start_intern_study
%store start_campaign

Stored 'start_orders' (DataFrame)
Stored 'start_order_process' (DataFrame)
Stored 'start_intern_study' (DataFrame)
Stored 'start_campaign' (DataFrame)


### Overview of each dataset

In [5]:
# Reading start_orders

#start_orders.columns
#start_orders.shape
#start_orders.duplicated().value_counts()
#start_orders.info()
#start_orders['Sub-Category'].unique()

In [6]:
# Inspect the order-process data (uncomment a line below to run that check)

#start_order_process.columns
#start_order_process.shape
#start_order_process.info()
#start_order_process['On Truck Scan Date'].unique()

# Tally the duplicate-row flags; left as the final expression so it is the
# cell's displayed output.
(
    start_order_process
    .duplicated()
    .value_counts()
)

False    5899
Name: count, dtype: int64

In [7]:
# Reading intern data study

#start_intern_study.columns
#start_intern_study.shape
#start_intern_study.duplicated().value_counts()
#start_intern_study.info()

In [8]:
# Reading campaign data

#start_campaign.columns
#start_campaign.shape
#start_campaign.duplicated().value_counts()
#start_campaign.info()

### Check head & tail of all DataFrames next to each other

In [9]:
# Preview ORDERS: first two rows, a divider line, then the last two rows
display(start_orders.head(n=2))
print("-----------------------------")
display(start_orders.tail(n=2))

Unnamed: 0,Index,Order ID,Order Date,Ship Mode,Customer ID,Customer Name,Origin Channel,Country/Region,City,State,Postal Code,Region,Category,Sub-Category,Product ID,Sales,Quantity,Discount,Profit
0,27,CA-2019-121755,16/1/2019,Second Class,EH-13945,Eric Hoffmann,Email,United States,Los Angeles,California,90049.0,West,Special Projects Muesil,Gluten Free,TEC-AC-10003027,90.57,3,0.0,11.77
1,45,CA-2019-118255,11/3/2019,First Class,ON-18715,Odella Nelson,Sales,United States,Eagan,Minnesota,55122.0,Central,Special Projects Muesil,Gluten Free,TEC-AC-10000171,45.98,2,0.0,19.77


-----------------------------


Unnamed: 0,Index,Order ID,Order Date,Ship Mode,Customer ID,Customer Name,Origin Channel,Country/Region,City,State,Postal Code,Region,Category,Sub-Category,Product ID,Sales,Quantity,Discount,Profit
9992,9920,CA-2019-149272,15/3/2019,Standard Class,MY-18295,Muhammed Yedwab,Sales,United States,Bryan,Texas,77803.0,Central,Toasted Muesli,With Nuts,FUR-CH-10000863,528.43,5,0.3,-143.43
9993,9948,CA-2020-121559,1/6/2020,Second Class,HW-14935,Helen Wasserman,Sales,United States,Indianapolis,Indiana,46203.0,Central,Toasted Muesli,With Nuts,FUR-CH-10003746,1925.88,6,0.0,539.25


In [10]:
# Preview ORDER PROCESS: first two rows, a divider line, then the last two rows
display(start_order_process.head(n=2))
print("-----------------------------")
display(start_order_process.tail(n=2))

Unnamed: 0,Row ID,Order ID,Order Date,On Truck Scan Date,Ship Mode
0,3074,CA-2019-125206,3/1/2019,07/01/2019,Express
1,4919,CA-2019-160304,2/1/2019,09/01/2019,Standard Processing


-----------------------------


Unnamed: 0,Row ID,Order ID,Order Date,On Truck Scan Date,Ship Mode
5897,1298,CA-2020-115427,30/12/2020,06/01/2021,Standard Processing
5898,5092,CA-2020-156720,30/12/2020,06/01/2021,Standard Processing


In [11]:
# Preview INTERN STUDY: first two rows, a divider line, then the last two rows
display(start_intern_study.head(n=2))
print("-----------------------------")
display(start_intern_study.tail(n=2))

Unnamed: 0,Order ID,Ready to Ship Date,Pickup Date
0,CA-2019-116540,02/09/2019,03/09/2019
1,CA-2019-116540,02/09/2019,03/09/2019


-----------------------------


Unnamed: 0,Order ID,Ready to Ship Date,Pickup Date
288,CA-2020-119305,04/12/2020,07/12/2020
289,CA-2020-142090,07/12/2020,09/12/2020


In [12]:
# Preview CAMPAIGN: first two rows, a divider line, then the last two rows
display(start_campaign.head(n=2))
print("-----------------------------")
display(start_campaign.tail(n=2))

Unnamed: 0,Order ID,Arrival Scan Date,Customer Name
0,CA-2019-109666,03/05/2019,Kunst Miller
1,CA-2019-138933,03/05/2019,Jack Lebron


-----------------------------


Unnamed: 0,Order ID,Arrival Scan Date,Customer Name
331,US-2020-104451,15/05/2020,Michelle Moray
332,US-2020-139647,15/05/2020,Todd Sumrall


### Check Shape, Size & Type of data across each DataFrame

In [13]:
# ORDERS: (rows, columns), total number of cells, and per-column dtypes
display(
    start_orders.shape,
    start_orders.size,
    start_orders.dtypes,
)

(9994, 19)

189886

Index               int64
Order ID           object
Order Date         object
Ship Mode          object
Customer ID        object
Customer Name      object
Origin Channel     object
Country/Region     object
City               object
State              object
Postal Code       float64
Region             object
Category           object
Sub-Category       object
Product ID         object
Sales             float64
Quantity            int64
Discount          float64
Profit            float64
dtype: object

In [14]:
## check ORDER PROCESS
display(
    start_order_process.shape,   # (rows, columns)
    start_order_process.size,    # total number of cells
    start_order_process.dtypes,  # per-column dtypes
)

(5899, 5)

29495

Row ID                 int64
Order ID              object
Order Date            object
On Truck Scan Date    object
Ship Mode             object
dtype: object

In [15]:
# INTERN STUDY: (rows, columns), total number of cells, and per-column dtypes
display(
    start_intern_study.shape,
    start_intern_study.size,
    start_intern_study.dtypes,
)

(290, 3)

870

Order ID              object
Ready to Ship Date    object
Pickup Date           object
dtype: object

In [16]:
# CAMPAIGN: (rows, columns), total number of cells, and per-column dtypes
display(
    start_campaign.shape,
    start_campaign.size,
    start_campaign.dtypes,
)

(333, 3)

999

Order ID             object
Arrival Scan Date    object
Customer Name        object
dtype: object

### Do we have any duplicated rows or NULL values in each DataFrame?

In [17]:
# ORDERS: tally of duplicate-row flags, then missing-value count per column
display(
    start_orders.duplicated().value_counts(),
    start_orders.isnull().sum(),
)

False    9994
Name: count, dtype: int64

Index              0
Order ID           0
Order Date         0
Ship Mode          0
Customer ID        0
Customer Name      0
Origin Channel     0
Country/Region     0
City               0
State              0
Postal Code       11
Region             0
Category           0
Sub-Category       0
Product ID         0
Sales              0
Quantity           0
Discount           0
Profit             0
dtype: int64

In [18]:
# ORDER PROCESS: tally of duplicate-row flags, then missing-value count per column
display(
    start_order_process.duplicated().value_counts(),
    start_order_process.isnull().sum(),
)

False    5899
Name: count, dtype: int64

Row ID                0
Order ID              0
Order Date            0
On Truck Scan Date    0
Ship Mode             0
dtype: int64

In [19]:
# INTERN STUDY: tally of duplicate-row flags, then missing-value count per column.
# The saved output of this cell reports 86 rows flagged as duplicates.
display(
    start_intern_study.duplicated().value_counts(),
    start_intern_study.isnull().sum(),
)

False    204
True      86
Name: count, dtype: int64

Order ID              0
Ready to Ship Date    0
Pickup Date           0
dtype: int64

In [20]:
# CAMPAIGN: tally of duplicate-row flags, then missing-value count per column
display(
    start_campaign.duplicated().value_counts(),
    start_campaign.isnull().sum(),
)

False    333
Name: count, dtype: int64

Order ID             0
Arrival Scan Date    0
Customer Name        0
dtype: int64