# Orders

## Import

In [1]:
from pathlib import Path

import pandas as pd

# Raw-content GitHub URL of the orders CSV
orders_url = (
    "https://raw.githubusercontent.com/MerleSt/Eniac/main/Data-Eniac/orders.csv"
)

# `orders_df` holds the data exactly as downloaded; `orders` is an independent
# (deep) copy of it that the rest of the notebook works on.
orders_df = pd.read_csv(orders_url)
orders = orders_df.copy(deep=True)

## Drop Duplicates

In [2]:
# Count fully duplicated rows, then remove them (keeping each first occurrence),
# so this step really drops duplicates if the source CSV ever contains any.
# The count is the cell's displayed result.
n_duplicates = orders.duplicated().sum()
orders = orders.drop_duplicates()
n_duplicates

0

## Missing Values

In [3]:
# Number of missing values in each column
orders.isnull().sum()

order_id        0
created_date    0
total_paid      5
state           0
dtype: int64

In [4]:
# Derive both the count and the percentage from the data, so the message
# cannot go stale if the number of missing values changes.
n_missing = orders.total_paid.isna().sum()
pct_missing = ((n_missing / orders.shape[0]) * 100).round(5)
print(f"{n_missing} missing values represents {pct_missing}% of the rows in our DataFrame")

5 missing values represents 0.0022% of the rows in our DataFrame


In [5]:
# Share of rows with (True) and without (False) a missing total_paid
orders['total_paid'].isna().value_counts(normalize=True)

False    0.999978
True     0.000022
Name: total_paid, dtype: float64

In [6]:
# Keep only rows that have a total_paid value. The explicit .copy() makes the
# result an independent DataFrame, so the column assignments in later cells do
# not operate on a slice of the previous frame (SettingWithCopyWarning).
orders = orders.loc[orders.total_paid.notna(), :].copy()

# Confirm that no missing values remain
orders.isna().sum()

order_id        0
created_date    0
total_paid      0
state           0
dtype: int64

## Datatypes & Format

In [7]:
# Inspect the current dtype of every column
orders.dtypes

order_id          int64
created_date     object
total_paid      float64
state            object
dtype: object

In [8]:
# Cast order_id to string; all other columns are left as they are
orders = orders.astype({'order_id': str})

In [9]:
# Parse created_date into pandas datetimes
created_date_parsed = pd.to_datetime(orders['created_date'])
orders['created_date'] = created_date_parsed

In [10]:
# Store state as a categorical column
orders = orders.astype({'state': 'category'})

In [11]:
# Display the column dtypes again to check the result of the conversions
orders.dtypes

order_id                object
created_date    datetime64[ns]
total_paid             float64
state                 category
dtype: object

In [12]:
# Orders whose total_paid is zero or negative
orders.loc[orders['total_paid'].le(0)]

Unnamed: 0,order_id,created_date,total_paid,state
150,296010,2017-01-09 23:47:00,0.0,Completed
264,299605,2017-01-01 10:33:46,0.0,Shopping Basket
308,299651,2017-01-01 12:23:58,0.0,Shopping Basket
377,299731,2017-01-01 14:52:18,0.0,Shopping Basket
380,299734,2017-01-01 14:55:20,0.0,Shopping Basket
...,...,...,...,...
226835,527328,2018-03-14 13:14:12,0.0,Shopping Basket
226851,527344,2018-03-14 13:25:21,0.0,Place Order
226853,527346,2018-03-14 13:26:14,0.0,Place Order
226855,527348,2018-03-14 13:28:18,0.0,Place Order


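Note that some orders have a `total_paid` of zero. We keep them because this is plausible: such orders may never have been paid for or delivered (many of them are still in the `Shopping Basket` or `Place Order` state).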

## Export

In [13]:
# Write the cleaned orders as CSV and Parquet into one shared output folder.
# NOTE(review): this is an absolute path on the author's machine — adjust it
# when running the notebook elsewhere.
clean_dir = Path('/Users/merlesteffen/Documents/GitHub/Eniac/Data-Eniac/Data_Cleaned')

# Create the folder if it is missing; to_csv raises OSError when asked to write
# into a directory that does not exist.
clean_dir.mkdir(parents=True, exist_ok=True)

orders.to_csv(clean_dir / 'orders_clean.csv', index=False)
orders.to_parquet(clean_dir / 'orders_clean.parquet', index=False)