In [1]:
# import necessary packages
import pandas as pd

In [2]:
# Read the sales data from the CSV file.
all_data = pd.read_csv('data/all_data.csv')
# Preview 10 randomly chosen rows.
all_data.sample(10)

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
117789,173967,AAA Batteries (4-pack),1,2.99,03/09/19 09:29,"488 Maple St, Atlanta, GA 30301"
63147,157777,34in Ultrawide Monitor,1,379.99,02/17/19 16:15,"704 Hickory St, San Francisco, CA 94016"
93883,212048,Vareebadd Phone,1,400.0,06/12/19 11:28,"37 Cedar St, San Francisco, CA 94016"
70574,144105,AA Batteries (4-pack),1,3.84,01/13/19 13:00,"845 14th St, Seattle, WA 98101"
23186,241280,Wired Headphones,1,11.99,08/22/19 16:07,"599 Park St, Atlanta, GA 30301"
170894,274723,AA Batteries (4-pack),1,3.84,10/19/19 16:19,"421 11th St, Boston, MA 02215"
178120,250983,Lightning Charging Cable,1,14.95,09/02/19 20:04,"71 Hickory St, San Francisco, CA 94016"
32267,297442,AA Batteries (4-pack),1,3.84,12/28/19 12:56,"148 Lake St, Portland, OR 97035"
147818,289015,USB-C Charging Cable,1,11.95,11/29/19 09:10,"929 7th St, New York City, NY 10001"
44140,308782,AA Batteries (4-pack),2,3.84,12/08/19 11:26,"234 Maple St, San Francisco, CA 94016"


In [3]:
# Get familiar with the features: summary statistics per column.
# The output reports count/unique/top/freq for every column, i.e. the columns
# are being summarised as text rather than as numbers.
all_data.describe()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
count,186305,186305,186305,186305.0,186305,186305
unique,178438,20,10,24.0,142396,140788
top,Order ID,USB-C Charging Cable,1,11.95,Order Date,Purchase Address
freq,355,21903,168552,21903.0,355,355


In [4]:
# Column dtypes and non-null counts. The output shows every column stored as
# object and fewer non-null values than rows, so cleaning is needed.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186850 entries, 0 to 186849
Data columns (total 6 columns):
Order ID            186305 non-null object
Product             186305 non-null object
Quantity Ordered    186305 non-null object
Price Each          186305 non-null object
Order Date          186305 non-null object
Purchase Address    186305 non-null object
dtypes: object(6)
memory usage: 8.6+ MB


### Clean up the data!

#### 1. Check for NaNs

In [5]:
# Count the missing values in each column.
all_data.isna().sum()

Order ID            545
Product             545
Quantity Ordered    545
Price Each          545
Order Date          545
Purchase Address    545
dtype: int64

In [6]:
# Drop rows containing NaNs; their count is small relative to the dataset size.
# Reassign instead of using inplace=True so the step is an explicit
# transformation and no other reference to the frame is mutated behind the
# reader's back.
all_data = all_data.dropna()
# Confirm that no missing values remain.
all_data.isna().sum()

Order ID            0
Product             0
Quantity Ordered    0
Price Each          0
Order Date          0
Purchase Address    0
dtype: int64

In [7]:
# Data issue: the CSV header line is repeated inside the data and was read in
# as ordinary rows. List those rows; they need to be removed.
all_data.loc[all_data['Order Date'].eq('Order Date')]

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
519,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
1149,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
1155,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
2878,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
2893,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
...,...,...,...,...,...,...
185164,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
185551,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
186563,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
186632,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address


In [8]:
# Remove the repeated header rows. Compare against the exact header text
# rather than only the first character, so a legitimate value that merely
# starts with "O" can never be discarded by accident.
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
all_data = all_data[all_data['Order Date'] != 'Order Date'].copy()
all_data

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,176558,USB-C Charging Cable,2,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
2,176559,Bose SoundSport Headphones,1,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
3,176560,Google Phone,1,600,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
4,176560,Wired Headphones,1,11.99,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
5,176561,Wired Headphones,1,11.99,04/30/19 09:27,"333 8th St, Los Angeles, CA 90001"
...,...,...,...,...,...,...
186845,259353,AAA Batteries (4-pack),3,2.99,09/17/19 20:56,"840 Highland St, Los Angeles, CA 90001"
186846,259354,iPhone,1,700,09/01/19 16:00,"216 Dogwood St, San Francisco, CA 94016"
186847,259355,iPhone,1,700,09/23/19 07:39,"220 12th St, San Francisco, CA 94016"
186848,259356,34in Ultrawide Monitor,1,379.99,09/19/19 17:30,"511 Forest St, San Francisco, CA 94016"


#### 2. Rearrange the Order Date feature

In [9]:
# Parse the "MM/DD/YY HH:MM" strings once, with an explicit format, instead of
# splitting every value in a Python lambda and letting pandas infer the layout
# in two separate passes. The explicit format removes day/month ambiguity and
# raises on malformed values rather than guessing.
order_datetime = pd.to_datetime(all_data['Order Date'], format='%m/%d/%y %H:%M')
# Calendar date only (time set to midnight), kept as datetime64.
all_data['Order Date dmy'] = order_datetime.dt.normalize()
# Time of day as datetime.time objects (object dtype).
all_data['Order Date time'] = order_datetime.dt.time

In [10]:
# Check that the rearrangement is done: inspect the extracted date column.
all_data['Order Date dmy']

0        2019-04-19
2        2019-04-07
3        2019-04-12
4        2019-04-12
5        2019-04-30
            ...    
186845   2019-09-17
186846   2019-09-01
186847   2019-09-23
186848   2019-09-19
186849   2019-09-30
Name: Order Date dmy, Length: 185950, dtype: datetime64[ns]

In [11]:
# Inspect the extracted time-of-day column.
all_data['Order Date time']

0         08:46:00
2         22:30:00
3         14:38:00
4         14:38:00
5         09:27:00
            ...   
186845    20:56:00
186846    16:00:00
186847    07:39:00
186848    17:30:00
186849    00:18:00
Name: Order Date time, Length: 185950, dtype: object

In [12]:
# For analysis purposes, also add a month column (1-12).
# The explicit format matches the "MM/DD/YY HH:MM" layout of the raw strings,
# which avoids format inference and any day/month ambiguity.
all_data['Order Date month'] = pd.to_datetime(
    all_data['Order Date'], format='%m/%d/%y %H:%M'
).dt.month

In [13]:
# Spot-check 10 random values of the month column.
all_data['Order Date month'].sample(10)

134715     5
89349      7
130390     5
22888      8
99075      6
55996      2
171916    10
9423       4
15564      4
114211     3
Name: Order Date month, dtype: int64