## 1. Library import

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

## 2. Read Dataset

In [15]:
deptinfo = pd.read_csv("deptinfo.csv", header = None, names = ['Dept', 'DeptDESC', 'unknown'] )
skuinfo = pd.read_csv("skuinfo.csv", header = None, dtype = 'str', 
                     names = ['SKU', 'DEPT', 'CLASSID', 'UPC', 'STYLE',
                             'COLOR', 'SIZE', 'PACKSIZE', 'VENDOR', 'BRAND', 
                             'Unkown1', 'Unknown2', 'Unknown3'])
strinfo = pd.read_csv("strinfo.csv", header = None, names = ['City', 'State', 'Zip', 'unkown'])
skstinfo = pd.read_csv("skstinfo.csv", header = None, names = ['SKU', "Store", "Cost", "Retail", "unknown"])

## we will be deal with this after loading it into postgre
# trnsact =  pd.read_csv("trnsact.csv", header = None, names = [""])

## 3. Data Clean and (very) Basic EDA

### 3.a. Data Clean for Department File

In [16]:
## take a brief look at the department info
deptinfo.head()

Unnamed: 0,Dept,DeptDESC,unknown
0,800,CLINIQUE,0
1,801,LESLIE,0
2,1100,GARY F,0
3,1107,JACQUES,0
4,1202,CABERN,0


In [17]:
## Investigate unknown column 
np.unique(deptinfo.unknown)

array([0, 1])

#### Given the fact that the column unknow only has 0 and 1, and it does not appear in our schema, we decided to drop it.

In [18]:
deptinfo = deptinfo.drop("unknown", axis =1)

#### Check is there any duplicate? looks like a no.

In [21]:
deptinfo[deptinfo.duplicated()] 

Unnamed: 0,Dept,DeptDESC


#### Check is there any null values? NO.

In [22]:
deptinfo.isnull().sum()

Dept        0
DeptDESC    0
dtype: int64

In [23]:
len(deptinfo)

60

### 3.b. Data Clean for SKU INFO

In [24]:
skuinfo.head()

Unnamed: 0,SKU,DEPT,CLASSID,UPC,STYLE,COLOR,SIZE,PACKSIZE,VENDOR,BRAND,Unkown1,Unknown2,Unknown3
0,3,6505,113,400000003000,00 F55KT2,WHISPERWHITE,P8EA,1,5119207,TURNBURY,0,,
1,4,8101,002,400000004000,22 615CZ4,SPEARMI,S,1,3311144,C A SPOR,0,,
2,5,7307,003,400000005000,7LBS 245-01,34 SILVER,KING,1,5510554,BEAU IDE,0,,
3,8,3404,00B,400000008000,622 F05H84,MORNING MI,2T,1,2912827,HARTSTRI,0,,
4,15,2301,004,400000015000,126 MDU461,255CAMEL,12,1,23272,JONES/LA,0,,


In [47]:
## looks at what are the values in unknown1
np.unique(skuinfo.Unkown1)

array([' ', '  ', ' I ', ' IN ', ' INC ', ' L ', ' LL ', '0', '0060904',
       '0514761', '1', '1216222', 'ALL ACCE ', 'ARRO ', 'CALVIN K ',
       'CAROLEE  ', 'CHANEL I ', 'LLC ', 'RE ', 'RUGGED D ', 'SARA LEE ',
       'WEE ONES '], dtype=object)

It looks like some Brands name tho, we decided to keep it in case it has some useful info.

In [50]:
np.unique(skuinfo[skuinfo.Unknown2.notna()]["Unknown2"])

array(['0', 'BROWN SH ', 'NINA FOO ', 'THE TREN '], dtype=object)

it looks like has some infomation, but only 3 unique values, let's see what are the percentage of na in this row later to see if we want to drop it or keep it.

In [51]:
np.unique(skuinfo[skuinfo.Unknown3.notna()]["Unknown3"])

array(['0'], dtype=object)

this tells us it has 0 information, we should drop it.

In [53]:
## drop unknown 3
skuinfo = skuinfo.drop("Unknown3", axis = 1)

#### Null Value Sumary for each column (in percentage)

In [58]:
((skuinfo.isna().mean() * 100).round(2)).apply(lambda x: f'{x:.2f}%')

SKU          0.00%
DEPT         0.00%
CLASSID      0.00%
UPC          0.00%
STYLE        0.00%
COLOR        0.00%
SIZE         0.00%
PACKSIZE     0.00%
VENDOR       0.00%
BRAND        0.00%
Unkown1      0.00%
Unknown2    99.48%
dtype: object

In [61]:
## Accordding to the graph, 99% of the unknown2 is None Value, so we decided to drop it
## drop unknown 2
skuinfo = skuinfo.drop("Unknown2", axis = 1)

### 3.c. Data Clean for STORE INFO

In [66]:
np.unique(strinfo.unkown)

array([0, 1])

In [67]:
## same logic above, drop the unknown for strinfo
strinfo = strinfo.drop("unkown", axis = 1)

In [68]:
strinfo.head()

Unnamed: 0,City,State,Zip
2,ST. PETERSBURG,FL,33710
3,ST. LOUIS,MO,63126
4,LITTLE ROCK,AR,72201
7,FORT WORTH,TX,76137
9,TEMPE,AZ,85281


#### There is no Duplicates

In [40]:
strinfo.isnull().sum()

City      0
State     0
Zip       0
unkown    0
dtype: int64

In [69]:
len(strinfo)

453

### 3.d. Data Clean for STORE INFO

In [71]:
np.unique(skstinfo.unknown)

array([0, 1])

In [75]:
## drop it for no info
skstinfo = skstinfo.drop("unknown", axis =1)

In [76]:
skstinfo

Unnamed: 0,SKU,Store,Cost,Retail
0,3,102,123.36,440.00
1,3,103,123.36,440.00
2,3,104,123.36,440.00
3,3,202,123.36,440.00
4,3,203,123.36,440.00
...,...,...,...,...
39230141,9999997,2007,15.00,19.50
39230142,9999997,2707,15.00,9.75
39230143,9999997,3307,15.00,19.50
39230144,9999997,7507,15.00,19.50


#### It looks like there is no NA values

In [77]:
skstinfo.isnull().sum()

SKU       0
Store     0
Cost      0
Retail    0
dtype: int64

#### Some SKU Retail Price is lower than the COST!!

In [92]:
higher_cost = skstinfo[skstinfo.Retail < skstinfo.Cost]
higher_cost_sku_percent = f'{(len(set(higher_cost.SKU)) / len(set(skstinfo.SKU)))* 100:.2f}%'

In [93]:
print(f"There are aboout {higher_cost_sku_percent} SKU whose Cost is higher than Retail Price")

There are aboout 49.46% SKU whose Cost is higher than Retail Price
