# Data Cleaning and Feature Engineering 
## Developed on 10,000 entries
## Create Percentage of Purchase vs Percentage of Return by SKU
## Create Average Percent Discount by SKU

In [1]:
import pandas as pd
import numpy as np
import statistics as stat

# Load the transaction sample this notebook is developed on. The path is
# relative, so the CSV must sit in the notebook's working directory.
dat = pd.read_csv('trnsact_10k.csv')
# Print (rows, columns) as a quick sanity check, then preview the first ten rows.
print(dat.shape)
dat.head(10)

(10000, 14)


Unnamed: 0,sku,store,register,trannum,seq,saledate,stype,quantity,orgprice,amt,amt2,interid,mic,zero
0,6767033,3909,140,5800,0,2004-12-17,P,1,34.0,34.0,34.0,974000139,777,0
1,6767033,4003,620,9400,0,2005-02-26,P,1,34.0,4.25,4.25,508800293,777,0
2,6767033,4004,410,2100,0,2005-02-06,P,1,34.0,17.0,17.0,780200057,777,0
3,6767033,4007,240,18700,440008916,2005-01-29,P,1,34.0,11.9,11.9,853500167,777,0
4,6767033,4102,320,7100,933004517,2004-11-20,P,1,34.0,34.0,34.0,69900097,777,0
5,6767033,4103,930,4500,321100478,2005-02-24,P,1,34.0,4.25,4.25,136400236,777,0
6,6767033,4104,330,500,0,2004-11-03,P,1,34.0,34.0,34.0,769200040,777,0
7,6767033,4107,20,1800,643508749,2005-01-05,P,1,34.0,24.99,24.99,828500068,777,0
8,6767033,4109,400,4800,0,2005-01-28,P,1,34.0,11.9,11.9,764600122,777,0
9,6767033,4202,21,400,0,2005-01-18,P,1,34.0,17.0,17.0,921200048,777,0


In [100]:
# A price of 0 in orgprice or amt is treated as missing: it is blanked out to
# NaN and then filled with the mean of the remaining (non-zero) values recorded
# for the same sku. Both price columns get exactly the same treatment.
for price_col in ('orgprice', 'amt'):
    # Blank the zeros first so they do not drag the per-sku mean down.
    dat[price_col] = dat[price_col].replace(0, np.nan)
    # transform('mean') returns one value per row, aligned with dat's index.
    sku_mean = dat.groupby(['sku'])[price_col].transform('mean')
    dat[price_col] = dat[price_col].fillna(sku_mean)
dat.head(10)

Unnamed: 0,sku,store,register,trannum,seq,saledate,stype,quantity,orgprice,amt,amt2,interid,mic,zero
0,6767033,3909,140,5800,0,2004-12-17,P,1,34.0,34.0,34.0,974000139,777,0
1,6767033,4003,620,9400,0,2005-02-26,P,1,34.0,4.25,4.25,508800293,777,0
2,6767033,4004,410,2100,0,2005-02-06,P,1,34.0,17.0,17.0,780200057,777,0
3,6767033,4007,240,18700,440008916,2005-01-29,P,1,34.0,11.9,11.9,853500167,777,0
4,6767033,4102,320,7100,933004517,2004-11-20,P,1,34.0,34.0,34.0,69900097,777,0
5,6767033,4103,930,4500,321100478,2005-02-24,P,1,34.0,4.25,4.25,136400236,777,0
6,6767033,4104,330,500,0,2004-11-03,P,1,34.0,34.0,34.0,769200040,777,0
7,6767033,4107,20,1800,643508749,2005-01-05,P,1,34.0,24.99,24.99,828500068,777,0
8,6767033,4109,400,4800,0,2005-01-28,P,1,34.0,11.9,11.9,764600122,777,0
9,6767033,4202,21,400,0,2005-01-18,P,1,34.0,17.0,17.0,921200048,777,0


In [96]:
# Share of purchase ('P') vs. return ('R') transactions for every sku.
# Counting rows stands in for counting units because quantity is taken to be 1
# on every transaction (original author's note; not re-checked here).
percent_purchase = dat.groupby(['sku', 'stype'], group_keys=False)["stype"].aggregate(['count']).reset_index()

# One row per sku, one column per stype; a sku with no rows of a given stype gets 0.
purchase_return = percent_purchase.pivot(index = 'sku', columns = 'stype', values = 'count').fillna(0)
# Make sure both columns exist even when the data holds no purchases or no
# returns at all, so the percentage arithmetic below never hits a missing column.
for stype_code in ('P', 'R'):
    if stype_code not in purchase_return.columns:
        purchase_return[stype_code] = 0.0
purchase_return = purchase_return.reset_index()

# Column-wise arithmetic: same numbers as a row-by-row apply, without a
# Python-level call per row.
transaction_total = purchase_return['P'] + purchase_return['R']
purchase_return["Percentage Purchase"] = 100 * purchase_return['P'] / transaction_total
purchase_return["Percentage Return"] = 100 * purchase_return['R'] / transaction_total
purchase_return.head(10)

stype,sku,P,R,Percentage Purchase,Percentage Return
0,6767033,176.0,12.0,93.617021,6.382979
1,6767036,102.0,15.0,87.179487,12.820513
2,6767073,35.0,5.0,87.5,12.5
3,6767075,98.0,4.0,96.078431,3.921569
4,6767086,10.0,1.0,90.909091,9.090909


In [101]:
# Average percent discount per sku. Each transaction's discount is
# 100 * (orgprice - amt) / orgprice; those per-row values are then averaged
# within each sku. The arithmetic is done column-wise before grouping, rather
# than through groupby().apply() with a Python lambda per group.
row_discount = 100 * (dat['orgprice'] - dat['amt']) / dat['orgprice']
# The per-row series carries no name, so reset_index() labels the mean column 0.
percent_discount = row_discount.groupby(dat['sku']).mean().reset_index()
discount = percent_discount.rename(columns={0: "Percent Discount"})
discount.head(10)

Unnamed: 0,sku,Percent Discount
0,6767033,50.89831
1,6767036,61.087349
2,6767073,58.326449
3,6767075,57.43076
4,6767086,25.0
5,6767097,27.5
6,6767113,62.272727
7,6767114,24.765625
8,6767120,42.764414
9,6767139,49.361883
