## 1. Load data

In [98]:
# pip install chardet   # if it is not installed yet

# import chardet

# with open('./data.csv', 'rb') as f:
#     raw = f.read()               # read the whole file as bytes (for a very large file, the first few MB are enough)
#     result = chardet.detect(raw)

# print(result)

# Example of a possible output:
# {'encoding': 'GBK', 'confidence': 0.99, 'language': 'Chinese'}

In [99]:
import pandas as pd
# Show at most 45 rows when a DataFrame is displayed; longer frames are truncated.
pd.options.display.max_rows = 45

# latin1 maps every byte to a character, so decoding cannot raise.
# NOTE(review): this does not prove the file really is latin1 - confirm the encoding.
raw_df = pd.read_csv('./data.csv', encoding='latin1')

In [100]:
# Preview the first rows to check that the file parsed into the expected columns.
raw_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [101]:
# Column dtypes and non-null counts: used to spot columns that need a cast or contain missing values.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


#### TODO: data type cast
- InvoiceDate -> 日期类型
-  CustomerID -> str

## 2. Procedure
1. 结构性评估: 三原则
2. 内容性评估: 
   -  缺失数据 NaN
   -  重复数据 duplicated
   -  不一致数据 .value_counts()
   -  无效/错误数据

### 缺失数据

In [102]:
# Description check: pull out every row whose Description is NaN.
raw_df.loc[raw_df["Description"].isna()]
# Observation: many of these rows have a UnitPrice of 0.

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,12/1/2010 11:52,0.0,,United Kingdom
1970,536545,21134,,1,12/1/2010 14:32,0.0,,United Kingdom
1971,536546,22145,,1,12/1/2010 14:33,0.0,,United Kingdom
1972,536547,37509,,1,12/1/2010 14:33,0.0,,United Kingdom
1987,536549,85226A,,1,12/1/2010 14:34,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
535322,581199,84581,,-2,12/7/2011 18:26,0.0,,United Kingdom
535326,581203,23406,,15,12/7/2011 18:31,0.0,,United Kingdom
535332,581209,21620,,6,12/7/2011 18:35,0.0,,United Kingdom
536981,581234,72817,,27,12/8/2011 10:33,0.0,,United Kingdom


In [103]:
# Check whether every row with a NaN Description also has UnitPrice == 0:
# an empty result means there is no counter-example.
raw_df.loc[raw_df["Description"].isna() & raw_df["UnitPrice"].ne(0.0)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


#### TODO: NaN
- 删除缺少 Description 的行（已验证: 这些行的 UnitPrice 全部为 0; 反之不成立, 有 Description 的行也可能 UnitPrice 为 0, 因此并不等价于删除 UnitPrice = 0 的行）

In [104]:
# Rows whose Description is NaN (repeats the filter used at the start of this section).
raw_df[raw_df["Description"].isnull()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,12/1/2010 11:52,0.0,,United Kingdom
1970,536545,21134,,1,12/1/2010 14:32,0.0,,United Kingdom
1971,536546,22145,,1,12/1/2010 14:33,0.0,,United Kingdom
1972,536547,37509,,1,12/1/2010 14:33,0.0,,United Kingdom
1987,536549,85226A,,1,12/1/2010 14:34,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
535322,581199,84581,,-2,12/7/2011 18:26,0.0,,United Kingdom
535326,581203,23406,,15,12/7/2011 18:31,0.0,,United Kingdom
535332,581209,21620,,6,12/7/2011 18:35,0.0,,United Kingdom
536981,581234,72817,,27,12/8/2011 10:33,0.0,,United Kingdom


In [105]:
# Rows whose CustomerID is NaN.
raw_df[raw_df['CustomerID'].isnull()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,12/1/2010 11:52,0.00,,United Kingdom
1443,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,12/1/2010 14:32,2.51,,United Kingdom
1444,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,12/1/2010 14:32,2.51,,United Kingdom
1445,536544,21786,POLKADOT RAIN HAT,4,12/1/2010 14:32,0.85,,United Kingdom
1446,536544,21787,RAIN PONCHO RETROSPOT,2,12/1/2010 14:32,1.66,,United Kingdom
...,...,...,...,...,...,...,...,...
541536,581498,85099B,JUMBO BAG RED RETROSPOT,5,12/9/2011 10:26,4.13,,United Kingdom
541537,581498,85099C,JUMBO BAG BAROQUE BLACK WHITE,4,12/9/2011 10:26,4.13,,United Kingdom
541538,581498,85150,LADIES & GENTLEMEN METAL SIGN,1,12/9/2011 10:26,4.96,,United Kingdom
541539,581498,85174,S/4 CACTI CANDLES,1,12/9/2011 10:26,10.79,,United Kingdom


### 重复数据

In [106]:
# Count the rows that exactly repeat an earlier row in every column.
raw_df.duplicated().sum()

5268

### 不一致数据 与 无效/错误数据

In [107]:
# Inconsistent data: look for different spellings of the same country,
# e.g. "USA" vs "United States", "UK" vs "U.K.".
raw_df['Country'].value_counts()

United Kingdom          495478
Germany                   9495
France                    8557
EIRE                      8196
Spain                     2533
Netherlands               2371
Belgium                   2069
Switzerland               2002
Portugal                  1519
Australia                 1259
Norway                    1086
Italy                      803
Channel Islands            758
Finland                    695
Cyprus                     622
Sweden                     462
Unspecified                446
Austria                    401
Denmark                    389
Japan                      358
Poland                     341
Israel                     297
USA                        291
Hong Kong                  288
Singapore                  229
Iceland                    182
Canada                     151
Greece                     146
Malta                      127
United Arab Emirates        68
European Community          61
RSA                         58
Lebanon 

In [108]:
# Summary statistics of the numeric columns; note that the minimum of both
# Quantity and UnitPrice is negative.
raw_df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


In [109]:
# Given: an InvoiceNo that starts with 'C' marks a cancelled transaction.
# Check it: list the negative-quantity rows whose InvoiceNo does NOT start with 'C'.
raw_df.loc[raw_df['Quantity'].lt(0) & raw_df['InvoiceNo'].str[0].ne('C')]
# Observation: UnitPrice is 0 in these rows.

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
2406,536589,21777,,-10,12/1/2010 16:50,0.0,,United Kingdom
4347,536764,84952C,,-38,12/2/2010 14:42,0.0,,United Kingdom
7188,536996,22712,,-20,12/3/2010 15:30,0.0,,United Kingdom
7189,536997,22028,,-20,12/3/2010 15:30,0.0,,United Kingdom
7190,536998,85067,,-6,12/3/2010 15:30,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
535333,581210,23395,check,-26,12/7/2011 18:36,0.0,,United Kingdom
535335,581212,22578,lost,-1050,12/7/2011 18:38,0.0,,United Kingdom
535336,581213,22576,check,-30,12/7/2011 18:38,0.0,,United Kingdom
536908,581226,23090,missing,-338,12/8/2011 9:56,0.0,,United Kingdom


In [110]:
# Negative-quantity rows that are neither cancellations ('C' prefix) nor zero-priced.
raw_df.loc[
    raw_df['Quantity'].lt(0)
    & raw_df['InvoiceNo'].str[0].ne('C')
    & raw_df['UnitPrice'].ne(0)
]
# Empty result: whenever Quantity < 0, either UnitPrice is 0 or InvoiceNo starts with 'C'.

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


#### TODO: 删除raw_df['Quantity'] < 0

In [111]:
# UnitPrice check: rows with a negative price.
raw_df.loc[raw_df['UnitPrice'].lt(0)]  # these are "Adjust bad debt" entries

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
299983,A563186,B,Adjust bad debt,1,8/12/2011 14:51,-11062.06,,United Kingdom
299984,A563187,B,Adjust bad debt,1,8/12/2011 14:52,-11062.06,,United Kingdom


#### TODO: 删除raw_df['UnitPrice'] < 0

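### 综上所述, 需要做的为
- 类型转换: InvoiceDate -> 日期类型, CustomerID -> str
- 删除缺少 Description 的行
- 删除 Quantity < 0 的行
- 删除 UnitPrice < 0 的行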

## 数据处理

In [112]:
# Work on a copy so the cleaning steps below leave raw_df untouched.
clean_df = raw_df.copy()

In [113]:
# Cast InvoiceDate to datetime.
# The values look like "12/1/2010 8:26" (month/day/year hour:minute); an explicit
# format avoids day/month guessing and raises if a row does not match.
clean_df['InvoiceDate'] = pd.to_datetime(raw_df['InvoiceDate'], format='%m/%d/%Y %H:%M')

# Cast CustomerID to string.
# The column is float64 because it contains NaN, so a plain astype(str) yields
# "17850.0" for real IDs and the literal "nan" for missing ones; stripping the last
# two characters then turns every missing ID into "n". Going through the nullable
# Int64 dtype drops the ".0" and keeps missing IDs as <NA>.
# Both casts read from raw_df, so re-running this cell gives the same result.
clean_df['CustomerID'] = raw_df['CustomerID'].astype('Int64').astype('string')

In [114]:
# Drop the rows that have no Description; rows with a missing CustomerID are not dropped here.
# To verify: clean_df['Description'].isnull().sum() should now be 0.
clean_df = clean_df.dropna(subset=['Description'])

In [115]:
# Replace values: relabel "USA" as "United States".
# Assign the result back to the column: calling replace(..., inplace=True) on
# clean_df["Country"] acts on a column selection, which emits a chained-assignment
# warning in pandas 2.x and leaves clean_df unchanged under Copy-on-Write.
clean_df["Country"] = clean_df["Country"].replace({"USA": "United States"})


In [116]:
# Discard the rows with a negative Quantity (keep Quantity >= 0).
clean_df = clean_df.loc[clean_df['Quantity'].ge(0)]

In [117]:
# Discard the rows with a negative UnitPrice (keep UnitPrice >= 0).
clean_df = clean_df.loc[clean_df['UnitPrice'].ge(0)]

保存清理后的数据

In [118]:
# Write the cleaned frame to e_commerce_cleaned.csv.
clean_df.to_csv('e_commerce_cleaned.csv', index= False) # do not write the index column