In [1]:
##pip install featuretools
##自動化特徵工程
##參考https://www.itread01.com/content/1544494326.html

In [2]:
import featuretools as ft
import numpy as np
import pandas as pd
train=pd.read_csv("BigMart_train.csv")
test = pd.read_csv("BigMart_test.csv")

In [3]:
#先把目標Item_Outlet_Sales儲存到sales變數，把test_Item_Identifier和test_Outlet_Identifier儲存到id變數
test_Item_Identifier = test['Item_Identifier'] 
test_Outlet_Identifier = test['Outlet_Identifier'] 
sales = train['Item_Outlet_Sales'] 
train.drop(['Item_Outlet_Sales'], axis=1, inplace=True)

In [4]:
#然後，組合訓練集和測試集，這樣省去兩次執行相同步驟的麻煩
combi = train.append(test, ignore_index=True)

In [5]:
combi.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1


In [6]:
#檢查遺漏值
combi.isnull().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [7]:
#變數Item_Weight和Outlet_size中有相當多缺失值，用fillna方法快速處理
combi['Item_Weight'].fillna(combi['Item_Weight'].mean(), 
                            inplace = True) 
combi['Outlet_Size'].fillna("missing", inplace = True)

In [8]:
combi['Item_Fat_Content'].value_counts()

Low Fat    8485
Regular    4824
LF          522
reg         195
low fat     178
Name: Item_Fat_Content, dtype: int64

In [9]:
#Item_Fat_Content似乎只包含兩個類別，即“低脂”和“常規”，未涉及到“冗餘”類別，所以我們把它轉換成二進位制變數。
fat_content_dict = {'Low Fat':0, 'Regular':1, 'LF':0, 'reg':1, 
                    'low fat':0} 
combi['Item_Fat_Content'] = combi['Item_Fat_Content'].replace(   
                            fat_content_dict, regex=True)

In [10]:
#對於資料集，必須具有唯一識別符號特徵，但是我們的資料集目前還沒有。
#因此，我們要為這個組合資料集建立唯一ID。
#你可能會注意到，資料集中有兩個ID，一個用於item，另一個用於outlet。
#因此，對這兩者簡單相加會得到一個唯一ID。

In [11]:
#由於不再需要，我刪除了特徵Item_Identifier。但是，我保留了特徵Outlet_Identifier，因為我稍後還要用到它。
combi['id'] = combi['Item_Identifier'] + combi['Outlet_Identifier'] 
combi.drop(['Item_Identifier'], axis=1, inplace=True)
combi.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,id
0,9.3,0,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,FDA15OUT049
1,5.92,1,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,DRC01OUT018
2,17.5,0,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,FDN15OUT049
3,19.2,1,0.0,Fruits and Vegetables,182.095,OUT010,1998,missing,Tier 3,Grocery Store,FDX07OUT010
4,8.93,0,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,NCD19OUT013


In [12]:
#建立一個實體集EntitySet。實體集是一種包含多個數據幀及其之間關係的結構。那麼，我們建立一個EntitySet並新增資料幀組合。
# creating and entity set 'es' 
es = ft.EntitySet(id = 'sales') 
# adding a dataframe 
es.entity_from_dataframe(entity_id = 'bigmart', 
                         dataframe = combi, 
                         index = 'id')

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 11]
  Relationships:
    No relationships

In [13]:
#我們資料中包含兩個級別的資訊，即 item級別和 outlet級別。
#Featuretools能把一個數據集拆分成多個表格。
#我們根據outlet ID Outlet_Identifier從BigMart表中建立一個新表“outlet”。

In [14]:
es.normalize_entity(base_entity_id='bigmart', 
                    new_entity_id='outlet', 
                    index = 'Outlet_Identifier', 
                    additional_variables =   
                    ['Outlet_Establishment_Year', 'Outlet_Size',  
                     'Outlet_Location_Type', 'Outlet_Type'])

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier

In [15]:
#如上所示，它包含兩個實體，為bigmart和outlet。
#這兩個表之間也形成了一種關係，用Outlet_Identifier連線。這種關係將在生成新特徵中發揮關鍵作用。

In [16]:
#現在我們要使用DFS來自動建立新特徵。上面提到，DFS使用特徵基元和實體集中給出的多個表來建立特徵。
feature_matrix, feature_names = ft.dfs(entityset=es, 
                                       target_entity = 'bigmart', 
                                       max_depth = 2, 
                                       verbose = 1, 
                                       n_jobs = 3)

Built 37 features
EntitySet scattered to workers in 2.541 seconds
Elapsed: 00:01 | Remaining: 00:00 | Progress: 100%|██████████████████████████████████████████| Calculated: 11/11 chunks


In [17]:
#target_entity只是建立新特徵的實體ID，這種情況下為實體“bigmart”。
#引數max_depth控制著通過堆疊基元生成的要素複雜性。
#引數n_jobs通過使用多個核心來輔助並行特徵計算。
#這就是使用Featuretools的過程，它已經產生了許多新特徵。
feature_matrix.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'outlet.Outlet_Establishment_Year',
       'outlet.Outlet_Size', 'outlet.Outlet_Location_Type',
       'outlet.Outlet_Type', 'outlet.SUM(bigmart.Item_Weight)',
       'outlet.SUM(bigmart.Item_Fat_Content)',
       'outlet.SUM(bigmart.Item_Visibility)', 'outlet.SUM(bigmart.Item_MRP)',
       'outlet.STD(bigmart.Item_Weight)',
       'outlet.STD(bigmart.Item_Fat_Content)',
       'outlet.STD(bigmart.Item_Visibility)', 'outlet.STD(bigmart.Item_MRP)',
       'outlet.MAX(bigmart.Item_Weight)',
       'outlet.MAX(bigmart.Item_Fat_Content)',
       'outlet.MAX(bigmart.Item_Visibility)', 'outlet.MAX(bigmart.Item_MRP)',
       'outlet.SKEW(bigmart.Item_Weight)',
       'outlet.SKEW(bigmart.Item_Fat_Content)',
       'outlet.SKEW(bigmart.Item_Visibility)', 'outlet.SKEW(bigmart.Item_MRP)',
       'outlet.MIN(bigmart.Item_Weight)',
       'outlet.MIN(bigmart.Item_Fat_Content)',
       

In [18]:
#這個資料幀存在一個問題，即未正確排序。我們必須根據combi資料幀中的id變數對其進行排序
feature_matrix = feature_matrix.reindex(index=combi['id']) 
feature_matrix = feature_matrix.reset_index()
feature_matrix.head()

Unnamed: 0,id,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,...,outlet.MIN(bigmart.Item_Fat_Content),outlet.MIN(bigmart.Item_Visibility),outlet.MIN(bigmart.Item_MRP),outlet.MEAN(bigmart.Item_Weight),outlet.MEAN(bigmart.Item_Fat_Content),outlet.MEAN(bigmart.Item_Visibility),outlet.MEAN(bigmart.Item_MRP),outlet.COUNT(bigmart),outlet.NUM_UNIQUE(bigmart.Item_Type),outlet.MODE(bigmart.Item_Type)
0,FDA15OUT049,9.3,0,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,...,0,0.0,32.4558,12.803003,0.352903,0.059,141.163199,1550,16,Fruits and Vegetables
1,DRC01OUT018,5.92,1,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,...,0,0.0,31.89,12.803638,0.353816,0.059976,141.000899,1546,16,Fruits and Vegetables
2,FDN15OUT049,17.5,0,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,...,0,0.0,32.4558,12.803003,0.352903,0.059,141.163199,1550,16,Fruits and Vegetables
3,FDX07OUT010,19.2,1,0.0,Fruits and Vegetables,182.095,OUT010,1998,missing,Tier 3,...,0,0.0,32.6558,12.72287,0.356757,0.101939,141.159742,925,16,Fruits and Vegetables
4,NCD19OUT013,8.93,0,0.0,Household,53.8614,OUT013,1987,High,Tier 3,...,0,0.0,31.49,12.788139,0.353509,0.060242,141.128428,1553,16,Fruits and Vegetables


In [19]:
#構建模型
#將使用它們來構建模型並預測Item_Outlet_Sales值。
#由於最終資料feature_matrix具有多個分類特徵，因此我決定使用CatBoost演算法。
#它可以直接使用分類特徵，並且本質上是可擴充套件的。

In [20]:
from catboost import CatBoostRegressor

In [21]:
#CatBoost要求所有分類變數都使用字串格式。因此，我們首先將資料中的分類變數轉換為字串
categorical_features = np.where(feature_matrix.dtypes =='object')[0] 
for i in categorical_features: 
    feature_matrix.iloc[:,i]=feature_matrix.iloc[:,i].astype('str')

In [22]:
#把feature_matrix分解為訓練集和測試集。
feature_matrix.drop(['id'], axis=1, inplace=True) 
train = feature_matrix[:8523] 
test = feature_matrix[8523:]
# removing uneccesary variables 
train.drop(['Outlet_Identifier'], axis=1, inplace=True) 
test.drop(['Outlet_Identifier'], axis=1, inplace=True)
# identifying categorical features categorical_features = np.where(train.dtypes == 'object')[0]

In [23]:
#然後把訓練資料拆分為訓練和驗證集，並本地驗證模型效能。
from sklearn.model_selection import train_test_split 
# splitting train data into training and validation set 
xtrain, xvalid, ytrain, yvalid = train_test_split(train, sales, 
                                                  test_size=0.25, 
                                                  random_state=11)

In [26]:
#最後，訓練模型時，我們使用的評估指標是RMSE（均方根誤差）。
model_cat = CatBoostRegressor(iterations=100, learning_rate=0.3, 
                              depth=6, eval_metric='RMSE',  
                              random_seed=7) 
# training model 
model_cat.fit(xtrain, ytrain, cat_features=categorical_features, 
              use_best_model=True)
# validation score 
model_cat.score(xvalid, yvalid)

CatBoostError: Invalid cat_features[6] = 37 value: must be < 36.

In [30]:
categorical_features

array([ 0,  4,  6,  8,  9, 10, 37], dtype=int64)