### xfeat GitHub https://github.com/pfnet-research/xfeat
### tutorial https://github.com/pfnet-research/xfeat/blob/master/examples/xfeat_tutorial_notebook.ipynb  

In [1]:
import pandas as pd
import numpy as np
import os

from IPython.display import display

import xfeat
from xfeat import SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, ArithmeticCombinations,  LambdaEncoder

import pickle

In [2]:
# Directory prefix (current working directory plus a trailing slash) and the
# base names of the two dataset splits handled by this notebook.
d_name = ["train", "test"]
path = f"{os.getcwd()}/"

In [3]:
# For every split, read "<path><split>.csv", pass the frame through
# xfeat.utils.compress_df, and write the result to "<path><split>.ftr".
for split in d_name:
    csv_file = path + split + ".csv"
    feather_file = path + split + ".ftr"
    compact_frame = xfeat.utils.compress_df(pd.read_csv(csv_file))
    compact_frame.to_feather(feather_file)

In [4]:
# Check the serialized data.
# The Feather files are read back from the same `path` prefix they were
# written under, so the reader and the writer always agree on the location.
dtrain = pd.read_feather(path + "train.ftr")
dtest = pd.read_feather(path + "test.ftr")

# Preview each frame: first three rows, last three rows, then column dtypes.
for frame in (dtrain, dtest):
    display(frame.head(3))
    display(frame.tail(3))
    display(frame.dtypes)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283302,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.450001,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


PassengerId      int16
Survived          int8
Pclass            int8
Name            object
Sex             object
Age            float32
SibSp             int8
Parch             int8
Ticket          object
Fare           float32
Cabin           object
Embarked        object
dtype: object

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.358299,,C


PassengerId      int16
Pclass            int8
Name            object
Sex             object
Age            float32
SibSp             int8
Parch             int8
Ticket          object
Fare           float32
Cabin           object
Embarked        object
dtype: object

In [5]:
# Keep only the categorical (object) columns with
# SelectCategorical().fit_transform() and preview the result for each frame.
for frame in (dtrain, dtest):
    categorical_only = SelectCategorical().fit_transform(frame)
    display(categorical_only.head())

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Kelly, Mr. James",male,330911,,Q
1,"Wilkes, Mrs. James (Ellen Needs)",female,363272,,S
2,"Myles, Mr. Thomas Francis",male,240276,,Q
3,"Wirz, Mr. Albert",male,315154,,S
4,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,3101298,,S


In [6]:
# Label encoding.
#
# The pipeline takes the categorical columns from the data frame and performs
# label encoding on them. The converted data is stored in the column whose
# suffix is given by `output_suffix`; with `output_suffix=""` the result is
# stored in the same column.
label_encoding_steps = [
    # Columns named in `exclude_cols` are left out of the selection.
    SelectCategorical(exclude_cols=["Name"]),
    LabelEncoder(output_suffix=""),
]
encoder = Pipeline(label_encoding_steps)

In [7]:
# Run fit_transform on each frame in turn (train first, then test) and
# preview the encoded columns.
for frame in (dtrain, dtest):
    display(encoder.fit_transform(frame).head())

Unnamed: 0,Sex,Ticket,Cabin,Embarked
0,0,0,-1,0
1,1,1,0,1
2,1,2,-1,0
3,1,3,1,0
4,0,4,-1,0


Unnamed: 0,Sex,Ticket,Cabin,Embarked
0,0,0,-1,0
1,1,1,-1,1
2,0,2,-1,0
3,0,3,-1,1
4,1,4,-1,1


In [8]:
# Concatenate the categorical columns in combinations, then label-encode the
# combined columns.
combination_steps = [
    # Columns named in `exclude_cols` are left out of the selection.
    SelectCategorical(exclude_cols=["Name"]),
    # If there are many categorical columns, the columns to be combined can be
    # specified with the `input_cols` keyword argument.
    # `r=2` specifies the number of columns combined into each new column.
    ConcatCombination(drop_origin=True, output_suffix="", r=2),
    LabelEncoder(output_suffix=""),
]
encoder = Pipeline(combination_steps)

### 4つのカテゴリ列から2つを選ぶ組み合わせは 4C2 = 6 通りなので、6列が出力される

In [9]:
# Run fit_transform on each frame in turn (train first, then test) and
# preview the label-encoded column combinations.
for frame in (dtrain, dtest):
    display(encoder.fit_transform(frame).head())

Unnamed: 0,SexTicket,SexCabin,SexEmbarked,TicketCabin,TicketEmbarked,CabinEmbarked
0,0,0,0,0,0,0
1,1,1,1,1,1,1
2,2,2,2,2,2,0
3,3,3,2,3,3,2
4,4,0,0,4,4,0


Unnamed: 0,SexTicket,SexCabin,SexEmbarked,TicketCabin,TicketEmbarked,CabinEmbarked
0,0,0,0,0,0,0
1,1,1,1,1,1,1
2,2,0,0,2,2,0
3,3,0,2,3,3,1
4,4,1,1,4,4,1


In [10]:
# Show the column dtypes of the train and test frames.
for frame in (dtrain, dtest):
    display(frame.dtypes)

PassengerId      int16
Survived          int8
Pclass            int8
Name            object
Sex             object
Age            float32
SibSp             int8
Parch             int8
Ticket          object
Fare           float32
Cabin           object
Embarked        object
dtype: object

PassengerId      int16
Pclass            int8
Name            object
Sex             object
Age            float32
SibSp             int8
Parch             int8
Ticket          object
Fare           float32
Cabin           object
Embarked        object
dtype: object

In [11]:
# SelectNumerical extracts only the columns of numerical data from the input
# dataframe. Columns listed in `exclude_cols` are not included in the output.
numeric_previews = [
    (dtrain, ["PassengerId", "Survived"]),
    (dtest, ["PassengerId"]),
]
for frame, skipped_cols in numeric_previews:
    selector = SelectNumerical(exclude_cols=skipped_cols)
    display(selector.fit_transform(frame).head())

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.283302
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.099998
4,3,35.0,0,0,8.05


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,34.5,0,0,7.8292
1,3,47.0,1,0,7.0
2,2,62.0,0,0,9.6875
3,3,27.0,0,0,8.6625
4,3,22.0,1,1,12.2875


In [12]:
# xfeat.ArithmeticCombinations creates new columns by applying arithmetic
# combinations. Here the numerical columns of dtrain (without PassengerId and
# Survived) are combined with the "+" operator, r=2 columns at a time.
numeric_columns = SelectNumerical(exclude_cols=["PassengerId", "Survived"])
pairwise_sums = ArithmeticCombinations(
    drop_origin=True,
    operator="+",
    r=2,
    output_suffix="",
)
encoder = Pipeline([numeric_columns, pairwise_sums])

train_sums = encoder.fit_transform(dtrain)
display(train_sums.head())

Unnamed: 0,PclassAge,PclassSibSp,PclassParch,PclassFare,AgeSibSp,AgeParch,AgeFare,SibSpParch,SibSpFare,ParchFare
0,25.0,4,3,10.25,23.0,22.0,29.25,1,8.25,7.25
1,39.0,2,1,72.283302,39.0,38.0,109.283302,1,72.283302,71.283302
2,29.0,3,3,10.925,26.0,26.0,33.924999,0,7.925,7.925
3,36.0,2,1,54.099998,36.0,35.0,88.099998,1,54.099998,53.099998
4,38.0,3,3,11.05,35.0,35.0,43.049999,0,8.05,8.05


In [13]:
# Combine the numerical columns of dtest (without PassengerId) with the "+"
# operator, r=2 columns at a time, and preview the new columns.
numeric_columns = SelectNumerical(exclude_cols=["PassengerId"])
pairwise_sums = ArithmeticCombinations(
    drop_origin=True,
    operator="+",
    r=2,
    output_suffix="",
)
encoder = Pipeline([numeric_columns, pairwise_sums])

test_sums = encoder.fit_transform(dtest)
display(test_sums.head())

Unnamed: 0,PclassAge,PclassSibSp,PclassParch,PclassFare,AgeSibSp,AgeParch,AgeFare,SibSpParch,SibSpFare,ParchFare
0,37.5,3,3,10.8292,34.5,34.5,42.329201,0,7.8292,7.8292
1,50.0,4,3,10.0,48.0,47.0,54.0,1,8.0,7.0
2,64.0,2,2,11.6875,62.0,62.0,71.6875,0,9.6875,9.6875
3,30.0,3,3,11.6625,27.0,27.0,35.662498,0,8.6625,8.6625
4,25.0,4,4,15.2875,23.0,23.0,34.287498,2,13.2875,13.2875


In [14]:
# xfeat.LambdaEncoder takes a lambda function as an argument and transforms
# the columns of the data frame.
numeric_columns = SelectNumerical(exclude_cols=["PassengerId", "Survived"])
pairwise_sums = ArithmeticCombinations(
    drop_origin=True,
    operator="+",
    r=2,
    output_suffix="",
)
# The lambda converts a value to text, keeps the first five characters, and
# parses that text back into a float.
shorten_values = LambdaEncoder(
    lambda value: float(str(value)[:5]),
    output_prefix="",
    output_suffix="",
    drop_origin=True,
)
encoder = Pipeline([numeric_columns, pairwise_sums, shorten_values])

encoder.fit_transform(dtrain).head()

Unnamed: 0,PclassAge,PclassSibSp,PclassParch,PclassFare,AgeSibSp,AgeParch,AgeFare,SibSpParch,SibSpFare,ParchFare
0,25.0,4.0,3.0,10.25,23.0,22.0,29.25,1.0,8.25,7.25
1,39.0,2.0,1.0,72.28,39.0,38.0,109.2,1.0,72.28,71.28
2,29.0,3.0,3.0,10.92,26.0,26.0,33.92,0.0,7.925,7.925
3,36.0,2.0,1.0,54.09,36.0,35.0,88.09,1.0,54.09,53.09
4,38.0,3.0,3.0,11.05,35.0,35.0,43.04,0.0,8.05,8.05


In [15]:
# Pairwise "+" combinations of dtest's numerical columns (without
# PassengerId), followed by a LambdaEncoder applied to the combined columns.
numeric_columns = SelectNumerical(exclude_cols=["PassengerId"])
pairwise_sums = ArithmeticCombinations(
    drop_origin=True,
    operator="+",
    r=2,
    output_suffix="",
)
# The lambda converts a value to text, keeps the first five characters, and
# parses that text back into a float.
shorten_values = LambdaEncoder(
    lambda value: float(str(value)[:5]),
    output_prefix="",
    output_suffix="",
    drop_origin=True,
)
encoder = Pipeline([numeric_columns, pairwise_sums, shorten_values])

encoder.fit_transform(dtest).head()

Unnamed: 0,PclassAge,PclassSibSp,PclassParch,PclassFare,AgeSibSp,AgeParch,AgeFare,SibSpParch,SibSpFare,ParchFare
0,37.5,3.0,3.0,10.82,34.5,34.5,42.32,0.0,7.829,7.829
1,50.0,4.0,3.0,10.0,48.0,47.0,54.0,1.0,8.0,7.0
2,64.0,2.0,2.0,11.68,62.0,62.0,71.68,0.0,9.687,9.687
3,30.0,3.0,3.0,11.66,27.0,27.0,35.66,0.0,8.662,8.662
4,25.0,4.0,4.0,15.28,23.0,23.0,34.28,2.0,13.28,13.28


### 【用語解説】シリアライズ・デシリアライズとは
http://cloudcafe.tech/?p=2639

シリアライズ（serialize）とは、プログラミングでオブジェクト化されたデータを、ファイルやストレージに保存したり、ネットワークで送受信したりできるような形に変換することを言います。  

逆に、シリアライズされたデータをプログラミングで扱えるようにオブジェクトの型に復元することをデシリアライズ（deserialize）といいます。  

In [16]:
# Serialize/Deserialize
# The parameters of the encoder can be serialized/deserialized by pickle.
train_label_steps = [
    SelectCategorical(exclude_cols=["Name"]),
    LabelEncoder(output_suffix=""),
]
encoder = Pipeline(train_label_steps)

dtrain_encoded = encoder.fit_transform(dtrain)

# Mode "wb" opens the file for writing binary data.
with open("label_train.pkl", "wb") as pickle_file:
    pickle.dump(encoder, pickle_file)

dtrain_encoded.head()

Unnamed: 0,Sex,Ticket,Cabin,Embarked
0,0,0,-1,0
1,1,1,0,1
2,1,2,-1,0
3,1,3,1,0
4,0,4,-1,0


In [17]:
# Load the pickled encoder back from label_train.pkl and apply transform
# (no fit call) to dtrain.
with open("label_train.pkl", "rb") as pickle_file:
    encoder = pickle.load(pickle_file)

dtrain_restored = encoder.transform(dtrain)
dtrain_restored.head()

Unnamed: 0,Sex,Ticket,Cabin,Embarked
0,0,0,-1,0
1,1,1,0,1
2,1,2,-1,0
3,1,3,1,0
4,0,4,-1,0


In [18]:
# Passing dtest through the encoder that was fitted on dtrain gives
# odd-looking codes (several -1 values appear in the preview).
dtest_via_train_encoder = encoder.transform(dtest)
dtest_via_train_encoder.head()

Unnamed: 0,Sex,Ticket,Cabin,Embarked
0,0,-1,-1,2
1,1,-1,-1,0
2,0,-1,-1,2
3,0,-1,-1,0
4,1,405,-1,0


In [19]:
# Build a fresh label-encoding pipeline, fit it on dtest, and pickle it.
test_label_steps = [
    SelectCategorical(exclude_cols=["Name"]),
    LabelEncoder(output_suffix=""),
]
encoder = Pipeline(test_label_steps)

dtest_encoded = encoder.fit_transform(dtest)

# Mode "wb" opens the file for writing binary data.
with open("label_test.pkl", "wb") as pickle_file:
    pickle.dump(encoder, pickle_file)

dtest_encoded.head()

Unnamed: 0,Sex,Ticket,Cabin,Embarked
0,0,0,-1,0
1,1,1,-1,1
2,0,2,-1,0
3,0,3,-1,1
4,1,4,-1,1


In [20]:
# Load the pickled encoder back from label_test.pkl and apply transform
# (no fit call) to dtest.
with open("label_test.pkl", "rb") as pickle_file:
    encoder = pickle.load(pickle_file)

dtest_restored = encoder.transform(dtest)
dtest_restored.head()

Unnamed: 0,Sex,Ticket,Cabin,Embarked
0,0,0,-1,0
1,1,1,-1,1
2,0,2,-1,0
3,0,3,-1,1
4,1,4,-1,1


The label-encoding mapping learned during fitting is kept in the encoder, so the same mapping can be applied to both train.csv and test.csv. Unseen values are assigned -1 in this case.  

ラベルエンコーディングのマッピングはエンコーダに保持されるため、train.csv と test.csv の両方に同じマッピングを適用できます。この場合、未知の値には -1 が割り当てられます。  