In [1]:
!pip install datawig

Collecting numpy<1.15.0,>=1.8.2
[?25l  Downloading https://files.pythonhosted.org/packages/18/84/49b7f268741119328aeee0802aafb9bc2e164b36fc312daf83af95dae646/numpy-1.14.6-cp37-cp37m-manylinux1_x86_64.whl (13.8MB)
[K     |████████████████████████████████| 13.8MB 326kB/s 
[31mERROR: xarray 0.15.1 has requirement numpy>=1.15, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: umap-learn 0.5.1 has requirement numpy>=1.17, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: tifffile 2021.2.1 has requirement numpy>=1.15.1, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: tensorflow 2.4.1 has requirement numpy~=1.19.2, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: spacy 2.2.4 has requirement numpy>=1.15.0, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: seaborn 0.11.1 has requirement numpy>=1.15, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mERROR: pyerfa 1.7.2 has requirement numpy

### ライブラリのインポート

In [2]:
import pandas as pd
import datawig
import os
import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score

### シード値の固定

In [3]:
SEED = 42

random.seed(SEED)

### データの作成

In [4]:
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/raw/titanic.csv"

df = pd.read_csv(url, encoding="utf-8")

In [5]:
df.isnull().sum()

survived      0
pclass        0
name          0
sex           0
age         177
sibsp         0
parch         0
ticket        0
fare          0
cabin       687
embarked      2
dtype: int64

In [6]:
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
train, test = train_test_split(df, test_size=0.3)

### パラメータ

In [8]:
INPUT_COLUMNS = ['name', "sex", "sibsp", "parch", "fare", "survived"]
EPOCH = 1000

## embarked の欠損値処理

In [9]:
# 学習結果を保存するパスの作成
FILE_PATH = "imputer_model_embarked"

if os.path.isdir(FILE_PATH):
    pass
else:
    os.mkdir(FILE_PATH)

In [10]:
imputer = datawig.SimpleImputer(
    input_columns=INPUT_COLUMNS,
    output_column='embarked',
    output_path = FILE_PATH
)

In [11]:
# 学習
imputer.fit(
    train_df=train,
    num_epochs=EPOCH
    )

2021-02-27 13:20:15,038 [INFO]  CategoricalEncoder for column embarked                                found only 96 occurrences of value C
2021-02-27 13:20:15,040 [INFO]  CategoricalEncoder for column embarked                                found only 52 occurrences of value Q
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
2021-02-27 13:20:15,081 [INFO]  NumExpr defaulting to 2 threads.
2021-02-27 13:20:15,264 [INFO]  
2021-02-27 13:20:15,634 [INFO]  Epoch[0] Batch [0-18]	Speed: 850.56 samples/sec	cross-entropy=0.896939	embarked-accuracy=0.694079
2021-02-27 13:20:15,953 [INFO]  Epoch[0] Train-cross-entropy=0.761021
2021-02-27 13:20:15,957 [INFO]  Epoch[0] Train-embarked-accuracy=0.728571
2021-02-27 13:20:15,959 [INFO]  Epoch[0] Time cost=0.690
2021-02-

<datawig.simple_imputer.SimpleImputer at 0x7f86311ea150>

In [12]:
# 欠損値に相当する項目の予測
predictions = imputer.predict(test)
predictions.shape

(268, 13)

In [13]:
predictions[predictions["embarked"].isnull()]

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,embarked_imputed,embarked_imputed_proba
829,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,S,0.593263


### 非欠損部分で予測精度を検証

In [14]:
temp = predictions[predictions["embarked"].notnull()].copy()
temp.shape

(267, 13)

In [15]:
le = preprocessing.LabelEncoder()
temp["le_embarked"] = le.fit_transform(temp["embarked"])
temp["le_embarked_imputed"] = le.fit_transform(temp["embarked_imputed"])

In [16]:
temp.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,embarked_imputed,embarked_imputed_proba,le_embarked,le_embarked_imputed
709,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C,S,0.876562,0,2
439,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S,S,0.948137,2,2
840,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S,S,0.756405,2,2
720,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S,S,0.59187,2,2
39,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C,Q,0.389601,0,1


In [17]:
f1_score(temp["le_embarked"], temp["le_embarked_imputed"], average='macro')

0.639421620673713

## cabin の欠損値処理

In [18]:
FILE_PATH = "imputer_model_cabin"

if os.path.isdir(FILE_PATH):
    pass
else:
    os.mkdir(FILE_PATH)

In [19]:
imputer = datawig.SimpleImputer(
    input_columns=INPUT_COLUMNS,
    output_column='cabin',
    output_path = FILE_PATH
)

In [20]:
# 学習
imputer.fit(
    train_df=train,
    num_epochs=EPOCH
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<datawig.simple_imputer.SimpleImputer at 0x7f862dd78e50>

In [21]:
# 欠損値に相当する項目の予測
predictions = imputer.predict(test)
predictions.shape

(268, 13)

### 非欠損部分で予測精度を検証

In [22]:
temp = predictions[predictions["cabin"].notnull()].copy()
temp.shape

(65, 13)

In [23]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
temp["le_cabin"] = le.fit_transform(temp["cabin"])
temp["le_cabin_imputed"] = le.fit_transform(temp["cabin_imputed"])

In [24]:
temp.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,cabin_imputed,cabin_imputed_proba,le_cabin,le_cabin_imputed
136,1,1,"Newsom, Miss. Helen Monypeny",female,19.0,0,2,11752,26.2833,D47,S,G6,0.465823,43,37
137,0,1,"Futrelle, Mr. Jacques Heath",male,37.0,1,0,113803,53.1,C123,S,E31,0.251338,20,31
853,1,1,"Lines, Miss. Mary Conover",female,16.0,0,1,PC 17592,39.4,D28,S,E33,0.302671,41,32
621,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42.0,1,0,11753,52.5542,D19,S,B20,0.166794,38,4
110,0,1,"Porter, Mr. Walter Chamberlain",male,47.0,0,0,110465,52.0,C110,S,E46,0.070175,19,33


In [25]:
f1_score(temp["le_cabin"], temp["le_cabin_imputed"], average='macro')

0.028735632183908042

## age の欠損値処理

In [26]:
FILE_PATH = "imputer_model_age"

if os.path.isdir(FILE_PATH):
    pass
else:
    os.mkdir(FILE_PATH)

In [27]:
imputer = datawig.SimpleImputer(
    input_columns=INPUT_COLUMNS,
    output_column='age',
    output_path = FILE_PATH
)

In [28]:
# 学習
imputer.fit(
    train_df=train,
    num_epochs=EPOCH
    )

<datawig.simple_imputer.SimpleImputer at 0x7f862dda8210>

In [29]:
# 欠損値に相当する項目の予測
predictions = imputer.predict(test)
predictions.shape

(268, 12)

### 非欠損部分で予測精度を検証

In [30]:
temp = predictions[predictions["age"].notnull()].copy()
temp.shape

(215, 12)

In [31]:
temp.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,age_imputed
439,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S,35.729319
840,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S,24.359213
720,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S,14.459458
39,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C,19.566256
290,1,1,"Barber, Miss. Ellen ""Nellie""",female,26.0,0,0,19877,78.85,,S,34.928979


In [32]:
r2_score(temp["age"], temp["age_imputed"])

0.25760977146668074