<a href="https://colab.research.google.com/github/Itsuki-Hamano123/auto_ml/blob/master/TPOT/radon_regressor_drop_country.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install -q tfds-nightly tensorflow matplotlib tpot
%pip show tpot

Name: TPOT
Version: 0.11.5
Summary: Tree-based Pipeline Optimization Tool
Home-page: https://github.com/EpistasisLab/tpot
Author: Randal S. Olson
Author-email: rso@randalolson.com
License: GNU/LGPLv3
Location: /usr/local/lib/python3.6/dist-packages
Requires: update-checker, scikit-learn, stopit, joblib, numpy, deap, tqdm, scipy, pandas
Required-by: 


In [2]:
import cloudpickle
import datetime
import os
import pprint

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow_datasets as tfds
from tpot import TPOTRegressor

## データセット読み込み
- radonデータセット[https://www.tensorflow.org/datasets/catalog/radon](https://www.tensorflow.org/datasets/catalog/radon)

In [3]:
def fetch_tf_dataset(data_name,
                     split='train',
                     shuffle_files=True, as_supervised=True,
                     with_info=True, batch_size=None):
  '''Fetch a dataset from TensorFlow Datasets.

  Every argument is forwarded unchanged to ``tfds.load``.

  Args:
    data_name: Name of the dataset to load.
    split: Split to load.
    shuffle_files: Forwarded to ``tfds.load``.
    as_supervised: Forwarded to ``tfds.load``.
    with_info: Whether ``tfds.load`` should also return the dataset info.
    batch_size: Forwarded to ``tfds.load``.

  Returns:
    A ``(dataset, data_info)`` tuple. ``data_info`` is ``None`` when
    ``with_info`` is false.
  '''
  loaded = tfds.load(data_name, split=split,
                     shuffle_files=shuffle_files,
                     as_supervised=as_supervised,
                     with_info=with_info,
                     batch_size=batch_size)
  # tfds.load only returns a (dataset, info) pair when with_info is true.
  # Unpacking unconditionally would fail, or silently split a 2-item
  # dataset container into the two return values.
  if with_info:
    dataset, data_info = loaded
  else:
    dataset, data_info = loaded, None
  return dataset, data_info
  


# Dataset name and the keys of the target / feature entries inside each example.
data_name = 'radon' # @param{type:'string'}
x_name = 'features' # @param{type:'string'}
y_name = 'activity' # @param{type:'string'}
# Load the data in memory,
# because TPOT does not accept a DatasetV1Adapter or a generator
batch_size = -1

# Fetch the data
tf_dataset, ds_info = fetch_tf_dataset(data_name=data_name,
                                     as_supervised=False,
                                     batch_size=batch_size)

display(ds_info)

tfds.core.DatasetInfo(
    name='radon',
    version=1.0.0,
    description='Radon is a radioactive gas that enters homes through contact
points with the ground. It is a carcinogen that is the primary cause of lung
cancer in non-smokers. Radon levels vary greatly from household to household.
This dataset contains measured radon levels in U.S homes by county and state.
The 'activity' label is the measured radon concentration in pCi/L. Important
predictors are 'floor' (the floor of the house in which the measurement was
taken), 'county' (the U.S. county in which the house is located), and 'Uppm' (a
measurement of uranium level of the soil by county).',
    homepage='http://www.stat.columbia.edu/~gelman/arm/examples/radon/',
    features=FeaturesDict({
        'activity': tf.float32,
        'features': FeaturesDict({
            'Uppm': tf.float32,
            'adjwt': tf.float32,
            'basement': tf.string,
            'cntyfips': tf.int32,
            'county': tf.string,
      

### Dataframeに変換
TPOTがTFDS形式を受け付けないため

In [4]:
%%time

def convert_tfds_to_df(tf_ds, x_key, y_key):
  '''Convert a tf dataset mapping into a single DataFrame.

  ``tf_ds[y_key]`` becomes the first column (named ``y_key``); every entry of
  ``tf_ds[x_key]`` becomes one further column, in iteration order. Entries
  whose dtype compares equal to ``'string'`` are decoded from bytes to str.
  '''
  # Target: a one-column frame keyed by y_key.
  target_frame = pd.DataFrame.from_dict({y_key: tf_ds[y_key].numpy()})

  # Features: one numpy array per feature name.
  feature_arrays = {}
  for name, tensor in tf_ds[x_key].items():
    is_text = tensor.dtype == 'string'
    values = tensor.numpy()
    feature_arrays[name] = (
        _convert_byte_np_to_string_np(bytes_np=values) if is_text else values)
  feature_frame = pd.DataFrame.from_dict(feature_arrays)

  # Target column first, then the features, side by side.
  return pd.concat([target_frame, feature_frame], axis=1)


def _convert_byte_np_to_string_np(bytes_np, decode_char='utf-8'):
  '''Convert an array of byte strings into an object-dtype array of str.

  Args:
    bytes_np: Iterable of objects exposing ``decode`` (e.g. ``bytes``).
    decode_char: Codec name passed to ``decode``.

  Returns:
    A one-dimensional numpy array of dtype ``object`` holding the decoded
    strings in input order (empty input gives an empty object array).
  '''
  # Decode everything first and build the array once: np.append allocates a
  # new array on every call, so growing element by element is quadratic.
  decoded = [b.decode(decode_char) for b in bytes_np]
  decode_np = np.array(decoded, dtype='object')
  return decode_np


# Convert the tf dataset into a DataFrame
dataset = convert_tfds_to_df(tf_dataset, x_key=x_name, y_key=y_name)


display(dataset.head())
display(dataset.shape)


# Set the column names: the target column, and every other column as a feature
y_col = y_name
x_cols = dataset.drop(columns=y_col).columns

display('y_col:{}'.format(y_col))
display('x_cols num:{}, names:{}'.format(len(x_cols), x_cols))

Unnamed: 0,activity,Uppm,adjwt,basement,cntyfips,county,dupflag,floor,idnum,lat,lon,pcterr,region,rep,room,startdt,starttm,state,state2,stfips,stopdt,stoptm,stratum,typebldg,wave,windoor,zip,zipflag
0,1.6,2.49759,292.219543,N,3,ALLEN,0,1,1681,41.091,-85.067001,12.7,2,3,2,11489,2230,IN,IN,18,11689,2235,2,1,92,,46835,0
1,0.5,2.66527,364.958313,N,25,YAVAPAI,0,1,1457,34.599998,-112.554001,0.0,2,5,2,22688,833,AZ,AZ,4,22888,1010,2,1,56,,86325,0
2,3.0,2.43343,626.564575,Y,17,MIDDLESEX,0,0,4224,42.485001,-71.391998,9.7,6,1,2,32288,1000,MA,MA,25,32488,1000,1,1,23,,1778,0
3,0.6,2.43343,570.611755,N,17,MIDDLESEX,0,1,4094,42.485001,-71.391998,27.6,6,3,3,32488,700,MA,MA,25,32688,1200,1,1,62,,1432,0
4,1.2,2.10811,426.798859,Y,95,JACKSON,0,0,6849,39.007999,-94.347,23.5,1,3,4,32288,605,MO,MO,29,32488,610,2,1,91,,64134,0


(12573, 28)

'y_col:activity'

"x_cols num:27, names:Index(['Uppm', 'adjwt', 'basement', 'cntyfips', 'county', 'dupflag', 'floor',\n       'idnum', 'lat', 'lon', 'pcterr', 'region', 'rep', 'room', 'startdt',\n       'starttm', 'state', 'state2', 'stfips', 'stopdt', 'stoptm', 'stratum',\n       'typebldg', 'wave', 'windoor', 'zip', 'zipflag'],\n      dtype='object')"

CPU times: user 8.93 s, sys: 337 ms, total: 9.26 s
Wall time: 9.27 s


## 不要なカラムを削除
- county：one-hot-encodeすると400次元という大きな次元数になるため

In [5]:
# Columns removed before encoding.
drop_columns = ['county']

# NOTE: `dataset` is rebound here, so re-running this cell without re-running
# the conversion cell raises KeyError because 'county' is already gone.
dataset = dataset.drop(columns=drop_columns)
display(dataset.columns)

Index(['activity', 'Uppm', 'adjwt', 'basement', 'cntyfips', 'dupflag', 'floor',
       'idnum', 'lat', 'lon', 'pcterr', 'region', 'rep', 'room', 'startdt',
       'starttm', 'state', 'state2', 'stfips', 'stopdt', 'stoptm', 'stratum',
       'typebldg', 'wave', 'windoor', 'zip', 'zipflag'],
      dtype='object')

## 文字列をone-hotエンコード
TPOTが文字列を受け付けないため

In [6]:
%%time

def one_hot_encode_df_val(df, target_col):
  '''Replace a column with its one-hot encoded columns.

  Each value of ``target_col`` is cast to ``str`` and treated as a single
  label. The new columns are named ``<target_col>_1`` ... ``<target_col>_K``
  in the column order produced by ``MultiLabelBinarizer`` and are appended
  after the remaining columns.

  WARNING : ``target_col`` itself is dropped from the returned frame.

  Args:
    df: Source DataFrame; it is not modified.
    target_col: Name of the column to encode.

  Returns:
    A new DataFrame with the same rows and index as ``df``.
  '''
  mlb = MultiLabelBinarizer()
  # Wrap every value in a one-element set so each row carries exactly one label.
  one_hot_feature = mlb.fit_transform([{str(val)} for val in df[target_col].values])
  # Take the width from the array shape; indexing row 0 fails on an empty frame.
  insert_columns = [target_col+'_'+str(x+1) for x in range(one_hot_feature.shape[1])]
  # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
  # RangeIndex would misalign rows when df has been filtered or re-indexed.
  insert_df = pd.DataFrame(one_hot_feature, columns=insert_columns, index=df.index)
  df = df.drop(columns=target_col)
  df = pd.concat([df, insert_df], axis=1)
  return df

# String-valued columns to replace with one-hot columns.
# NOTE: `dataset` is rebound on each iteration, so this cell cannot be re-run
# on its own once the listed columns have been replaced.
target_cols = ['basement', 'state', 'state2', 'windoor']
for col in target_cols:
  dataset = one_hot_encode_df_val(dataset, col)
display(dataset.head())


# Set the column names again, since the feature columns changed
y_col = y_name
x_cols = dataset.drop(columns=y_col).columns

display('y_col:{}'.format(y_col))
display('x_cols num:{}, names:{}'.format(len(x_cols), x_cols))

Unnamed: 0,activity,Uppm,adjwt,cntyfips,dupflag,floor,idnum,lat,lon,pcterr,region,rep,room,startdt,starttm,stfips,stopdt,stoptm,stratum,typebldg,wave,zip,zipflag,basement_1,basement_2,basement_3,basement_4,state_1,state_2,state_3,state_4,state_5,state_6,state_7,state_8,state2_1,state2_2,state2_3,state2_4,state2_5,state2_6,state2_7,state2_8,state2_9,windoor_1
0,1.6,2.49759,292.219543,3,0,1,1681,41.091,-85.067001,12.7,2,3,2,11489,2230,18,11689,2235,2,1,92,46835,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
1,0.5,2.66527,364.958313,25,0,1,1457,34.599998,-112.554001,0.0,2,5,2,22688,833,4,22888,1010,2,1,56,86325,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
2,3.0,2.43343,626.564575,17,0,0,4224,42.485001,-71.391998,9.7,6,1,2,32288,1000,25,32488,1000,1,1,23,1778,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,0.6,2.43343,570.611755,17,0,1,4094,42.485001,-71.391998,27.6,6,3,3,32488,700,25,32688,1200,1,1,62,1432,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,1.2,2.10811,426.798859,95,0,0,6849,39.007999,-94.347,23.5,1,3,4,32288,605,29,32488,610,2,1,91,64134,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1


'y_col:activity'

"x_cols num:44, names:Index(['Uppm', 'adjwt', 'cntyfips', 'dupflag', 'floor', 'idnum', 'lat', 'lon',\n       'pcterr', 'region', 'rep', 'room', 'startdt', 'starttm', 'stfips',\n       'stopdt', 'stoptm', 'stratum', 'typebldg', 'wave', 'zip', 'zipflag',\n       'basement_1', 'basement_2', 'basement_3', 'basement_4', 'state_1',\n       'state_2', 'state_3', 'state_4', 'state_5', 'state_6', 'state_7',\n       'state_8', 'state2_1', 'state2_2', 'state2_3', 'state2_4', 'state2_5',\n       'state2_6', 'state2_7', 'state2_8', 'state2_9', 'windoor_1'],\n      dtype='object')"

CPU times: user 108 ms, sys: 13.9 ms, total: 122 ms
Wall time: 123 ms


## データ分割

In [7]:
# Random seed; it is also passed to TPOTRegressor as random_state.
SEED = 7 #@param{type:'number'}

# Split the feature / target arrays, holding out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(dataset[x_cols].values, dataset[y_col].values,
                                                    test_size=0.2, random_state=SEED)

display(X_train.shape, y_train.shape)
display(X_test.shape, y_test.shape)

(10058, 44)

(10058,)

(2515, 44)

(2515,)

## TPOTで回帰モデル作成

In [8]:
# Search settings for TPOT; each value is forwarded to TPOTRegressor below.
SEARCH_STRATEGY = 'TPOT light' #@param{type:'string'}
GENERATION =  100#@param{type:'number'}
POPULATION =  100#@param{type:'number'}
CV =  5#@param{type:'number'}
EARLY_STOP_ROUND = 2 #@param{type:'number'}
N_JOBS = -1 #@param{type:'number'}
VERBOSITY = 2 #@param{type:'number'}


pipeline_optimizer = TPOTRegressor(config_dict=SEARCH_STRATEGY,
                                   generations=GENERATION, # (default:100)
                                   population_size=POPULATION, # (default:100)
                                   cv=CV, #(default:5)
                                   early_stop=EARLY_STOP_ROUND,
                                   n_jobs=N_JOBS, # (default:1)
                                   random_state=SEED,
                                   verbosity=VERBOSITY,
                                   warm_start=True, # resume from the previous fit result, if there is one
                                   )


In [9]:
# Run the TPOT pipeline search on the training split only.
pipeline_optimizer.fit(X_train, y_train)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=10100.0, style=ProgressStyle(…


Generation 1 - Current best internal CV score: -22.99877707897601
Generation 2 - Current best internal CV score: -22.883992191189385
Generation 3 - Current best internal CV score: -22.845563974161184
Generation 4 - Current best internal CV score: -22.845563974161184
Generation 5 - Current best internal CV score: -22.157072780884143
Generation 6 - Current best internal CV score: -22.073454471077174
Generation 7 - Current best internal CV score: -22.073454471077174
Generation 8 - Current best internal CV score: -21.933428756078055
Generation 9 - Current best internal CV score: -21.933428756078055
Generation 10 - Current best internal CV score: -21.225182361496813
Generation 11 - Current best internal CV score: -21.096826061289157
Generation 12 - Current best internal CV score: -21.096826061289157
Generation 13 - Current best internal CV score: -21.096826061289157
Generation 14 - Current best internal CV score: -21.096826061289157
Generation 15 - Current best internal CV score: -21.09682

TPOTRegressor(config_dict='TPOT light', crossover_rate=0.1, cv=5,
              disable_update_check=False, early_stop=2, generations=100,
              log_file=<ipykernel.iostream.OutStream object at 0x7fe7cf5e84a8>,
              max_eval_time_mins=5, max_time_mins=None, memory=None,
              mutation_rate=0.9, n_jobs=-1, offspring_size=None,
              periodic_checkpoint_folder=None, population_size=100,
              random_state=7, scoring=None, subsample=1.0, template=None,
              use_dask=False, verbosity=2, warm_start=True)

## パイプラインの出力

In [10]:
%cd /content/drive/My Drive/機械学習練習/AutoML/TPOT
%ls

/content/drive/My Drive/機械学習練習/AutoML/TPOT
[0m[01;34mexport_pipeline[0m/  radon-regressor_drop_country.ipynb  radon-regressor.ipynb


In [11]:
%%time

EXPORT_DIR =  './export_pipeline' # @param {type:'string'}
file_prefix = 'radon-regressor_drop_country_pipeline' # @param{type:'string'}

# Timestamp prefix in the form YYYYMMDD_HHMMSS. strftime directives are
# case-sensitive: %M is minutes and %S is seconds, whereas lowercase %m is the
# month and %s is a non-portable extension (epoch seconds on some platforms).
now = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
# Common path stem (no extension) shared by the exported .py and .pkl files.
export_file = os.path.join(EXPORT_DIR, now+'_'+file_prefix)

# Export the best pipeline as a Python script file
pipeline_optimizer.export(output_file_name=export_file+'.py')


def dump_pkl(obj, path):
    '''Serialize ``obj`` with cloudpickle and write it to ``path`` as a pkl file.

    Args:
        obj: Object to serialize.
        path: Destination file path; an existing file is overwritten.

    Raises:
        Whatever ``cloudpickle.dumps`` or ``open`` raise. If serialization
        fails, the destination is not opened and keeps its previous content.
    '''
    # Serialize before opening: mode 'wb' truncates the file immediately, so a
    # pickling error after open() would leave an empty file behind.
    payload = cloudpickle.dumps(obj)
    with open(path, 'wb') as f:
        f.write(payload)

# Dump the best fitted pipeline to a pkl file next to the exported script
best_pipline = pipeline_optimizer.fitted_pipeline_
dump_pkl(best_pipline, export_file+'.pkl')

%ls {EXPORT_DIR}

20201026_08101603699860_radon-regressor_pipeline.pkl
20201026_08101603699860_radon-regressor_pipeline.py
20201026_08101603700167_radon-regressor_drop_country_pipeline.pkl
20201026_08101603700167_radon-regressor_drop_country_pipeline.py
CPU times: user 8.17 ms, sys: 8.23 ms, total: 16.4 ms
Wall time: 140 ms


## モデルの評価

### TPOTモデルクラスの評価関数

In [12]:
%%time
# Score the fitted optimizer on the held-out test split.
# NOTE(review): no `scoring` was passed to TPOTRegressor; the displayed value
# looks like the negated mean squared error - confirm against the TPOT docs.
score = pipeline_optimizer.score(testing_features=X_test,
                                 testing_target=y_test)
display(score)

-43.569238127713795

CPU times: user 5.68 ms, sys: 1.31 ms, total: 7 ms
Wall time: 8.51 ms


### sklearnの評価関数

In [13]:
%%time
def _calc_score(y_true, y_pred, sklearn_metric):
  '''Compute the score of one sklearn-style metric.

  ``sklearn_metric`` is called with the keyword arguments ``y_true`` and
  ``y_pred``; its return value is passed through unchanged.
  '''
  return sklearn_metric(y_true=y_true, y_pred=y_pred)


def evaluete_sk_metrics(y_true, y_pred, sklearn_metrics):
  '''Evaluate several sklearn metric functions on the same predictions.

  Returns a dict that maps each metric's ``__name__`` to its score, in the
  order the metrics were given.
  '''
  return {
      metric.__name__: _calc_score(y_true=y_true, y_pred=y_pred, sklearn_metric=metric)
      for metric in sklearn_metrics
  }


# Metric functions to report, and one result slot per data split.
SK_METRICS=[r2_score, explained_variance_score, mean_squared_error]
eval_scores = {'train': {}, 'test': {}}

# Predict on each split with the fitted optimizer and score it with every metric;
# showing train and test side by side exposes the gap between the two.
for key, X, y_true in zip(['train', 'test'], [X_train, X_test], [y_train, y_test]):
  y_pred = pipeline_optimizer.predict(X)
  eval_scores[key] = evaluete_sk_metrics(y_true=y_true, y_pred=y_pred, sklearn_metrics=SK_METRICS)
pprint.pprint(eval_scores)

{'test': {'explained_variance_score': 0.379062269079358,
          'mean_squared_error': 43.569238127713795,
          'r2_score': 0.37899843902532915},
 'train': {'explained_variance_score': 0.9181988295072356,
           'mean_squared_error': 7.29414406915668,
           'r2_score': 0.9181988314155407}}
CPU times: user 11.8 ms, sys: 843 µs, total: 12.7 ms
Wall time: 14.6 ms
