In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [2]:
train_data = pd.read_csv('./data/train.csv', index_col='index')
test_data = pd.read_csv('./data/test_x.csv', index_col='index')

In [3]:
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

In [4]:
x_all_train = train_data.drop('voted', axis=1)
x_all_label = train_data['voted']

In [5]:
object_columns = x_all_train.dtypes[x_all_train.dtypes == 'object'].index.tolist()
object_columns

['age_group', 'gender', 'race', 'religion']

In [6]:
from sklearn.preprocessing import LabelEncoder
label_en = LabelEncoder()
x_all_train['label_age'] = label_en.fit_transform(x_all_train['age_group'])
test_data['label_age'] = label_en.transform(test_data['age_group'])
x_all_train['label_gender'] = label_en.fit_transform(x_all_train['gender'])
test_data['label_gender'] = label_en.transform(test_data['gender'])
x_all_train['label_race'] = label_en.fit_transform(x_all_train['race'])
test_data['label_race'] = label_en.transform(test_data['race'])
x_all_train['label_religion'] = label_en.fit_transform(x_all_train['religion'])
test_data['label_religion'] = label_en.transform(test_data['religion'])

In [7]:
x_all_train = x_all_train.drop(object_columns, axis=1)
test_data = test_data.drop(object_columns, axis=1)

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(x_all_train, x_all_label, test_size=0.3)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((31872, 76), (13660, 76), (31872,), (13660,))

In [9]:
bayesian_params = {
    'max_depth': (6, 16), 
    'num_leaves': (24, 64), 
    'min_child_samples': (10, 200), 
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha': (0.01, 50) 
}

In [10]:
def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, 
                colsample_bytree,max_bin, reg_lambda, reg_alpha):
    params = {
        "n_estimators":500, "learning_rate":0.02,
        'max_depth': int(round(max_depth)), #  호출 시 실수형 값이 들어오므로 정수형 하이퍼 파라미터는 정수형으로 변경 
        'num_leaves': int(round(num_leaves)), 
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0), 
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin':  max(int(round(max_bin)),10),
        'reg_lambda': max(reg_lambda,0),
        'reg_alpha': max(reg_alpha, 0)
    }
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'auc', verbose= 100, 
                early_stopping_rounds= 300)
    valid_proba = lgb_model.predict_proba(X_valid)[:, 1]
    roc_auc = roc_auc_score(y_valid, valid_proba)
    
    return roc_auc 

In [11]:
# BayesianOptimization객체를 수행할 함수와 search할 parameter 범위를 설정하여 생성. 
lgbBO = BayesianOptimization(lgb_roc_eval,bayesian_params , random_state=0)
# 함수 반환값이 최대가 되는 입력값 유추를 위한 iteration 수행. 
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.776366	training's binary_logloss: 0.564959	valid_1's auc: 0.766107	valid_1's binary_logloss: 0.569672
[200]	training's auc: 0.792352	training's binary_logloss: 0.544224	valid_1's auc: 0.769018	valid_1's binary_logloss: 0.55871
[300]	training's auc: 0.806635	training's binary_logloss: 0.532212	valid_1's auc: 0.769895	valid_1's binary_logloss: 0.557158
[400]	training's auc: 0.819907	training's binary_logloss: 0.521609	valid_1's auc: 0.769825	valid_1's binary_logloss: 0.556792
[500]	training's auc: 0.831652	training's binary_logloss: 0.511943	valid_1's auc: 0.769624	valid_1's binary_logloss: 0.556775
Did not meet early stopping. Best iterati

[400]	training's auc: 0.867837	training's binary_logloss: 0.483391	valid_1's auc: 0.770331	valid_1's binary_logloss: 0.555187
[500]	training's auc: 0.881811	training's binary_logloss: 0.469975	valid_1's auc: 0.769846	valid_1's binary_logloss: 0.555633
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.881811	training's binary_logloss: 0.469975	valid_1's auc: 0.769846	valid_1's binary_logloss: 0.555633
| [0m 8       [0m | [0m 0.7698  [0m | [0m 0.5237  [0m | [0m 399.1   [0m | [0m 10.06   [0m | [0m 99.13   [0m | [0m 42.12   [0m | [0m 58.58   [0m | [0m 0.4063  [0m | [0m 4.455   [0m | [0m 0.5701  [0m |
Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.768004	training's binary_logloss: 0.570149	valid_1's auc: 0.76452	valid_1's binary_logloss: 0.571164
[200]	training's auc: 0.777337	training's binary_logloss: 0.554927	valid_1's auc: 0.767237	valid_1's binary_logloss: 0.560858
[300]	training's auc: 0.7851	training's

Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.784376	training's binary_logloss: 0.559951	valid_1's auc: 0.768982	valid_1's binary_logloss: 0.567353
[200]	training's auc: 0.803933	training's binary_logloss: 0.536166	valid_1's auc: 0.771136	valid_1's binary_logloss: 0.55643
[300]	training's auc: 0.821356	training's binary_logloss: 0.521182	valid_1's auc: 0.771912	valid_1's binary_logloss: 0.554874
[400]	training's auc: 0.837512	training's binary_logloss: 0.50782	valid_1's auc: 0.772006	valid_1's binary_logloss: 0.554698
[500]	training's auc: 0.851774	training's binary_logloss: 0.495514	valid_1's auc: 0.771658	valid_1's binary_logloss: 0.55479
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.851774	training's binary_logloss: 0.495514	valid_1's auc: 0.771658	valid_1's binary_logloss: 0.55479
| [95m 16      [0m | [95m 0.7717  [0m | [95m 0.7405  [0m | [95m 120.1   [0m | [95m 12.18   [0m | [95m 53.76   [0m | [95m 42.6

Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.774382	training's binary_logloss: 0.569132	valid_1's auc: 0.766951	valid_1's binary_logloss: 0.572435
[200]	training's auc: 0.785926	training's binary_logloss: 0.549167	valid_1's auc: 0.769419	valid_1's binary_logloss: 0.559297
[300]	training's auc: 0.79706	training's binary_logloss: 0.538779	valid_1's auc: 0.770811	valid_1's binary_logloss: 0.556252
[400]	training's auc: 0.806755	training's binary_logloss: 0.530747	valid_1's auc: 0.771472	valid_1's binary_logloss: 0.555429
[500]	training's auc: 0.815524	training's binary_logloss: 0.523557	valid_1's auc: 0.771489	valid_1's binary_logloss: 0.555325
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.815524	training's binary_logloss: 0.523557	valid_1's auc: 0.771489	valid_1's binary_logloss: 0.555325
| [0m 24      [0m | [0m 0.7715  [0m | [0m 0.5741  [0m | [0m 116.0   [0m | [0m 9.64    [0m | [0m 49.82   [0m | [0m 34.27   

In [12]:
lgbBO.res

[{'target': 0.7696239538289908,
  'params': {'colsample_bytree': 0.7744067519636624,
   'max_bin': 360.44278952248555,
   'max_depth': 12.027633760716439,
   'min_child_samples': 113.52780476941041,
   'min_child_weight': 21.75908516760633,
   'num_leaves': 49.835764522666246,
   'reg_alpha': 21.884984691022,
   'reg_lambda': 8.917838234820016,
   'subsample': 0.9818313802505146}},
 {'target': 0.7714495726990379,
  'params': {'colsample_bytree': 0.6917207594128889,
   'max_bin': 397.94526866050563,
   'max_depth': 11.288949197529044,
   'min_child_samples': 117.92846660784714,
   'min_child_weight': 46.35423527634039,
   'num_leaves': 26.841442327915477,
   'reg_alpha': 4.36559369208002,
   'reg_lambda': 0.20316375600581688,
   'subsample': 0.916309922773969}},
 {'target': 0.7690128766113444,
  'params': {'colsample_bytree': 0.8890783754749252,
   'max_bin': 436.30595264094137,
   'max_depth': 15.78618342232764,
   'min_child_samples': 161.8401272011775,
   'min_child_weight': 23.61248

In [13]:
lgb_model = LGBMClassifier(
                nthread=4,
                n_estimators=1000,
                learning_rate=0.02,
                max_depth = 11,
                num_leaves=27,
                colsample_bytree=0.736,
                subsample=0.9163,
                max_bin=398,
                reg_alpha=4.366,
                reg_lambda=0.203,
                min_child_weight=46,
                min_child_samples=118,
                silent=-1,
                verbose=-1,
                )

In [14]:
lgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
             eval_metric='auc', verbose=100, early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.78156	training's binary_logloss: 0.560793	valid_1's auc: 0.769183	valid_1's binary_logloss: 0.56665
[200]	training's auc: 0.796438	training's binary_logloss: 0.540606	valid_1's auc: 0.771464	valid_1's binary_logloss: 0.556123
[300]	training's auc: 0.810189	training's binary_logloss: 0.528951	valid_1's auc: 0.771639	valid_1's binary_logloss: 0.555112
[400]	training's auc: 0.822178	training's binary_logloss: 0.519224	valid_1's auc: 0.771176	valid_1's binary_logloss: 0.555318
Early stopping, best iteration is:
[305]	training's auc: 0.810837	training's binary_logloss: 0.528434	valid_1's auc: 0.771684	valid_1's binary_logloss: 0.555074


LGBMClassifier(colsample_bytree=0.736, learning_rate=0.02, max_bin=398,
               max_depth=11, min_child_samples=118, min_child_weight=46,
               n_estimators=1000, nthread=4, num_leaves=27, reg_alpha=4.366,
               reg_lambda=0.203, silent=-1, subsample=0.9163, verbose=-1)

In [15]:
Q_E = ['QaE','QbE','QcE','QdE','QeE','QfE','QgE','QhE','QiE','QjE',
       'QkE','QlE','QmE','QnE','QoE','QpE','QqE','QrE','QsE','QtE']

In [30]:
x_all_train = train_data

In [31]:
for column in Q_E:
    x_all_train = x_all_train[x_all_train[column] < 15000]

In [18]:
for column in Q_E:
    x_all_train[column+'_log'] = np.log(x_all_train[column] + 1)

In [19]:
x_all_train = x_all_train.drop(Q_E, axis=1)

In [20]:
x_all_train.shape

(42512, 77)

In [21]:
x_all_train = train_data.drop('voted', axis=1)
x_all_label = train_data['voted']

In [22]:
label_en = LabelEncoder()
x_all_train['label_age'] = label_en.fit_transform(x_all_train['age_group'])
x_all_train['label_gender'] = label_en.fit_transform(x_all_train['gender'])
x_all_train['label_race'] = label_en.fit_transform(x_all_train['race'])
x_all_train['label_religion'] = label_en.fit_transform(x_all_train['religion'])

In [23]:
x_all_train = x_all_train.drop(object_columns, axis=1)

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(x_all_train, x_all_label, test_size=0.43)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((25953, 76), (19579, 76), (25953,), (19579,))

In [26]:
# BayesianOptimization객체를 수행할 함수와 search할 parameter 범위를 설정하여 생성. 
lgbBO = BayesianOptimization(lgb_roc_eval,bayesian_params , random_state=0)
# 함수 반환값이 최대가 되는 입력값 유추를 위한 iteration 수행. 
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.776422	training's binary_logloss: 0.565631	valid_1's auc: 0.761709	valid_1's binary_logloss: 0.572074
[200]	training's auc: 0.793349	training's binary_logloss: 0.544345	valid_1's auc: 0.764305	valid_1's binary_logloss: 0.5616
[300]	training's auc: 0.809636	training's binary_logloss: 0.530743	valid_1's auc: 0.765473	valid_1's binary_logloss: 0.559816
[400]	training's auc: 0.823597	training's binary_logloss: 0.519104	valid_1's auc: 0.765385	valid_1's binary_logloss: 0.559661
[500]	training's auc: 0.836268	training's binary_logloss: 0.508458	valid_1's auc: 0.76532	valid_1's binary_logloss: 0.559677
Did not meet early stopping. Best iteration

[300]	training's auc: 0.836248	training's binary_logloss: 0.509689	valid_1's auc: 0.767276	valid_1's binary_logloss: 0.557278
[400]	training's auc: 0.853993	training's binary_logloss: 0.49472	valid_1's auc: 0.766754	valid_1's binary_logloss: 0.557474
[500]	training's auc: 0.868578	training's binary_logloss: 0.481357	valid_1's auc: 0.766072	valid_1's binary_logloss: 0.557872
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.868578	training's binary_logloss: 0.481357	valid_1's auc: 0.766072	valid_1's binary_logloss: 0.557872
| [0m 8       [0m | [0m 0.7661  [0m | [0m 0.6344  [0m | [0m 430.9   [0m | [0m 10.56   [0m | [0m 10.4    [0m | [0m 20.87   [0m | [0m 33.8    [0m | [0m 0.8925  [0m | [0m 2.51    [0m | [0m 0.7914  [0m |
Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.76589	training's binary_logloss: 0.5726	valid_1's auc: 0.757442	valid_1's binary_logloss: 0.575774
[200]	training's auc: 0.772143	training's 

Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.777189	training's binary_logloss: 0.564369	valid_1's auc: 0.762133	valid_1's binary_logloss: 0.571126
[200]	training's auc: 0.792045	training's binary_logloss: 0.544923	valid_1's auc: 0.764596	valid_1's binary_logloss: 0.561399
[300]	training's auc: 0.806181	training's binary_logloss: 0.533059	valid_1's auc: 0.765713	valid_1's binary_logloss: 0.559741
[400]	training's auc: 0.818371	training's binary_logloss: 0.522942	valid_1's auc: 0.765562	valid_1's binary_logloss: 0.559604
[500]	training's auc: 0.829249	training's binary_logloss: 0.513753	valid_1's auc: 0.765498	valid_1's binary_logloss: 0.559569
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.829249	training's binary_logloss: 0.513753	valid_1's auc: 0.765498	valid_1's binary_logloss: 0.559569
| [0m 16      [0m | [0m 0.7655  [0m | [0m 0.7953  [0m | [0m 381.3   [0m | [0m 8.996   [0m | [0m 103.8   [0m | [0m 43.68  

[500]	training's auc: 0.849239	training's binary_logloss: 0.497639	valid_1's auc: 0.765739	valid_1's binary_logloss: 0.558541
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.849239	training's binary_logloss: 0.497639	valid_1's auc: 0.765739	valid_1's binary_logloss: 0.558541
| [0m 23      [0m | [0m 0.7657  [0m | [0m 0.652   [0m | [0m 403.9   [0m | [0m 15.49   [0m | [0m 128.7   [0m | [0m 40.26   [0m | [0m 32.06   [0m | [0m 5.745   [0m | [0m 6.553   [0m | [0m 0.5736  [0m |
Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.788367	training's binary_logloss: 0.555788	valid_1's auc: 0.763053	valid_1's binary_logloss: 0.567955
[200]	training's auc: 0.807566	training's binary_logloss: 0.533018	valid_1's auc: 0.765218	valid_1's binary_logloss: 0.559653
[300]	training's auc: 0.825282	training's binary_logloss: 0.517838	valid_1's auc: 0.765746	valid_1's binary_logloss: 0.558668
[400]	training's auc: 0.839863	trainin

In [60]:
from sklearn.preprocessing import StandardScaler

In [61]:
std_en = StandardScaler()

In [62]:
x_all_train = train_data

In [63]:
for column in Q_E:
    x_all_train = x_all_train[x_all_train[column] < 15000]

In [48]:
for column in Q_E:
    x_all_train[column] = np.log(x_all_train[column] + 1)

In [64]:
for column in Q_E:
    x_all_train[column+'_std'] = std_en.fit_transform(x_all_train[[column]])

In [65]:
x_all_train = x_all_train.drop(Q_E, axis=1)

In [66]:
x_all_train = train_data.drop('voted', axis=1)
x_all_label = train_data['voted']

In [67]:
label_en = LabelEncoder()
x_all_train['label_age'] = label_en.fit_transform(x_all_train['age_group'])
x_all_train['label_gender'] = label_en.fit_transform(x_all_train['gender'])
x_all_train['label_race'] = label_en.fit_transform(x_all_train['race'])
x_all_train['label_religion'] = label_en.fit_transform(x_all_train['religion'])

In [68]:
x_all_train = x_all_train.drop(object_columns, axis=1)

In [69]:
X_train, X_valid, y_train, y_valid = train_test_split(x_all_train, x_all_label, test_size=0.43)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((25953, 76), (19579, 76), (25953,), (19579,))

In [70]:
# BayesianOptimization객체를 수행할 함수와 search할 parameter 범위를 설정하여 생성. 
lgbBO = BayesianOptimization(lgb_roc_eval,bayesian_params , random_state=0)
# 함수 반환값이 최대가 되는 입력값 유추를 위한 iteration 수행. 
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.772738	training's binary_logloss: 0.56792	valid_1's auc: 0.765735	valid_1's binary_logloss: 0.571302
[200]	training's auc: 0.791702	training's binary_logloss: 0.546426	valid_1's auc: 0.767872	valid_1's binary_logloss: 0.559912
[300]	training's auc: 0.808354	training's binary_logloss: 0.532996	valid_1's auc: 0.768013	valid_1's binary_logloss: 0.558176
[400]	training's auc: 0.822865	training's binary_logloss: 0.521203	valid_1's auc: 0.76768	valid_1's binary_logloss: 0.557894
[500]	training's auc: 0.835661	training's binary_logloss: 0.510549	valid_1's auc: 0.767404	valid_1's binary_logloss: 0.55782
Did not meet early stopping. Best iteration

[300]	training's auc: 0.784353	training's binary_logloss: 0.549478	valid_1's auc: 0.767233	valid_1's binary_logloss: 0.559759
[400]	training's auc: 0.791918	training's binary_logloss: 0.543454	valid_1's auc: 0.767327	valid_1's binary_logloss: 0.559001
[500]	training's auc: 0.798138	training's binary_logloss: 0.538443	valid_1's auc: 0.767085	valid_1's binary_logloss: 0.558817
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.798138	training's binary_logloss: 0.538443	valid_1's auc: 0.767085	valid_1's binary_logloss: 0.558817
| [0m 8       [0m | [0m 0.7671  [0m | [0m 0.7427  [0m | [0m 319.5   [0m | [0m 9.93    [0m | [0m 111.8   [0m | [0m 35.35   [0m | [0m 35.0    [0m | [0m 36.71   [0m | [0m 3.602   [0m | [0m 0.9431  [0m |
Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.766437	training's binary_logloss: 0.573543	valid_1's auc: 0.763688	valid_1's binary_logloss: 0.574929
[200]	training's auc: 0.778519	trainin

[100]	training's auc: 0.783569	training's binary_logloss: 0.561753	valid_1's auc: 0.768121	valid_1's binary_logloss: 0.569144
[200]	training's auc: 0.806894	training's binary_logloss: 0.536217	valid_1's auc: 0.769869	valid_1's binary_logloss: 0.557828
[300]	training's auc: 0.827228	training's binary_logloss: 0.519354	valid_1's auc: 0.769428	valid_1's binary_logloss: 0.556535
[400]	training's auc: 0.844947	training's binary_logloss: 0.504415	valid_1's auc: 0.768635	valid_1's binary_logloss: 0.55677
[500]	training's auc: 0.860213	training's binary_logloss: 0.490969	valid_1's auc: 0.768226	valid_1's binary_logloss: 0.556933
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.860213	training's binary_logloss: 0.490969	valid_1's auc: 0.768226	valid_1's binary_logloss: 0.556933
| [0m 16      [0m | [0m 0.7682  [0m | [0m 0.7405  [0m | [0m 120.1   [0m | [0m 12.18   [0m | [0m 53.76   [0m | [0m 42.68   [0m | [0m 47.66   [0m | [0m 10.66   [0m | [0m 1.986   [

Training until validation scores don't improve for 300 rounds
[100]	training's auc: 0.770019	training's binary_logloss: 0.569028	valid_1's auc: 0.765769	valid_1's binary_logloss: 0.570996
[200]	training's auc: 0.786419	training's binary_logloss: 0.549686	valid_1's auc: 0.768369	valid_1's binary_logloss: 0.559749
[300]	training's auc: 0.800249	training's binary_logloss: 0.538467	valid_1's auc: 0.768775	valid_1's binary_logloss: 0.558005
[400]	training's auc: 0.812427	training's binary_logloss: 0.528699	valid_1's auc: 0.768392	valid_1's binary_logloss: 0.557604
[500]	training's auc: 0.823336	training's binary_logloss: 0.5198	valid_1's auc: 0.768199	valid_1's binary_logloss: 0.557432
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.823336	training's binary_logloss: 0.5198	valid_1's auc: 0.768199	valid_1's binary_logloss: 0.557432
| [0m 24      [0m | [0m 0.7682  [0m | [0m 0.8324  [0m | [0m 52.63   [0m | [0m 14.54   [0m | [0m 100.8   [0m | [0m 22.84   [0

In [71]:
label_en.classes_

array(['Agnostic', 'Atheist', 'Buddhist', 'Christian_Catholic',
       'Christian_Mormon', 'Christian_Other', 'Christian_Protestant',
       'Hindu', 'Jewish', 'Muslim', 'Other', 'Sikh'], dtype=object)

In [73]:
from sklearn.preprocessing import OrdinalEncoder

In [75]:
ord_ec = OrdinalEncoder()
ord_ec.fit(train_data[['religion']])

OrdinalEncoder()

In [76]:
label_en.classes_

array(['Agnostic', 'Atheist', 'Buddhist', 'Christian_Catholic',
       'Christian_Mormon', 'Christian_Other', 'Christian_Protestant',
       'Hindu', 'Jewish', 'Muslim', 'Other', 'Sikh'], dtype=object)

In [109]:
train_data['religion'].value_counts()

Atheist                 10192
Agnostic                 9624
Christian_Catholic       6431
Christian_Other          5137
Christian_Protestant     4875
Other                    4770
Hindu                    1429
Muslim                   1192
Buddhist                  850
Jewish                    487
Christian_Mormon          428
Sikh                      117
Name: religion, dtype: int64

In [110]:
religion_index = train_data['religion'].value_counts().index.tolist()
religion_index

['Atheist',
 'Agnostic',
 'Christian_Catholic',
 'Christian_Other',
 'Christian_Protestant',
 'Other',
 'Hindu',
 'Muslim',
 'Buddhist',
 'Jewish',
 'Christian_Mormon',
 'Sikh']

In [111]:
ord_ec.set_params(categories=religion_index)

OrdinalEncoder(categories=['Atheist', 'Agnostic', 'Christian_Catholic',
                           'Christian_Other', 'Christian_Protestant', 'Other',
                           'Hindu', 'Muslim', 'Buddhist', 'Jewish',
                           'Christian_Mormon', 'Sikh'])

In [112]:
test = ord_ec.fit_transform(train_data[['religion']])

ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).

In [107]:
ord_ec.categories_

[array(['Agnostic', 'Atheist', 'Buddhist', 'Christian_Catholic',
        'Christian_Mormon', 'Christian_Other', 'Christian_Protestant',
        'Hindu', 'Jewish', 'Muslim', 'Other', 'Sikh'], dtype=object)]

In [105]:
len(religion_index)

12

In [104]:
len(train_data['religion'].unique())

12

In [83]:
train_data[['religion']]

Unnamed: 0_level_0,religion
index,Unnamed: 1_level_1
0,Other
1,Hindu
2,Other
3,Hindu
4,Agnostic
...,...
45527,Jewish
45528,Atheist
45529,Christian_Other
45530,Atheist


In [None]:
|