# Categorical feature encoding challenge - Baseline v2

In [37]:
import pandas as pd

# Directory that holds the competition CSV files.
data_path = '../input/cat-in-the-dat/'

# Read the three competition files in one pass; each one uses its 'id'
# column as the DataFrame index.
train, test, submission = (
    pd.read_csv(data_path + file_name, index_col='id')
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## 1. Feature engineering
### 1.1. feature encoding

In [38]:
# Stack train and test so that every encoder below is fitted on both sets at
# once, then discard the label column (it only exists on the train rows).
all_data = pd.concat([train, test]).drop(columns='target')

#### 1.1.1. binary feature - string -> integer

In [39]:
# Encode the letter-valued binary features as 0/1 integers.
bin_mappings = {
    'bin_3': {'F': 0, 'T': 1},
    'bin_4': {'N': 0, 'Y': 1},
}

# Series.map replaces every value that is missing from the dict with NaN
# without any warning. That also happens when this cell is executed a second
# time, because the columns then hold 0/1 instead of the original letters.
# Validate every column first and only then overwrite, so a bad value (or an
# accidental re-run) raises a clear error instead of destroying the data.
bin_encoded = {}
for bin_col, bin_map in bin_mappings.items():
    source = all_data[bin_col]
    codes = source.map(bin_map)
    # Values that were present before mapping but have no code afterwards.
    # Pre-existing missing values are left alone, exactly as plain .map does.
    unmapped = source[(codes.isna() & source.notna()).values]
    if not unmapped.empty:
        raise ValueError(
            f"{bin_col}: no mapping for values {unmapped.unique().tolist()} "
            "(was this cell already executed?)"
        )
    bin_encoded[bin_col] = codes

for bin_col, codes in bin_encoded.items():
    all_data[bin_col] = codes

#### 1.1.2 ordinal feature - string -> integer

In [40]:
# ord_1, ord_2: the category labels are mapped to integer ranks by hand,
# in the order written in the dictionaries below.
ord1dict = {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4}
ord2dict = {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5}

# Series.map replaces every value that is missing from the dict with NaN
# without any warning (a misspelled label, or a second execution of this cell
# once the columns already hold integers). Validate both columns first and
# only then overwrite them, so the failure is loud and the data stays intact.
ord_encoded = {}
for ord_col, ord_map in (('ord_1', ord1dict), ('ord_2', ord2dict)):
    source = all_data[ord_col]
    codes = source.map(ord_map)
    # Values that were present before mapping but have no code afterwards.
    # Pre-existing missing values are left alone, exactly as plain .map does.
    unmapped = source[(codes.isna() & source.notna()).values]
    if not unmapped.empty:
        raise ValueError(
            f"{ord_col}: no mapping for values {unmapped.unique().tolist()} "
            "(was this cell already executed?)"
        )
    ord_encoded[ord_col] = codes

for ord_col, codes in ord_encoded.items():
    all_data[ord_col] = codes

# ord_3 through ord_5: let scikit-learn assign the integer codes.
from sklearn.preprocessing import OrdinalEncoder

ord_345 = [f'ord_{i}' for i in (3, 4, 5)]

# With default settings OrdinalEncoder derives the categories from the data;
# NOTE(review): that is presumably the sorted order of the unique values in
# each column - uncomment the loop at the bottom to confirm the order.
ord_encoder = OrdinalEncoder()
ord_345_codes = ord_encoder.fit_transform(all_data[ord_345])
all_data[ord_345] = ord_345_codes

# Optional inspection of the category order learned for each column:
# for feature, categories in zip(ord_345, ord_encoder.categories_):
#     print(feature)
#     print(categories)

#### 1.1.3 nominal feature - one hot

In [41]:
# Column names nom_0 ... nom_9.
nom_features = [f'nom_{i}' for i in range(10)]

from sklearn.preprocessing import OneHotEncoder

# One-hot encode all nominal columns in a single call. The result is kept in
# its own matrix; it is joined with the other features further down.
onehot_encoder = OneHotEncoder()
nom_frame = all_data[nom_features]
encoded_nom_matrix = onehot_encoder.fit_transform(nom_frame)

In [42]:
# The raw nominal columns are no longer needed: their one-hot encoding lives
# in encoded_nom_matrix. Note: this cell fails if run twice (columns gone).
all_data.drop(columns=nom_features, inplace=True)

#### 1.1.4 date feature - one hot

In [43]:
date_features = ['day', 'month']

# The existing onehot_encoder object is fitted again here, so from this point
# on it holds the day/month categories rather than those of its earlier fit.
date_frame = all_data[date_features]
encoded_date_matrix = onehot_encoder.fit_transform(date_frame)

# The raw date columns are replaced by encoded_date_matrix.
all_data.drop(columns=date_features, inplace=True)

### 1.2 Feature scaling

In [44]:
# Scale the ordinal features with min-max scaling.
from sklearn.preprocessing import MinMaxScaler

# Column names ord_0 ... ord_5.
ord_features = [f'ord_{i}' for i in range(6)]

ord_frame = all_data[ord_features]
all_data[ord_features] = MinMaxScaler().fit_transform(ord_frame)

In [45]:
# Join the pieces that were split off during encoding back into one matrix.
from scipy import sparse

# Column-wise layout: remaining all_data columns, then the nominal one-hot
# block, then the date one-hot block.
feature_blocks = [
    sparse.csr_matrix(all_data),
    encoded_nom_matrix,
    encoded_date_matrix,
]
all_data_sprs = sparse.hstack(feature_blocks, format='csr')

In [46]:
# Split the combined matrix back into training rows and test rows.
# train was placed first when the data was concatenated, so its rows are the
# first len(train) rows. No separate validation set is carved out here.
num_train = len(train)

X_train, X_test = all_data_sprs[:num_train], all_data_sprs[num_train:]

# Labels for the training rows.
y = train['target']

### 1.3 Hyperparameter optimization

In [47]:
%%time

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression()

lr_params = {'C': [.1, .125, .2], 'max_iter': [800, 900, 1000], 'solver': ['liblinear'], 'random_state': [42]}

gridsearch_logistic_model = GridSearchCV(estimator=logistic_model,
                                         param_grid=lr_params,
                                         scoring='roc_auc',
                                         cv=5)

gridsearch_logistic_model.fit(X_train, y)

print('최적 하이퍼파라미터:', gridsearch_logistic_model.best_params_)

최적 하이퍼파라미터: {'C': 0.125, 'max_iter': 800, 'random_state': 42, 'solver': 'liblinear'}
CPU times: total: 2min 52s
Wall time: 2min 53s


### 1.4 Prediction and submission

In [48]:
# Predict with the best estimator found by the grid search.
best_model = gridsearch_logistic_model.best_estimator_

# Keep the second probability column.
# NOTE(review): assumes the target is binary 0/1, so column 1 is the
# probability of the positive class - confirm against best_model.classes_.
y_preds = best_model.predict_proba(X_test)[:, 1]

# Fill in the sample submission and write it out ('id' index included).
submission['target'] = y_preds
submission.to_csv('submission.csv')