In [77]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow_addons as tfa
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from catboost import CatBoostClassifier

In [78]:
train = pd.read_csv('../Data/train_values.csv', index_col='building_id')
labels = pd.read_csv('../Data/train_labels.csv', index_col='building_id')
test = pd.read_csv('../Data/test_values.csv', index_col='building_id')

In [79]:
train.dtypes

geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_cement_mortar_brick     int64
has_superstructure_t

In [80]:
cat_cols = list(train.select_dtypes('object').columns)

bin_cols = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other'
]

all_cat_cols = cat_cols + bin_cols

num_cols = list(train.select_dtypes('number').columns)

In [100]:
ct = ColumnTransformer(
#    [('cat', OneHotEncoder(), all_cat_cols),
    [('num', StandardScaler(), num_cols)
    ],
    remainder='passthrough')

In [101]:
train_enc = ct.fit_transform(train)

In [69]:
labels_encoder = OneHotEncoder()
labels_enc = labels_encoder.fit_transform(np.array(labels).reshape(-1,1))
labels_enc = labels_enc.toarray()

In [75]:
categorical_features_indices = np.where((train_enc != np.float))[0]

In [93]:
categorical_features_indices = np.asarray([int(train.columns.get_loc(c)) for c in all_cat_cols])

array([[-0.9834136903330413, -0.5187049387843586, 1.6290546017011027,
        ..., 't', 'd', 'v'],
       [-0.7344593363138063, 0.48199787510963743, -0.9450173995037939,
        ..., 's', 'd', 'v'],
       [0.8837439648112211, -0.8191580838760184, 0.7446115797656891, ...,
        't', 'd', 'v'],
       ...,
       [0.3858352567727511, -1.5751369650743883, 0.5224724021633063, ...,
        's', 'd', 'v'],
       [1.5061298498593085, -1.6042130758897102, -1.2085677077456334,
        ..., 'j', 'd', 'v'],
       [0.8837439648112211, -1.676903352928015, 0.7797150547448312, ...,
        'j', 'd', 'v']], dtype=object)

# Train Test Split

In [104]:
X_train, X_test, y_train, y_test = train_test_split(train_enc, labels, test_size=0.3, random_state=42, shuffle=True)

In [107]:
sfs = SFS(CatBoostClassifier(n_estimators=100, verbose=False),
          k_features = X_train.shape[1],
          forward=True,
          floating=False,
          verbose=2,
          scoring='f1_micro',
          cv=2)

sfs = sfs.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s remaining:    0.0s
Traceback (most recent call last):
  File "_catboost.pyx", line 2149, in _catboost.get_float_feature
  File "_catboost.pyx", line 1122, in _catboost._FloatOrNan
  File "_catboost.pyx", line 948, in _catboost._FloatOrNanFromString
TypeError: Cannot convert 'b't'' to float

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\mcand\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\mcand\Anaconda3\lib\site-packages\catboost\core.py", line 4673, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\mcand\Anaconda3\lib\site-packages\catboost\core

<catboost.core.CatBoostClassifier at 0x2721d99d730>

In [None]:
model = keras.Sequential([
    keras.layers.Dense(108, activation='relu'),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(3, activation='relu')
])

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=[tfa.metrics.F1Score(num_classes=3), 'accuracy'])

In [None]:
history = model.fit(
    X_train,
    y_train,
    batch_size=200,
    epochs=150,
    validation_data=(X_test, y_test),
    #callbacks=[h1n1_mc, EarlyStopping],
    shuffle=True,
    verbose=1
)