In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling
import cufflinks as cf
import plotly.offline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import f1_score

DATA_DIR = "/home/gangda/workspace/ds-richter/data"


X = pd.read_csv(DATA_DIR + '/train_values.csv', index_col='building_id')
y = pd.read_csv(DATA_DIR + '/train_labels.csv', index_col='building_id')
y -= 1

# Identify categorical and numerical columns
categorical_columns = [
    "land_surface_condition", "foundation_type", "roof_type",
    "ground_floor_type", "other_floor_type", "position",
    "plan_configuration", "legal_ownership_status",
]
numerical_columns = [
    "geo_level_1_id", "geo_level_2_id", "geo_level_3_id",
    "count_floors_pre_eq", "age", "area_percentage",
    "height_percentage", "count_families"
]
other_columns = [
    "has_superstructure_adobe_mud",
    "has_superstructure_mud_mortar_stone",
    "has_superstructure_stone_flag",
    "has_superstructure_cement_mortar_stone",
    "has_superstructure_mud_mortar_brick",
    "has_superstructure_cement_mortar_brick",
    "has_superstructure_timber",
    "has_superstructure_bamboo",
    "has_superstructure_rc_non_engineered",
    "has_superstructure_rc_engineered",
    "has_superstructure_other",
    "has_secondary_use",
    "has_secondary_use_agriculture",
    "has_secondary_use_hotel",
    "has_secondary_use_rental",
    "has_secondary_use_institution",
    "has_secondary_use_school",
    "has_secondary_use_industry",
    "has_secondary_use_health_post",
    "has_secondary_use_gov_office",
    "has_secondary_use_use_police",
    "has_secondary_use_other",
]

In [12]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numerical_columns),
    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ("other", FunctionTransformer(validate=False), other_columns),
])

params = {
    'objective': 'multi:softmax',
    'num_class': 3,  # Number of classes
    'eval_metric': 'mlogloss',
    'learning_rate': 0.1,
    'max_depth': 9,
    'n_estimators': 500,
}

In [15]:
from sklearn.model_selection import KFold
import xgboost as xgb

kf = KFold(n_splits=5)
best_models = []
val_acc = 0
for train_index, val_index in kf.split(X):
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index, :]
    X_val = X.iloc[val_index, :]
    y_val = y.iloc[val_index, :]

    model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", XGBClassifier(random_state=42, **params))
    ])
    model.fit(X_train, y_train)
    val_preds = model.predict(X_val)

    # train_x = preprocessor.fit_transform(X_train)
    # valid_x = preprocessor.transform(X_val)
    # dtrain = xgb.DMatrix(train_x, label=y_train)
    # dval = xgb.DMatrix(valid_x, label=y_val)

    # model = xgb.train(params,
    #                   dtrain,
    #                   # num_boost_round=500,
    #                   evals=[(dtrain, 'train'), (dval, 'validation')],
    #                   early_stopping_rounds=100,
    #                   verbose_eval=True)
    # val_preds = model.predict(dval)

    score = f1_score(y_val, val_preds, average='micro')
    print(score)
    val_acc += score
    best_models.append(model)

print("\n ========================================================")
print(f'val_acc: {val_acc/len(best_models): .5f}')

0.7466664108516721
0.742344589409056
0.7484650805832693
0.7448004604758249
0.7459516500383728

val_acc:  0.74565


In [19]:
val_probs = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_val = X.iloc[val_index, :]
    val_probs.append(best_models[i].predict_proba(X_val))

val_probs = np.concatenate(val_probs, axis=0)
val_probs

array([[2.8083611e-03, 3.4647098e-01, 6.5072066e-01],
       [2.5451034e-03, 5.9818465e-01, 3.9927024e-01],
       [1.0440555e-02, 2.5882798e-01, 7.3073149e-01],
       ...,
       [5.4383470e-04, 5.1469464e-02, 9.4798666e-01],
       [2.9753974e-01, 6.9838798e-01, 4.0722722e-03],
       [1.0168686e-03, 3.6412862e-01, 6.3485450e-01]], dtype=float32)

In [20]:
X_test = pd.read_csv(DATA_DIR + '/test_values.csv', index_col='building_id')

test_probs = None
for best_model in best_models:
    if test_probs is None:
        test_probs = best_model.predict_proba(X_test)
    else:
        test_probs += best_model.predict_proba(X_test)

test_probs /= len(best_models)
test_probs

array([[7.5468223e-04, 2.2502311e-01, 7.7422220e-01],
       [1.6795030e-03, 8.3807909e-01, 1.6024137e-01],
       [7.4247234e-03, 8.0637091e-01, 1.8620428e-01],
       ...,
       [5.0396997e-02, 6.8840945e-01, 2.6119357e-01],
       [3.4430132e-03, 8.0507171e-01, 1.9148526e-01],
       [5.1186615e-01, 4.7126403e-01, 1.6869843e-02]], dtype=float32)

In [21]:
version = 1

In [22]:
np.save(DATA_DIR + '/intermediate/xgboost_{}fold_v{}_train.npy'.format(kf.n_splits, version), val_probs)
np.save(DATA_DIR + '/intermediate/xgboost_{}fold_v{}_test.npy'.format(kf.n_splits, version), test_probs)

In [32]:
test_preds = np.argmax(test_probs, axis=1) + 1

submission_format = pd.read_csv(DATA_DIR + '/submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=test_preds,
                             columns=submission_format.columns,
                             index=submission_format.index)
my_submission['damage_grade'] = my_submission['damage_grade'].astype(int)
my_submission.to_csv(DATA_DIR + '/submission/lightgbm_{}fold_v{}.csv'.format(kf.n_splits, version))