This is the December 2021 Tabular Playground Series.

### Outline:
0. Load libraries and custom functions.
1. Load data.
2. Preliminary data analysis: explore features and a target, delete unneeded features, create new features.
3. Train-test split.
4. Missing values. In some cases it may be useful to explore skew and perform log-transform before imputing missing values.
5. Feature engineering. Transform skewed variables, apply one-hot encoding (OHE) and scaling.
6. Fit models.
7. Evaluate models.
8. Feature importance, error analysis. Based on the results, go to 2. and iterate.
9. Make predictions.

In [26]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import os, time, warnings, optuna, gc
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error, make_scorer
from sklearn.inspection import permutation_importance
from scipy.special import inv_boxcox
from xgboost import XGBClassifier, XGBRegressor
import lightgbm as lgb

pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')

def draw_histograms(df, variables, n_rows, n_cols):
    """Draw one 10-bin histogram per column named in ``variables``.

    The histograms are placed on a single figure laid out as an
    ``n_rows`` x ``n_cols`` subplot grid; the k-th variable goes to subplot
    position k (1-based) and is titled "<name> Distribution". The figure is
    shown before returning; nothing is returned.
    """
    # Adapted from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    figure = plt.figure()
    position = 0
    for column in variables:
        position += 1  # subplot positions start at 1
        axis = figure.add_subplot(n_rows, n_cols, position)
        df[column].hist(ax=axis, bins=10)
        axis.set_title(column + " Distribution")
    figure.tight_layout()
    plt.show()


def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """Impute missing values in the train, test and prediction frames in place.

    Fill values are derived from ``df_train`` only and then applied to every
    frame, so no statistic is computed on the held-out or prediction data.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to impute; both are modified in place.
    df_pred : pandas.DataFrame or None
        Optional third frame; pass None if it does not exist.
    num_features : sequence of column labels
        Numeric columns to impute (list, tuple, pandas Index, ...).
    cat_features : sequence of column labels or None
        Categorical columns to impute; None (or empty) skips them.
    num_fill : str
        'median' fills with the training median; any other value leaves the
        numeric columns untouched.
    cat_fill : str
        'mode' fills with the training mode, 'missing' fills with the literal
        string 'missing'; any other value leaves the columns untouched.

    Returns
    -------
    None. A status line is printed saying whether any missing values remain
    in the listed columns of the supplied frames.

    Example:
    fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)
    """
    # Normalise the column selections to plain lists so that they can be
    # concatenated regardless of the sequence type the caller used.
    num_cols = list(num_features)
    cat_cols = [] if cat_features is None else list(cat_features)
    frames = [df_train, df_test] + ([] if df_pred is None else [df_pred])

    if cat_cols:
        if cat_fill == 'mode':
            # Computed once, before df_train itself is filled.
            cat_values = df_train[cat_cols].mode().iloc[0]
        elif cat_fill == 'missing':
            cat_values = 'missing'
        else:
            cat_values = None
        if cat_values is not None:
            for frame in frames:
                frame[cat_cols] = frame[cat_cols].fillna(value=cat_values)

    if num_cols and num_fill == 'median':
        medians = df_train[num_cols].median()
        for frame in frames:
            frame[num_cols] = frame[num_cols].fillna(value=medians)

    # Verify every supplied frame (including df_pred) over all listed columns.
    checked_cols = num_cols + cat_cols
    all_good = all(not frame[checked_cols].isnull().values.any() for frame in frames)
    if all_good:
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
# END

    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """Add 'mis<feature>' indicator columns flagging missing values, in place.

    For every feature that has at least one missing value in ``df_train``, a
    new float column named ``'mis' + feature`` is written to each supplied
    frame: 1.0 where the feature is missing, 0.0 otherwise. Features that are
    complete in ``df_train`` are skipped for all frames.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to extend; both are modified in place.
    df_pred : pandas.DataFrame or None
        Optional third frame; pass None if it does not exist.
    features : iterable of str
        Column names to inspect.

    Returns
    -------
    None. The number of columns added to ``df_train`` is printed.

    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])
    """
    columns_before = df_train.shape[1]
    frames = [df_train, df_test] + ([] if df_pred is None else [df_pred])

    for feature_name in features:
        # Nothing to flag if the training data has no gaps in this feature.
        if not df_train[feature_name].isnull().any():
            continue

        mis_col_name = 'mis' + feature_name
        for frame in frames:
            frame[mis_col_name] = frame[feature_name].isnull().astype(float)

    # Computed after the loop so it is defined even when no column was added
    # (empty `features`, or no feature with missing training values).
    columns_after = df_train.shape[1]
    print(columns_after-columns_before, ' dummy features added')
# END
   

def discretize_mp_i1(df_train, df_test, df_pred, feature, ntiles, delete_feature=False):
    """Bin a continuous feature into quantile groups learned on ``df_train``.

    Quantile bin edges are computed on ``df_train[feature]`` (duplicate edges
    are dropped, so fewer than ``ntiles`` bins may result) and the same edges
    are applied to every supplied frame. The integer bin index is stored in a
    new column ``feature + 'Ntile'``. Values outside the training range fall
    in no bin and become NaN.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to extend; both are modified in place.
    df_pred : pandas.DataFrame or None
        Optional third frame; pass None if it does not exist.
    feature : str
        Name of the column to discretize.
    ntiles : int
        Requested number of quantile groups.
    delete_feature : bool
        If true, drop the original column from every supplied frame.

    Returns
    -------
    None. The number of bins actually produced is printed.

    Example: discretize_mp_i1(X_train, X_test, X_pred, 'Age', 15)
    """
    frames = [df_train, df_test] + ([] if df_pred is None else [df_pred])

    _, bin_edges = pd.qcut(df_train[feature], ntiles, retbins=True, labels=False, duplicates='drop')

    ntile_col = feature + 'Ntile'
    for frame in frames:
        frame[ntile_col] = pd.cut(frame[feature], bins=bin_edges, labels=False,
                                  duplicates='drop', include_lowest=True)

    if delete_feature:
        # Only touch frames that exist; df_pred may legitimately be None.
        for frame in frames:
            frame.drop(columns=[feature], inplace=True)

    print('Discretized ',feature, ' into ', len(bin_edges)-1, ' bins')
# END


def log_transformer_mp_i1(df_train, df_test, df_pred=None, feature_subset=False, max_skew=3):
    """Apply ``log1p`` in place to strongly right-skewed numeric columns.

    Skewness is measured on ``df_train`` only. Every candidate column whose
    skewness exceeds ``max_skew`` is replaced by ``np.log1p(column)`` in each
    supplied frame.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to transform; both are modified in place.
    df_pred : pandas.DataFrame or None
        Optional third frame; None if it does not exist.
    feature_subset : sequence of column labels, or False/None
        Columns eligible for the transform. False (the default) or None means
        all columns of ``df_train``. Labels not present in ``df_train`` are
        ignored.
    max_skew : float
        Skewness threshold above which a column is transformed.

    Returns
    -------
    None. The list of transformed columns is printed.

    Example: log_transformer_mp_i1(X_train, X_test, X_pred, feature_subset=num_cols)
    """
    # Identity checks instead of `== False`, which is ambiguous for array-like
    # subsets such as a pandas Index.
    if feature_subset is False or feature_subset is None:
        candidates = list(df_train.columns)
    else:
        train_columns = set(df_train.columns)
        # De-duplicate while keeping order, so no column is transformed twice.
        candidates = [col for col in dict.fromkeys(feature_subset) if col in train_columns]

    # Skewness is computed once; non-numeric columns are skipped rather than
    # raising.
    skewness = df_train[candidates].skew(numeric_only=True)
    skewed_cols = list(skewness[skewness > max_skew].index)

    frames = [df_train, df_test] + ([] if df_pred is None else [df_pred])
    for col in skewed_cols:
        for frame in frames:
            frame[col] = np.log1p(frame[col])

    print('Skewed columns log-transformed: ', skewed_cols)
# END
    
    
def add_dummyfeatures(df_train, df_test, df_pred, feature_dict):
    """Add indicator columns marking where a feature equals a given value.

    For each ``feature: value`` pair a float column named
    ``str(feature) + str(value)`` is written to every supplied frame: 1.0
    where the feature equals the value, 0.0 otherwise (missing values compare
    unequal and therefore get 0.0).

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to extend; both are modified in place.
    df_pred : pandas.DataFrame or None
        Optional third frame; pass None if it does not exist.
    feature_dict : dict
        Maps a column name to the value that should be flagged.

    Returns
    -------
    None. The per-frame change in column count is printed.

    Example: add_dummyfeatures(X_train, X_test, X_pred, {'RoomService':0, 'Spa':0, 'VRDeck':0, 'ShoppingMall':0})
    """
    frames = [df_train, df_test] + ([] if df_pred is None else [df_pred])
    input_dimensions = np.array([frame.shape[1] for frame in frames])

    for feature, value in feature_dict.items():
        dummy_name = str(feature) + str(value)
        for frame in frames:
            frame[dummy_name] = (frame[feature] == value).astype(float)

    output_dimensions = np.array([frame.shape[1] for frame in frames])
    print(output_dimensions-input_dimensions, ' variables created')
# END



time0 = time.time()  # wall-clock reference for the whole run


#1. Load data #

# `df` is the labelled training data, `pred` the unlabelled competition test set.
df = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
pred = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')
pred0 = pred.copy()  # untouched copy; its Id column is used when the submission is built
print(df.shape, pred.shape)

# These features are removed from the training frame only at this point; the
# same columns are dropped from `pred` in the prediction cell further down.
df.drop(columns = ['Slope','Aspect','Soil_Type5','Soil_Type7','Soil_Type15',
                  'Hillshade_3pm', 'Hillshade_9am', 'Soil_Type1', 'Soil_Type3'], inplace = True)


# 2. pEDA #

# Work on a reproducible random subsample of 1,000,000 training rows
# (presumably to keep fitting time manageable).
df = df.sample(1000000, random_state=3)

df.drop(columns = ['Id'], inplace = True)
pred.drop(columns = ['Id'], inplace = True)
print(df.Cover_Type.value_counts())
#df.head()

#[[col, df[col].nunique()] for col in df.columns]
#df.count()
df.Cover_Type.value_counts()
#df.skew()

# 3. Train-test split #

train_y = df[['Cover_Type']]
# Re-code the class labels to consecutive integers 0..5. Label 5 is not part of
# the mapping; NOTE(review): this relies on class 5 being absent from the
# sampled rows - confirm against the value counts printed above.
train_y.replace([1, 2, 3, 4, 6, 7], [0,1,2,3,4,5], inplace = True)
train_x = df.drop(columns = ['Cover_Type'])

# Hold out 5% of the sampled rows as a local test split.
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.05, random_state = 1)

print(X_train.shape, X_test.shape, y_train.shape, pred.shape)


# 4. Missing values #

#X_train.count()

#X_train.skew()

# 5. Feature engineering #

#log_transformer_mp_i1(X_train, X_test, pred)
#X_train.skew()
# all skewed variables are already one-hot encoded, so their skew does not matter.

ss = StandardScaler()

# Standardize one column at a time: the scaler is re-fit on each training
# column and then applied to the matching column of the held-out split and of
# the prediction frame, so only training statistics are used.
for col in X_train.columns:
    X_train[[col]] = ss.fit_transform(X_train[[col]])
    X_test[[col]] = ss.transform(X_test[[col]])
    pred[[col]] = ss.transform(pred[[col]])
    
gc.collect()
print(X_train.shape)

(4000000, 56) (1000000, 55)
2    566215
1    366466
3     48854
7     15486
6      2896
4        83
Name: Cover_Type, dtype: int64
(950000, 45) (50000, 45) (950000, 1) (1000000, 54)
(950000, 45)


In [None]:
# 6. Model fitting #

# Weighted-F1 scorer used as the model-selection metric for the grid searches.
f1w = make_scorer(f1_score , average='weighted')

# Baseline: logistic regression, tuning the inverse regularization strength C
# with 2-fold cross-validation on the training split.
time1 = time.time()
lr = LogisticRegression()
param_grid = {'C':[1, 10, 100]}
lrm = GridSearchCV(lr, param_grid, cv=2, scoring=f1w)
lrm.fit(X_train, y_train)
# Prints the best C, its mean cross-validated weighted F1 and the elapsed seconds.
print('Logistic', lrm.best_params_, lrm.best_score_, time.time()-time1)

In [27]:
# Fit XGBoost using Optuna

# Start of the timing window; the same variable is assigned again right
# before the study is created, so that later value is the one reported.
time1 = time.time()

def objective(trial, n_splits=2, n_jobs=-1, early_stopping_rounds=50):
    """Optuna objective: out-of-fold weighted F1 of an XGBoost classifier.

    Samples one hyper-parameter configuration from ``trial``, fits an
    ``XGBClassifier`` on each K-fold training part of the module-level
    ``X_train`` / ``y_train`` (with early stopping evaluated on the fold's
    held-out part), collects the held-out predictions of all folds and scores
    them against the true labels.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    n_splits : int
        Number of (unshuffled) K-fold splits.
    n_jobs : int
        Forwarded to ``XGBClassifier``.
    early_stopping_rounds : int
        Forwarded to ``XGBClassifier.fit``.

    Returns
    -------
    float
        Weighted F1 of the out-of-fold predictions (the study maximizes it).
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # parameter names and ranges are unchanged, so study.best_params keeps the
    # same keys.
    params = {
        "tree_method": 'gpu_hist',
        "gpu_id": 0,
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "n_estimators": 500,
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.3),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1),
        "subsample": trial.suggest_float("subsample", 0.3, 1),
        "alpha": trial.suggest_float("alpha", 0.001, 10.0, log=True),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "gamma": trial.suggest_float("gamma", 1e-8, 10.0, log=True),
        "min_child_weight": trial.suggest_float("min_child_weight", 0.1, 50, log=True),
        "n_jobs": n_jobs,
    }

    X_values = X_train.values
    # Flatten the single-column target frame to a 1-D label vector.
    y_values = y_train.values.ravel()

    model = XGBClassifier(**params)
    oof_pred = np.zeros_like(y_values)
    for train_index, test_index in KFold(n_splits=n_splits).split(X_values):
        X_A, X_B = X_values[train_index, :], X_values[test_index, :]
        y_A, y_B = y_values[train_index], y_values[test_index]
        model.fit(X_A, y_A, eval_set=[(X_B, y_B)],
                  early_stopping_rounds=early_stopping_rounds, verbose = False)
        # Each row belongs to exactly one held-out fold, so plain assignment
        # fills the out-of-fold vector.
        oof_pred[test_index] = model.predict(X_B)

    # Score against the same labels the folds were drawn from.
    return f1_score(y_values, oof_pred, average='weighted')

time1 = time.time()
# Maximize the out-of-fold weighted F1 returned by `objective` over 30 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)
print('Total time ', time.time()-time1)
# Report the best hyper-parameters, right-aligned in a 20-character column.
hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

# best_params only holds the sampled values, so the fixed settings used inside
# `objective` (GPU tree method, device id, number of trees) are added back
# before the final model is built.
optuna_hyperpars = study.best_params
optuna_hyperpars['tree_method']='gpu_hist'
optuna_hyperpars['gpu_id']=0
optuna_hyperpars['n_estimators']=500
#optuna_hyperpars
# Final model: refit on the full training split with the best configuration
# (no eval_set / early stopping is passed here, unlike inside `objective`).
optuna_xgb = XGBClassifier(**optuna_hyperpars)
optuna_xgb.fit(X_train, y_train)

[32m[I 2022-06-09 01:23:48,520][0m A new study created in memory with name: no-name-069c839f-700b-47ef-80f8-e23f6754cfff[0m
[32m[I 2022-06-09 01:24:55,989][0m Trial 0 finished with value: 0.9584389417187137 and parameters: {'max_depth': 5, 'learning_rate': 0.29535515718703187, 'colsample_bytree': 0.5066221863279529, 'subsample': 0.6826572392683761, 'alpha': 4.509393388813394, 'lambda': 0.0005266980188326377, 'gamma': 1.2764179485806539e-06, 'min_child_weight': 6.937299033354326}. Best is trial 0 with value: 0.9584389417187137.[0m
[32m[I 2022-06-09 01:26:06,442][0m Trial 1 finished with value: 0.9556603229937607 and parameters: {'max_depth': 6, 'learning_rate': 0.20367503092088726, 'colsample_bytree': 0.43423686736648603, 'subsample': 0.3191481405440985, 'alpha': 4.156795098890419, 'lambda': 0.010015128712165837, 'gamma': 2.0898745712528777e-06, 'min_child_weight': 46.373549320787355}. Best is trial 0 with value: 0.9584389417187137.[0m
[32m[I 2022-06-09 01:26:41,500][0m Trial

Total time  4728.104464292526
           max_depth : 7
       learning_rate : 0.11250606762391409
    colsample_bytree : 0.7938198392071085
           subsample : 0.8397323870529677
               alpha : 0.005359657215045861
              lambda : 0.014329788062220945
               gamma : 7.183796127900205e-05
    min_child_weight : 0.4706698914121564
best objective value : 0.959176518319491


XGBClassifier(alpha=0.005359657215045861, base_score=0.5, booster='gbtree',
              callbacks=None, colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.7938198392071085, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None,
              gamma=7.183796127900205e-05, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              lambda=0.014329788062220945, learning_rate=0.11250606762391409,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=7,
              max_leaves=0, min_child_weight=0.4706698914121564, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto', ...)

In [28]:
# Weighted-F1 scorer used as the model-selection metric.
f1w = make_scorer(f1_score , average='weighted')

time1 = time.time()

# Alternative tuning route for comparison with the Optuna study: exhaustive
# 2-fold grid search over learning rate (eta) and tree depth with 300 trees.
xgb = XGBClassifier(tree_method = 'gpu_hist', gpu_id = 0)
param_grid = {'eta':[0.1, 0.2, 0.3], 'max_depth':[4,6,8], 'n_estimators':[300]}
xgbm = GridSearchCV(xgb, param_grid, cv=2, scoring=f1w, verbose=1)

xgbm.fit(X_train, y_train)
# Prints the best grid point, its mean cross-validated weighted F1 and the elapsed seconds.
print('XGB', xgbm.best_params_, xgbm.best_score_, time.time()-time1)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
XGB {'eta': 0.2, 'max_depth': 8, 'n_estimators': 300} 0.9589380364372453 486.0761115550995


In [29]:
# Visualize the hyper-parameter importances estimated from the finished Optuna study.
optuna.visualization.plot_param_importances(study)

In [30]:
# Slice plot of the study: objective value against each sampled hyper-parameter.
optuna.visualization.plot_slice(study)

# 

In [31]:
# 7. Model evaluation #

# Each line prints two weighted-F1 scores: first on the training split
# (in-sample, hence the "IS" label), then on the held-out 5% split.

#print('Logistic IS', f1_score(y_train, lrm.predict(X_train), average='weighted'), 
#      f1_score(y_test, lrm.predict(X_test), average='weighted'))

# XGBoost tuned by grid search.
print('XGB_gs IS', f1_score(y_train, xgbm.predict(X_train), average='weighted'), 
      f1_score(y_test, xgbm.predict(X_test), average='weighted'))

# XGBoost tuned by Optuna.
print('XGB_Optuna IS', f1_score(y_train, optuna_xgb.predict(X_train), average='weighted'), 
      f1_score(y_test, optuna_xgb.predict(X_test), average='weighted'))


XGB_gs IS 0.9805404614663118 0.9603984963962868
XGB_Optuna IS 0.9752105379666629 0.9604782680304427


In [23]:
# feature importance #

# Permutation importance of the Optuna-tuned model, measured on the held-out
# split; the mean importance per feature is tabulated and sorted descending.
results = permutation_importance(optuna_xgb, X_test, y_test, n_jobs=-1)
fi = pd.DataFrame({'col':X_test.columns, 'FI':results.importances_mean})
fi = fi.sort_values('FI', ascending = False)
fi

Unnamed: 0,col,FI
0,Elevation,0.462276
3,Horizontal_Distance_To_Roadways,0.050991
5,Horizontal_Distance_To_Fire_Points,0.0315
8,Wilderness_Area3,0.020405
2,Vertical_Distance_To_Hydrology,0.017207
6,Wilderness_Area1,0.011685
1,Horizontal_Distance_To_Hydrology,0.011141
43,Soil_Type39,0.007698
42,Soil_Type38,0.006796
15,Soil_Type10,0.005608


In [10]:
# Display the training features.
# NOTE(review): the captured output lists Aspect/Slope, which the data-loading
# cell drops, so it looks like it comes from an earlier run - confirm.
X_train

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
366266,0.911931,-1.361045,0.803411,3.098012,2.478975,-0.198322,1.078583,-0.941120,-0.059149,0.406448,...,-0.169528,-0.19819,-0.197285,-0.11282,-0.126101,-0.104283,-0.112389,-0.20703,-0.203746,-0.178611
2727183,-2.170251,-0.105683,0.102755,0.560129,-0.317980,-0.527786,0.329476,0.807619,-0.173422,-1.270972,...,-0.169528,-0.19819,-0.197285,-0.11282,-0.126101,-0.104283,-0.112389,-0.20703,-0.203746,-0.178611
2481996,-0.965083,-0.123876,0.336307,-1.011152,0.920255,0.959359,0.948303,-1.927589,0.237960,0.020987,...,-0.169528,-0.19819,-0.197285,-0.11282,-0.126101,-0.104283,-0.112389,-0.20703,-0.203746,-0.178611
2223972,-0.490635,-1.179108,0.219531,-0.905223,-0.419953,-1.069050,1.241432,-1.389515,-0.059149,-0.486758,...,-0.169528,-0.19819,-0.197285,-0.11282,-0.126101,-0.104283,-0.112389,-0.20703,-0.203746,-0.178611
137685,0.430557,1.395294,1.387292,0.551301,1.838006,0.267787,1.404281,1.345692,-0.493385,1.058631,...,-0.169528,-0.19819,-0.197285,-0.11282,-0.126101,-0.104283,-0.112389,-0.20703,-0.203746,-0.178611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3518093,-0.507951,1.604521,-0.130797,0.405649,-0.347115,-0.111780,-0.940747,0.090187,1.472103,0.417968,...,-0.169528,-0.19819,-0.197285,-0.11282,-0.126101,-0.104283,-0.112389,-0.20703,-0.203746,-0.178611
1235702,-0.646476,-1.288270,-1.648886,-0.565367,2.449840,-0.276513,0.818024,1.480211,-0.219131,-0.806646,...,-0.169528,-0.19819,-0.197285,-0.11282,-0.126101,-0.104283,-0.112389,-0.20703,-0.203746,-0.178611
3761920,-0.767685,0.303675,-0.597901,-0.424128,-0.623897,-0.825367,-0.484770,1.076656,-1.384711,-0.235101,...,-0.169528,-0.19819,-0.197285,-0.11282,-0.126101,-0.104283,-0.112389,-0.20703,-0.203746,-0.178611
223019,0.711070,1.604521,0.219531,-0.706606,-0.419953,-0.082174,0.427186,-0.941120,-0.607657,0.044026,...,-0.169528,-0.19819,-0.197285,-0.11282,-0.126101,-0.104283,-0.112389,-0.20703,-0.203746,-0.178611


In [None]:
# Display the prediction frame.
pred

In [33]:
# 9. Make predictions #

# Remove the features that were dropped from the training data so that `pred`
# has the same columns the model was fit on. errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError once the columns
# are already gone.
pred.drop(columns = ['Slope','Aspect','Soil_Type5','Soil_Type7','Soil_Type15',
                  'Hillshade_3pm', 'Hillshade_9am', 'Soil_Type1', 'Soil_Type3'],
          inplace = True, errors = 'ignore')
temp = optuna_xgb.predict(pred)
temp = pd.DataFrame(temp, columns = ['Cover_Type'])
print(temp.head())
# Map the model's 0..5 class indices back to the original Cover_Type labels
# (inverse of the re-coding applied to the training target).
temp.replace([0,1,2,3,4,5], [1, 2, 3, 4, 6, 7], inplace = True)
print(temp.head())

   Cover_Type
0           1
1           1
2           1
3           1
4           1
   Cover_Type
0           2
1           2
2           2
3           2
4           2


In [34]:

#yhat = optuna_xgb.predict(pred)

# Build the submission: Ids from the untouched copy of the test set paired with
# the predicted labels (both frames are aligned on their row index).
submission_df_xgb = pd.DataFrame({'Id': pred0.Id, 'Cover_Type': temp['Cover_Type']}, columns=['Id', 'Cover_Type'])
# The commented line below refers to a `Transported` column that does not exist
# in this data; it looks like a leftover from another notebook.
#submission_df_bt.Transported = np.array([bool(x) for x in submission_df_bt.Transported])
submission_df_xgb.to_csv('KP14_xgb.csv',index=False)

# NOTE(review): the CSV above is written relative to the working directory in
# effect *before* this chdir, while the FileLink below is resolved afterwards;
# the two only agree if the notebook already runs in /kaggle/working - confirm.
os.chdir(r'/kaggle/working')

# Render a download link for the submission file in the notebook output.
from IPython.display import FileLink
FileLink(r'KP14_xgb.csv')