In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import itertools
import tensorflow as tf
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import BatchNormalization, Dropout, Dense, InputLayer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping



In [2]:
# --- Load and prepare the competition data ---------------------------------
DATA_DIR = '../input/tabular-playground-series-dec-2021'

# Columns removed from both the train and the test frame before modelling.
DROPPED_SOIL_COLUMNS = ['Soil_Type7', 'Soil_Type15', 'Soil_Type1']

# Rows with this Cover_Type are excluded from training.
# NOTE(review): presumably this class is too rare to stratify on - confirm.
EXCLUDED_COVER_TYPE = 5

# Opt-in switch for the experimental features built by
# add_engineered_features(); off by default, so the pipeline behaves as before.
USE_ENGINEERED_FEATURES = False


def add_engineered_features(df):
    """Return a copy of ``df`` with the experimental engineered features.

    Adds ``total_distance_to_water`` (Euclidean norm of the horizontal and
    vertical hydrology distances) and ``mean_shade`` (average of the three
    hillshade readings), then drops the five source columns they replace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the hydrology-distance and hillshade columns.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    hydrology_cols = ['Horizontal_Distance_To_Hydrology',
                      'Vertical_Distance_To_Hydrology']
    hillshade_cols = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']

    engineered = df.assign(
        total_distance_to_water=np.sqrt(
            df['Horizontal_Distance_To_Hydrology'] ** 2
            + df['Vertical_Distance_To_Hydrology'] ** 2
        ),
        mean_shade=(
            df['Hillshade_9am'] + df['Hillshade_Noon'] + df['Hillshade_3pm']
        ) / 3,
    )
    return engineered.drop(columns=hydrology_cols + hillshade_cols)


train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Non-mutating drops: each frame is rebound explicitly instead of being
# modified in place through a loop variable.
train = train.drop(columns=DROPPED_SOIL_COLUMNS)
test = test.drop(columns=DROPPED_SOIL_COLUMNS)

if USE_ENGINEERED_FEATURES:
    train = add_engineered_features(train)
    test = add_engineered_features(test)

train = train[train['Cover_Type'] != EXCLUDED_COVER_TYPE]

# Features / target split. The labels are re-encoded to 0..n_classes-1 so they
# can be used with a sparse categorical loss; `le` maps predictions back.
X = train.drop(columns=['Cover_Type', 'Id'])
le = LabelEncoder()
y = pd.DataFrame(le.fit_transform(train['Cover_Type']))
X_test = test.drop(columns='Id')



In [3]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest fitting dtype.

    Boolean columns are first converted to integers. Every column whose dtype
    is one of int8/16/32/64 or float16/32/64 is then cast to the smallest
    dtype of the same family whose range contains the column's min and max.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be downcast to float16. Half precision keeps
        only about three significant decimal digits, so pass ``False`` when
        float columns must not lose precision (float32 is then the smallest
        float type considered).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int8', 'int16', 'int32', 'int64',
                'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = ((np.float16, np.float32) if use_float16
                        else (np.float32,))

    for col in df.columns:
        if df[col].dtype == 'bool':
            df[col] = df[col].astype(int)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 fits in
            # int8 and must not be pushed to the next wider type.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN): keep float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero memory.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Shrink both feature matrices. The training frame is processed first, so the
# two memory reports are printed in train / test order.
X, X_test = (reduce_mem_usage(frame) for frame in (X, X_test))

Mem. usage decreased to 259.40 Mb (83.7% reduction)
Mem. usage decreased to 57.22 Mb (85.3% reduction)


In [4]:
# Accumulators shared with the cross-validation loop below.
score_list = []
test_pred_list = []
history_list = []

# Training configuration.
EPOCHS = 90          # upper bound on epochs per fold
VERBOSE = 1          # verbosity handed to the Keras callbacks
SINGLE_FOLD = False
BATCH_SIZE = 800     # mini-batch size for fitting and for test-set prediction
FOLDS = 10           # number of stratified CV splits
# should be 1. increase the number of runs only if you want see how the result depends on the random seed
RUNS = 1

def my_model(X, n_classes=6):
    """Build and compile the dense classifier trained on each CV fold.

    Architecture: Dense(128, selu) -> BatchNorm -> Dense(64, relu) ->
    BatchNorm -> Dense(64, relu) -> BatchNorm -> Dense(n_classes, softmax).

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Only ``X.shape[-1]`` (the number of features) is read, to size the
        input layer.
    n_classes : int, default 6
        Width of the softmax output layer. The default matches the original
        hard-coded value.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the Adam optimizer, sparse categorical
        cross-entropy loss (integer class labels) and an ``acc`` metric.
    """
    n_features = X.shape[-1]

    model = Sequential([
        # Explicit 1-tuple: `(n)` without the trailing comma is just an int.
        InputLayer(input_shape=(n_features,)),
        Dense(128, activation='selu'),
        BatchNormalization(),
        # Optional regularisation, currently disabled: Dropout(.1) after each
        # BatchNormalization layer.
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dense(n_classes, activation='softmax'),
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

    return model

# Seed NumPy and TensorFlow once, before any model is built.
np.random.seed(1)
tf.random.set_seed(1)

for run in range(RUNS):
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=1)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y=y)):
        X_tr = X.iloc[train_idx]
        X_va = X.iloc[val_idx]
        y_tr = y.iloc[train_idx]
        y_va = y.iloc[val_idx]

        # Fit the scaler on the training part only, so no validation/test
        # statistics leak into the fold.
        scaler = StandardScaler()
        X_tr = pd.DataFrame(scaler.fit_transform(X_tr))
        X_va = pd.DataFrame(scaler.transform(X_va))

        model = my_model(X_tr)

        # Callbacks: halve the learning rate after 5 epochs without val_loss
        # improvement; stop after 10 and restore the best weights seen.
        lr = ReduceLROnPlateau(monitor='val_loss', factor=.5, patience=5, verbose=VERBOSE)
        es = EarlyStopping(monitor='val_loss', patience=10, verbose=VERBOSE, mode='min', restore_best_weights=True)

        # Train and save the model for this fold.
        history = model.fit(X_tr, y_tr, validation_data=(X_va, y_va), epochs=EPOCHS, batch_size=BATCH_SIZE,
                            validation_batch_size=len(X_va), callbacks=[lr, es], shuffle=True,
                            verbose=VERBOSE)

        history_list.append(history.history)
        model.save(f'model{run}.{fold}')

        # Validation predictions from the fitted model (EarlyStopping is
        # configured with restore_best_weights=True).
        y_va_pred = np.argmax(model.predict(X_va, batch_size=len(X_va)), axis=1)

        # Evaluation: keep the per-fold score so a CV summary can be reported.
        accuracy = accuracy_score(y_va, y_va_pred)
        score_list.append(accuracy)

        print(f'Fold {run}.{fold} : Accuracy: {accuracy:.5f}')

        # Class probabilities on the test set, scaled with this fold's scaler.
        test_pred_list.append(model.predict(scaler.transform(X_test), batch_size=BATCH_SIZE))

        # Honour the SINGLE_FOLD switch: train only the first fold of the run.
        if SINGLE_FOLD:
            break

print(f'Mean accuracy over {len(score_list)} fold(s): {np.mean(score_list):.5f}')

# Blend the folds by summing their probability arrays (argmax of the sum is
# the argmax of the mean) and map the encoded classes back to Cover_Type.
sub = pd.DataFrame()
sub['Id'] = test['Id']
sub['Cover_Type'] = le.inverse_transform(np.argmax(sum(test_pred_list), axis=1))
sub.to_csv('submission.csv', index=False)

2021-12-15 13:09:53.514468: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-15 13:09:53.612894: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-15 13:09:53.613613: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-15 13:09:53.614711: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90

Epoch 00041: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90

Epoch 00048: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90
Epoch 53/90
Epoch 54/90

Epoch 00054: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90

Epoch 00060: ReduceLROnPlateau reducing learn

2021-12-15 13:35:53.590950: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


Fold 0.0 : Accuracy: 0.96220


2021-12-15 13:36:04.752735: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1468799592 exceeds 10% of free system memory.
2021-12-15 13:36:06.197411: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1468799592 exceeds 10% of free system memory.


Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90

Epoch 00033: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90

Epoch 00041: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90

Epoch 00051: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 52/90
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90
Epoch 61/90

Epoch 00061: ReduceLROnPlateau re

2021-12-15 14:02:40.737964: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1468799592 exceeds 10% of free system memory.


Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90

Epoch 00025: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90

Epoch 00038: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90

Epoch 00044: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90

Epoch 00052: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Ep