In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler,LabelEncoder
from tensorflow.keras import metrics,regularizers
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _subdirs, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three competition files: the training rows, the test rows to be
# scored, and the sample submission that serves as the output template.
train = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/test.csv")
sub = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

**Exploratory Data Analysis (EDA)**

In [None]:
# Per-column dtype and non-null count summary of the training frame.
train.info()

In [None]:
# Descriptive statistics (count, mean, std, quantiles) of the training frame.
train.describe()

In [None]:
def plot_distributions(data, features, hue, ncols,method):
    """Draw one subplot per feature on a grid, split by the ``hue`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``features`` plus ``hue``.
    features : sequence of str
        Column names to plot; subplots are filled row by row in this order.
    hue : str
        Column used to split/colour each plot.
    ncols : int
        Number of subplot columns in the grid.
    method : {'hist', 'count', 'bar'}
        'hist'  -> ``sns.kdeplot`` of the feature,
        'count' -> ``sns.countplot`` on ``data`` sorted by the feature,
        'bar'   -> ``sns.barplot`` of the row count per (hue, feature value).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        silently produced a grid of empty axes).
    """
    if method not in ('hist', 'count', 'bar'):
        raise ValueError(
            "method must be 'hist', 'count' or 'bar', got {!r}".format(method)
        )
    if len(features) == 0:
        # Nothing to draw; plt.subplots cannot build a zero-row grid.
        return

    # Ceiling division: round() under-allocated rows whenever the last row
    # was less than half full (e.g. 13 features / 3 columns -> 4 rows).
    nrows = -(-len(features) // ncols)

    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so the grid can always be flattened and walked uniformly.
    fig, axes = plt.subplots(
        nrows, ncols, figsize=(16, round(nrows*16/ncols)), squeeze=False
    )
    flat_axes = axes.ravel()  # row-major, i.e. left-to-right then top-to-bottom

    for ax, feature in zip(flat_axes, features):
        if method == 'hist':
            sns.kdeplot(data=data, x=feature, hue=hue, ax=ax)
        elif method == 'count':
            temp = data.sort_values(feature)
            sns.countplot(data=temp, x=feature, hue=hue, ax=ax)
        else:  # 'bar'
            # Only the two grouping columns are needed, so avoid copying
            # the whole frame once per feature.
            temp = data[[hue, feature]].assign(counts=1)
            temp = temp.groupby([hue, feature], as_index=False).agg({'counts':'sum'})
            sns.barplot(data = temp, x = feature, y = 'counts', hue = hue, ax = ax)

    # Hide the trailing cells of the grid that have no feature assigned.
    for unused_ax in flat_axes[len(features):]:
        unused_ax.set_visible(False)

In [None]:
# Float-typed columns (taken from the test frame), drawn as per-target
# density curves on a 4-column grid.
float_features = test.select_dtypes(include='float').columns.tolist()
plot_distributions(train, float_features, 'target', 4, 'hist')

In [None]:
# Integer-typed columns (taken from the test frame) except the row
# identifier, drawn as per-target value counts on a 3-column grid.
int_features = test.select_dtypes(include='int').columns.tolist()
int_features.remove("id")
plot_distributions(train, int_features, 'target', 3, 'bar')

**Feature Engineering**

In [None]:
# Separate the label from the predictors and drop the row identifier from
# both the training predictors and the test frame.
train_y = train["target"]
train_x = train.drop(columns=["id", "target"])
test = test.drop(columns=["id"])

In [None]:
def create_features(data):
    """Expand the string column ``f_27`` into numeric features.

    Each of the first ten characters of ``f_27`` becomes its own integer
    column ``f_27_1`` .. ``f_27_10`` holding the character's offset from
    ``'A'``. A ``unique_characters`` column holds the number of distinct
    characters in the string. The original ``f_27`` column is dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column ``f_27`` (each value must have exactly
        ten characters to match the ten generated column names).

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``f_27_1..f_27_10``, then the remaining
        input columns, then ``unique_characters``. The input frame is not
        modified and its index is preserved.
    """
    object_data_cols = [f"f_27_{i+1}" for i in range(10)]
    # Reuse the caller's index: with a default RangeIndex the axis=1 concat
    # below would align on mismatched labels and produce NaN rows whenever
    # `data` is not indexed 0..n-1 (e.g. a filtered or shuffled frame).
    object_data = pd.DataFrame(
        data['f_27'].apply(list).tolist(),
        columns=object_data_cols,
        index=data.index,
    )
    for feature in object_data_cols:
        # Map 'A' -> 0, 'B' -> 1, ...
        object_data[feature] = object_data[feature].apply(ord) - ord('A')
    data = pd.concat([object_data,data],axis=1)
    data["unique_characters"] = data.f_27.apply(lambda s: len(set(s)))
    data = data.drop("f_27",axis=1)
    return data

In [None]:
# Apply the same f_27 expansion to the training predictors and the test
# frame so both end up with an identical set of columns.
train_x = create_features(train_x)
test = create_features(test)

In [None]:
def feature_engineering(data):
    """Add three ternary interaction indicators to ``data`` in place.

    For each group of columns the values are summed row-wise and encoded
    as +1 when the sum is above the upper threshold, -1 when it is below
    the lower threshold, and 0 otherwise:

    * ``i_02_21``    from ``f_21 + f_02``          (upper 5.2, lower -5.3)
    * ``i_05_22``    from ``f_22 + f_05``          (upper 5.1, lower -5.4)
    * ``i_00_01_26`` from ``f_00 + f_01 + f_26``   (upper 5.0, lower -5.0)

    The new columns are written onto the frame that was passed in, and
    that same frame object is returned.
    """
    def _ternary(total, upper, lower):
        # +1 above `upper`, -1 below `lower`, 0 in between.
        above = (total > upper).astype(int)
        below = (total < lower).astype(int)
        return above - below

    sum_02_21 = data.f_21 + data.f_02
    data['i_02_21'] = _ternary(sum_02_21, 5.2, -5.3)

    sum_05_22 = data.f_22 + data.f_05
    data['i_05_22'] = _ternary(sum_05_22, 5.1, -5.4)

    sum_00_01_26 = data.f_00 + data.f_01 + data.f_26
    data['i_00_01_26'] = _ternary(sum_00_01_26, 5.0, -5.0)

    return data

In [None]:
# Add the interaction indicator columns to both frames.
# feature_engineering writes the columns onto the frame it receives and
# returns that same frame, so these reassignments keep the names bound to
# the updated objects.
train_x  = feature_engineering(train_x)
test = feature_engineering(test)

**Model**

LightGBM (disabled — the cell below keeps the code inside a string literal for reference only)

In [None]:
"""y_pred = pd.DataFrame()
kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)
for fold,(tr_idx,val_idx) in enumerate(kf.split(train_x,train_y)):
    print("*****Fold {}*****".format(fold+1))
    tr_X,val_X = train_x.iloc[tr_idx],train_x.iloc[val_idx]
    tr_y,val_y = train_y.iloc[tr_idx],train_y.iloc[val_idx]
    
    lgb_train = lgb.Dataset(tr_X,tr_y)
    lgb_eval = lgb.Dataset(val_X,val_y)

    params = {"objective" : "binary",
              "metric" : "auc",
              'num_leaves': 200,
              'learning_rate': 0.1,
              'colsample_bytree': 0.9,
              'subsample': 0.8,
              'reg_alpha': 0.1,
              'random_state': 21}

    model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_train,lgb_eval],
                      num_boost_round=10000,
                      early_stopping_rounds=10)

    y_pred["fold{}".format(fold+1)] = model.predict(test)"""

Neural network

In [None]:
# 5-fold stratified cross-validation of a small dense network.
# For every fold: standardise the features using statistics from the
# training part only, train with early stopping on validation AUC, and
# store that fold's test-set probabilities as one column of `y_pred`.
y_pred = pd.DataFrame()
kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)
for fold,(tr_idx,val_idx) in enumerate(kf.split(train_x,train_y)):
    print("*****Fold {}*****".format(fold+1))
    tr_X,val_X = train_x.iloc[tr_idx],train_x.iloc[val_idx]
    tr_y,val_y = train_y.iloc[tr_idx],train_y.iloc[val_idx]

    # Fit the scaler on the training part only so no validation/test
    # statistics leak into training.
    scaler = StandardScaler()
    tr_X_scaled = scaler.fit_transform(tr_X)
    val_X_scaled = scaler.transform(val_X)
    test_scaled = scaler.transform(test)

    # Dense stack 128-64-64-32-16-8 with swish activations and a light L2
    # penalty on every hidden layer, followed by a sigmoid output unit.
    model = Sequential()
    model.add(Dense(128,kernel_regularizer=regularizers.l2(30e-6),activation="swish",input_shape=(tr_X_scaled.shape[1],)))
    for units in (64, 64, 32, 16, 8):
        model.add(Dense(units,kernel_regularizer=regularizers.l2(30e-6),activation="swish"))
    model.add(Dense(1,activation="sigmoid"))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=[metrics.AUC(name = 'auc')])

    # mode='max' is required here: with the default mode='auto',
    # ReduceLROnPlateau only treats monitors whose name contains 'acc' as
    # "higher is better", so 'val_auc' would be minimised and the learning
    # rate would be halved while the model is still improving.
    lr = ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.5,  patience=3, verbose=True)
    es = EarlyStopping(monitor='val_auc', mode='max', patience=5, restore_best_weights=True, verbose=True)

    model.fit(tr_X_scaled,tr_y,
              validation_data=(val_X_scaled, val_y),
              epochs=200,
              batch_size=4096,
              callbacks=[es,lr],
              verbose=True,
              shuffle=True)

    # predict() returns shape (n_rows, 1); squeeze to a 1-D column.
    y_pred["fold{}".format(fold+1)] = model.predict(test_scaled).squeeze()

**Submit**

In [None]:
# Average the per-fold test predictions row-wise and write the result.
# The output needs the .csv extension: the original wrote a file named
# just 'submission', which is not recognisable as a CSV submission file.
sub['target'] =  y_pred.mean(axis=1)
sub.to_csv('submission.csv', index=False)