In [None]:
# google drive
# from google.colab import drive
# drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' ## hide tf warnings
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input, InputLayer, Add, BatchNormalization, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping

# import yaml
# from drive.MyDrive.Kaggle.June_2022_na_imputation.src.functions import *

In [None]:
# Point the gcloud CLI at the project used by this notebook.
!gcloud config set project kaggle-j2022

Updated property [core/project].


In [None]:
!pip install fsspec gcsfs



In [None]:
# Choose the tf.distribute strategy used later to build and train the models:
# a TPUStrategy when a TPU can be connected to, otherwise TensorFlow's default
# strategy. Both statements inside the `try` are covered by the fallback, so a
# ValueError from either of them ends up on the default strategy.
try: # detect TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError: # presumably raised when no TPU is reachable — fall back
    # Alternative kept for reference: mirrored strategy for GPU or multi-GPU machines.
    #strategy = tf.distribute.MirroredStrategy() # for GPU or multi-GPU machines
    strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    # Alternative kept for reference: clusters of multi-GPU machines.
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

# Report how many replicas the chosen strategy keeps in sync.
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Number of accelerators:  1


In [None]:
# Competition table (indexed by `row_id`) and the submission template
# (indexed by the `row-col` key), both read straight from the GCS bucket.
# NOTE(review): the recorded run of this cell failed with HTTP 401
# ("Anonymous caller does not have storage.objects.get access"), so the
# runtime presumably needs GCS credentials before this cell — TODO confirm.
data = pd.read_csv(
    'gs://kaggle_j2022_data/Kaggle_TPS/data.csv',
    index_col='row_id',
)
#data_fake_nas = pd.read_csv('/content/drive/MyDrive/Kaggle/June_2022_na_imputation/src/data_fake_nas.csv', index_col='row_id')
sample = pd.read_csv(
    'gs://kaggle_j2022_data/Kaggle_TPS/sample_submission.csv',
    index_col='row-col',
)

_request non-retriable exception: Anonymous caller does not have storage.objects.get access to the Google Cloud Storage object., 401
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/gcsfs/retry.py", line 115, in retry_request
    return await func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/gcsfs/core.py", line 384, in _request
    validate_response(status, contents, path, args)
  File "/usr/local/lib/python3.7/dist-packages/gcsfs/retry.py", line 102, in validate_response
    raise HttpError(error)
gcsfs.retry.HttpError: Anonymous caller does not have storage.objects.get access to the Google Cloud Storage object., 401


HttpError: ignored

In [None]:
def get_missings(df):
    """Summarise the percentage of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'Column'`` and ``'Missing(Percent)'``, with one row for
        every column of ``df`` that contains at least one null, sorted by the
        percentage in descending order. When ``df`` has no nulls the result is
        an empty frame with the same two columns (the previous implementation
        raised ``UnboundLocalError`` in that case).
    """
    # Count nulls for all columns in one pass, then keep only affected columns.
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]

    # Build the summary once instead of rebuilding and re-sorting it per column.
    missings = pd.DataFrame({
        'Column': null_counts.index,
        'Missing(Percent)': null_counts.to_numpy() / len(df) * 100,
    })
    return missings.sort_values(by='Missing(Percent)', ascending=False)

# Names of the columns that contain at least one missing value, ordered from
# the highest to the lowest share of missing entries.
features_with_Nan = get_missings(data)['Column'].tolist()

In [None]:
# NOTE(review): neither `set_seed` nor `get_lists` is defined or imported in
# the visible notebook. They presumably come from the `src.functions`
# star-import that is commented out in the imports cell, in which case this
# cell raises NameError on a fresh kernel — TODO restore that import or define
# the helpers here.
set_seed(seed=69)
# The six returned values are not referenced by any other visible cell.
col_list, F1, F2, F3, F4, missing_cols = get_lists(data)

In [None]:
# Extra feature: number of missing entries in each row.
data['n_missing'] = data.isna().sum(axis='columns')

In [None]:
def high_correlated(col):
    """Return the column labels ranked 2nd to 30th by absolute correlation with ``col``.

    Correlations are computed between every column of the global ``data``
    frame and ``data[col]``. The top-ranked entry is skipped — presumably
    because it is ``col`` itself; verify if ties at |r| = 1 are possible.

    Parameters
    ----------
    col : label of a column in ``data``.

    Returns
    -------
    list
        Up to 29 column labels, strongest absolute correlation first.
    """
    abs_corr = data.corrwith(data[col]).abs()
    ranked = abs_corr.sort_values(ascending=False)
    return ranked.iloc[1:30].index.to_list()

In [None]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_pred`` and ``y_true`` (Keras backend ops)."""
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

In [None]:
def nn_model(X_train,y_train,X_val,y_val,X_test):
    """Build, compile and fit the regression network used for imputation.

    The network is a stack of swish-activated Dense layers, each followed by
    BatchNormalization, ending in a single linear output unit. It is trained
    with the ``rmse`` loss and Adam (learning rate 0.01) for at most 30 epochs.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``model.fit``.
    X_val, y_val : validation data passed to ``model.fit``; ``val_loss`` drives
        both callbacks.
    X_test : accepted for interface compatibility; not used inside this function.

    Returns
    -------
    tuple
        ``(model, history)`` — the fitted Keras model and the object returned
        by ``model.fit``.
    """
    # Widths of the hidden Dense layers, in order.
    hidden_widths = (1024, 1024, 512, 512, 128, 64)

    # Input -> (Dense + BatchNormalization) per hidden width -> linear output.
    layers = [tf.keras.layers.Input(shape = X_train.shape[1:])]
    for width in hidden_widths:
        layers.append(Dense(width, activation='swish'))
        layers.append(BatchNormalization())
    layers.append(Dense(1,   activation = 'linear'))
    model = Sequential(layers)

    model.compile(
        loss=rmse,
        optimizer=Adam(learning_rate = 0.01),
        metrics=[rmse],
    )

    # Learning-rate reduction (factor 0.5, patience 3) and early stopping
    # (patience 12, restoring the best weights), both monitoring `val_loss`.
    reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.5, patience = 3, verbose = 0)
    early_stop = EarlyStopping(monitor = 'val_loss',patience = 12, verbose = 0, mode = 'min', restore_best_weights = True)

    fit_options = dict(
        epochs=30,
        validation_data=(X_val, y_val),
        batch_size=1024,
        shuffle = True,
        callbacks = [reduce_lr, early_stop],
        verbose=1,
    )
    history = model.fit(X_train, y_train, **fit_options)

    return model,history

In [None]:
# Impute every column that contains missing values: for each such column a
# network is trained (5-fold) on the rows where the column is observed, and the
# fold models' predictions for the rows where it is missing are averaged.
data_imputed = pd.DataFrame()   # replaced after the loop by the assembled result
imputed_columns = []            # one fully imputed Series per processed column
loss_per_feature={}

# Iterate over the columns that contain Nan values
for col in tqdm(features_with_Nan):

    predictions=[]
    validation_loss=[]

    # Mask of the rows where the current column is observed
    not_null = data[col].notnull()

    # Train dataset (rows where the current column is observed)
    train = data.loc[not_null]

    # Test dataset (rows where the current column is missing)
    test = data.loc[~not_null]

    # Feature selection
    # NOTE(review): `selected_features` is only used (via `dfsplit`) to give
    # KFold something to count rows on; the model below is still trained on
    # ALL columns except `col`. Left unchanged because applying the selection
    # would change the model inputs — TODO decide whether it should be applied.
    selected_features=[n for n in high_correlated(col) if n not in ['row_id', col]]

    # Cross validation type
    kf = KFold(n_splits = 5)

    dfsplit = train[selected_features]

    # Raw test features do not depend on the fold, so build them once per column.
    X_test_raw = test.drop(col, axis = 1)

    # Splitting data to train and validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(dfsplit)):

        fold_train, fold_val = train.iloc[train_idx], train.iloc[val_idx]
        X_train, X_val = fold_train.drop(col, axis = 1), fold_val.drop(col, axis = 1)
        print(X_train.shape)
        y_train, y_val = fold_train[col], fold_val[col]

        # Fill NaNs with the TRAINING-fold medians everywhere. Using each
        # split's own medians (as before) made validation/test preprocessing
        # inconsistent with what the model and the scaler were fitted on.
        train_medians = X_train.median()
        X_train = X_train.fillna(train_medians)
        X_val = X_val.fillna(train_medians)
        X_test = X_test_raw.fillna(train_medians)

        # Standard scaling, fitted on the training fold only
        scaler = StandardScaler()
        X_train=scaler.fit_transform(X_train)
        X_val=scaler.transform(X_val)
        X_test=scaler.transform(X_test)

        # Running NN model (returns a (model, history) tuple)
        with strategy.scope():
            model = nn_model(X_train,y_train,X_val,y_val,X_test)
        net, history = model

        # Predict the missing entries of `col` with this fold's model
        y_preds = net.predict(X_test)
        predictions.append(y_preds)

        # Save loss for current fold.
        # NOTE(review): this is the LAST epoch's val_loss, which may differ
        # from the loss of the weights actually used if EarlyStopping restored
        # an earlier epoch — TODO confirm which figure should be reported.
        validation_loss.append(history.history["val_loss"][-1])

    # Average the fold models' predictions for the missing rows
    mean_values = np.array(predictions).mean(axis = 0)

    # Save mean-loss for current feature
    loss_per_feature[col] = np.mean(validation_loss)

    # Copy of the column with its missing entries replaced by the predictions
    imputed_feature = data[col].copy()
    imputed_feature.loc[~not_null] =  mean_values.ravel()
    imputed_columns.append(imputed_feature)

# Assemble all imputed columns once instead of growing a DataFrame per iteration.
if imputed_columns:
    data_imputed = pd.concat(imputed_columns, axis = 1)

# Replace columns with imputed columns 
#data[features_with_Nan] = data_imputed

  0%|          | 0/55 [00:00<?, ?it/s]

(785204, 80)
Epoch 1/30

  0%|          | 0/55 [01:03<?, ?it/s]


KeyboardInterrupt: ignored

In [None]:
# NOTE(review): debugging leftover — in the visible notebook `X_train` is only
# assigned inside the imputation loop, so this shows whatever the last executed
# fold left in the kernel. TODO remove or replace with an explicit inspection.
X_train

Unnamed: 0_level_0,F_1_0,F_1_1,F_1_2,F_1_3,F_1_4,F_1_5,F_1_6,F_1_7,F_1_8,F_1_9,...,F_4_5,F_4_6,F_4_7,F_4_8,F_4_9,F_4_10,F_4_11,F_4_12,F_4_13,F_4_14
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200022,-1.988204,-1.828772,1.077371,0.945673,-1.984582,1.146581,1.241775,-1.174940,-1.087964,0.715459,...,-3.005483,2.935064,3.066955,-0.010597,-0.662083,-1.174628,-5.804925,0.464522,3.360727,-0.221765
200023,-0.480704,1.322995,1.086004,-1.325292,1.263961,-1.118903,-0.557043,-0.195749,1.430164,-1.080607,...,-0.432463,2.132057,1.843843,0.325321,-0.102409,-0.688601,-0.178803,-0.573473,0.730111,0.442149
200024,-1.245516,-0.926732,-0.030415,-0.774921,0.700511,0.000883,-1.991891,-1.387808,-1.710535,0.698414,...,-4.214785,-1.340212,1.372317,-0.305099,-0.197510,-0.415516,3.596930,0.500959,-2.970917,0.321180
200025,-0.000968,-0.355507,-0.289029,1.910412,1.514051,1.156930,1.227460,0.964805,-2.115368,0.899339,...,0.076752,0.116101,-0.646937,-0.206988,-0.902491,-0.273789,3.902232,-0.034544,-0.940770,1.379310
200026,0.606706,-0.021642,2.120131,2.066081,-2.243087,-1.437356,0.606973,1.150498,-2.354033,0.040424,...,3.251609,4.394103,0.868230,0.112436,0.075293,0.792724,3.809432,-1.297869,2.827621,-0.621257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,-0.823740,0.285673,0.343307,-0.436747,1.700549,-1.069432,0.819698,-0.168457,-0.429074,0.844075,...,1.799592,-0.301352,5.339675,-0.991529,1.279494,-0.841051,-2.276500,1.762961,5.324553,-0.228733
999996,-0.769106,-0.387363,-1.227469,0.601183,0.351161,0.219475,-0.530277,0.853452,0.608646,1.648023,...,1.909697,-1.299360,-0.071713,-0.162173,0.072501,-0.614687,-1.265524,0.190385,-0.344112,-0.346807
999997,0.147534,-0.715276,-0.465049,-1.988941,-1.594535,-1.044882,3.159455,-0.634108,0.890382,-1.212444,...,2.891854,3.105002,-3.470520,0.017554,0.096988,0.569255,3.609790,-0.584108,-1.492096,-0.997502
999998,-1.709886,-0.813785,-1.866536,-0.179420,2.231478,1.460122,-0.220585,-0.118496,-0.140064,1.362596,...,-1.135003,-5.127360,-3.903728,-1.597023,0.893159,0.379434,0.846266,-1.085554,3.122423,0.004831


In [None]:
# Mean validation RMSE per imputed feature, worst feature first.
loss_df = (
    pd.DataFrame(loss_per_feature, index=['Validation_RMSE'])
    .transpose()
    .sort_values(by='Validation_RMSE', ascending=False)
)

loss_df

Unnamed: 0,Validation_RMSE
F_1_8,1.095792
F_3_4,1.08637
F_1_0,1.085814
F_3_14,1.082555
F_3_6,1.082257
F_3_3,1.082252
F_3_10,1.081002
F_3_16,1.080721
F_3_0,1.080389
F_3_13,1.079474


In [None]:
# Fill the submission template: every index entry is a '<row_id>-<column>' key
# naming one missing cell of the original table.
#
# The values are taken from `data_imputed`. `data` itself still holds NaN in
# exactly these cells because the write-back at the end of the imputation cell
# is commented out, so reading from `data` would produce an all-NaN submission
# on a fresh run.
row_ids, col_names = [], []
for key in sample.index:
    row_part, col_part = key.split('-', 1)
    row_ids.append(int(row_part))
    col_names.append(col_part)

# Resolve all keys to positions at once instead of one `.loc` lookup plus one
# `.loc` assignment per submission row.
row_pos = data_imputed.index.get_indexer(row_ids)
col_pos = data_imputed.columns.get_indexer(col_names)
if (row_pos < 0).any() or (col_pos < 0).any():
    # get_indexer marks unknown labels with -1; fail loudly rather than
    # silently picking the last row/column (e.g. after an interrupted run).
    raise KeyError("sample contains row/column keys that are missing from data_imputed")

sample['value'] = data_imputed.to_numpy()[row_pos, col_pos]

# NOTE(review): the Drive mount in the imports cell is commented out, so this
# relative path presumably only exists when Drive is mounted — TODO confirm.
sample.to_csv("drive/MyDrive/Kaggle/June_2022_na_imputation/src/NN_submission_GPU.csv")
sample

Unnamed: 0_level_0,value
row-col,Unnamed: 1_level_1
0-F_1_14,-0.000210
0-F_3_23,0.008720
1-F_3_24,0.014534
2-F_1_2,-0.002481
2-F_4_2,0.628245
...,...
999993-F_4_2,-0.110552
999994-F_3_10,-0.012313
999994-F_4_9,-0.127469
999997-F_3_14,-0.023711


In [None]:
# Persist the per-feature validation losses next to the submission file.
loss_df.to_csv(
    path_or_buf='drive/MyDrive/Kaggle/June_2022_na_imputation/src/val_loss_per_feat_NN_GPU.csv',
)

In [None]:
# NOTE(review): empty shell escape. The output recorded for this cell looks
# like the repr of a Python list of feature names, so it presumably belongs to
# an earlier version of the cell — TODO restore the original code or delete.
!

['F_4_4',
 'F_4_9',
 'F_4_2',
 'F_4_5',
 'F_4_3',
 'F_4_7',
 'F_4_13',
 'F_4_11',
 'F_4_10',
 'F_4_12',
 'F_4_6',
 'F_4_8',
 'F_4_14',
 'F_4_1',
 'F_1_0',
 'F_1_7',
 'F_3_21',
 'F_3_6',
 'F_2_0',
 'F_3_1',
 'F_3_17',
 'F_1_9',
 'F_2_12',
 'F_2_15',
 'F_1_2',
 'F_2_19',
 'F_2_22',
 'F_2_3',
 'F_3_4']

In [None]:
# Print the CPU description of the current runtime.
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 85
model name	: Intel(R) Xeon(R) CPU @ 3.10GHz
stepping	: 7
microcode	: 0x1
cpu MHz		: 3100.230
cache size	: 25344 KB
physical id	: 0
siblings	: 8
core id		: 0
cpu cores	: 4
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities
bugs		: spectre_v1 spectre_v2 spec_store_bypass mds swapgs taa
bo