In [None]:
# Mount Google Drive at /content/drive so the project's data files and Python
# modules stored there can be reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import os
# Set before tensorflow is imported below so the log level applies from start-up.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' ## hide tf warnings
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input, InputLayer, Add, BatchNormalization, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping

import yaml
# Project-local helpers loaded from the mounted Drive folder.
# NOTE(review): these star imports hide where `set_seed`, `get_lists` and
# `nn_model` (used later in this notebook) are defined — presumably in these two
# modules; prefer explicit imports once confirmed.
# NOTE(review): importing `drive.MyDrive...` as a package presumably relies on
# the working directory being /content — confirm.
from drive.MyDrive.Kaggle.Imputation_062022.src.functions import *
from drive.MyDrive.Kaggle.Imputation_062022.models.MLP1 import *

Mounted at /content/drive


In [None]:
# Pick a tf.distribute strategy: use a TPU when one can be resolved, otherwise
# fall back to TensorFlow's default strategy. Any ValueError raised inside the
# `try` body (resolver connection or TPUStrategy construction) triggers the
# fallback.
try: # detect TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError: # no TPU available: fall back to CPU / GPU
    # Alternative (disabled): MirroredStrategy for GPU or multi-GPU machines
    #strategy = tf.distribute.MirroredStrategy() # for GPU or multi-GPU machines
    strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    # Alternative (disabled): MultiWorkerMirroredStrategy for clusters of multi-GPU machines
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

# Report how many replicas the chosen strategy keeps in sync.
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Number of accelerators:  1


In [None]:
# Load the dataset to impute, indexed by its `row_id` column.
# NOTE(review): the paths are relative, so they resolve against the current
# working directory — presumably /content, where Drive is mounted; confirm.
data = pd.read_csv('drive/MyDrive/Kaggle/Imputation_062022/src/data/data.csv', index_col='row_id')
# Disabled: alternative input file kept for reference.
#data_fake_nas = pd.read_csv('/content/drive/MyDrive/Kaggle/June_2022_na_imputation/src/data_fake_nas.csv', index_col='row_id')
# Load the submission template, indexed by its `row-col` column; a later cell
# splits each index entry on '-' into a row id and a column name.
sample = pd.read_csv('drive/MyDrive/Kaggle/Imputation_062022/src/data/sample_submission.csv', index_col='row-col')

_request non-retriable exception: Anonymous caller does not have storage.objects.get access to the Google Cloud Storage object., 401
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/gcsfs/retry.py", line 115, in retry_request
    return await func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/gcsfs/core.py", line 384, in _request
    validate_response(status, contents, path, args)
  File "/usr/local/lib/python3.7/dist-packages/gcsfs/retry.py", line 102, in validate_response
    raise HttpError(error)
gcsfs.retry.HttpError: Anonymous caller does not have storage.objects.get access to the Google Cloud Storage object., 401


HttpError: ignored

In [None]:
# `set_seed` and `get_lists` are not defined in this notebook; they presumably
# come from the star-imported project modules — TODO confirm.
# Presumably seeds the random number generators for reproducibility — verify
# which libraries it covers.
set_seed(seed=69)
# Unpacks six values derived from `data`. Judging only by the names, these look
# like column-name groupings (all columns, per-family lists, columns with
# missing values) — confirm against the helper's definition.
col_list, F1, F2, F3, F4, missing_cols = get_lists(data)

In [None]:
def get_missings(df):
    """Summarise the share of missing values for every column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that contains at least one null, with the
        columns ``'Column'`` (the column label) and ``'Missing(Percent)'``
        (nulls as a percentage of the column length), sorted by
        ``'Missing(Percent)'`` in descending order. When ``df`` has no missing
        values the result is empty but still carries both columns.
    """
    labels, values = [], []
    for column in df.columns:
        n_null = df[column].isnull().sum()
        if n_null:
            labels.append(column)
            values.append((n_null / len(df[column])) * 100)

    # Build and sort the summary once, after the scan. Doing it here (rather
    # than inside the loop) also guarantees a defined result when no column
    # has missing values.
    missings = pd.DataFrame({'Column': labels, 'Missing(Percent)': values})
    return missings.sort_values(by='Missing(Percent)', ascending=False)

# Names of the columns that contain missing values, most-missing first.
features_with_Nan = get_missings(data)['Column'].tolist()

In [None]:
# Per-row count of missing cells, stored as an extra column of `data`.
data['n_missing'] = data.isna().sum(axis='columns')

In [None]:
def high_correlated(col, n_features=29, df=None):
    """Return the columns most strongly correlated with ``col``.

    Parameters
    ----------
    col : str
        Label of the reference column.
    n_features : int, default 29
        Maximum number of column labels to return.
    df : pandas.DataFrame, optional
        Frame to compute correlations on. Defaults to the notebook-level
        ``data`` frame, so existing ``high_correlated(col)`` calls keep working.

    Returns
    -------
    list
        Up to ``n_features`` column labels ordered by decreasing absolute
        correlation with ``col``; ``col`` itself is never included.
    """
    frame = data if df is None else df
    abs_corr = frame.corrwith(frame[col]).abs()
    # Drop the reference column by label instead of assuming its
    # self-correlation always sorts into the first position.
    abs_corr = abs_corr.drop(labels=col, errors='ignore')
    return abs_corr.sort_values(ascending=False).head(n_features).index.to_list()

In [None]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred``.

    Built from Keras backend ops: the squared differences are averaged over
    all elements and the square root of that mean is returned.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

In [None]:
# Impute every column that has missing values: for each such column, train the
# network on the rows where the column is observed (5-fold CV), predict the
# rows where it is missing with each fold's model, and fill the gaps with the
# mean of those fold predictions.
imputed_columns = {}   # column name -> fully imputed Series
loss_per_feature = {}  # column name -> mean last-epoch validation loss over folds

# Iterate over the columns that contain NaN values
for col in tqdm(features_with_Nan):

    predictions = []
    validation_loss = []

    # Mask selecting the rows where the current column is observed
    not_null = ~data[col].isnull()

    # Train dataset (rows where the current column is observed)
    train = data.loc[not_null]

    # Test dataset (rows where the current column is missing)
    test = data.loc[~not_null]

    # Feature selection
    # NOTE(review): `selected_features` is only used to build `dfsplit`, which
    # is passed to KFold.split; the model inputs below are built from `train`
    # with only `col` dropped, i.e. from all remaining columns. Confirm whether
    # the selection was meant to restrict X_train / X_val / X_test.
    selected_features = [n for n in high_correlated(col) if n not in ['row_id', col]]

    # Cross validation type
    kf = KFold(n_splits=5)

    dfsplit = train[selected_features]

    # The rows to impute are the same for every fold; only their fill values
    # and scaling depend on the fold, so build the raw frame once.
    X_test_raw = test.drop(col, axis=1)

    # Splitting data to train and validation
    for train_idx, val_idx in kf.split(dfsplit):

        X_train = train.iloc[train_idx].drop(col, axis=1)
        X_val = train.iloc[val_idx].drop(col, axis=1)
        y_train, y_val = train.iloc[train_idx][col], train.iloc[val_idx][col]

        # Fill remaining NaNs with the *training-fold* medians. Validation and
        # test rows reuse the training statistics so that all three sets go
        # through the same preprocessing, matching how the scaler is fitted.
        train_medians = X_train.median()
        X_train = X_train.fillna(train_medians)
        X_val = X_val.fillna(train_medians)
        X_test = X_test_raw.fillna(train_medians)

        # Standard scaling, fitted on the training fold only
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)

        # Running NN model
        # `nn_model` is not defined in this notebook. Its result is indexed as
        # [0] -> object with `.predict` and [1] -> object with `.history`,
        # presumably (fitted Keras model, History) — confirm.
        with strategy.scope():
            model = nn_model(X_train, y_train, X_val, y_val, X_test)
        fitted_model, history = model[0], model[1]

        # Predict the missing rows with this fold's model
        predictions.append(fitted_model.predict(X_test))

        # Save the last recorded validation loss for the current fold
        validation_loss.append(history.history["val_loss"][-1])

    # Average the per-fold predictions for the missing rows
    mean_values = np.array(predictions).mean(axis=0)

    # Save mean loss for the current feature
    loss_per_feature[col] = np.mean(validation_loss)

    # Copy the column and fill its missing entries with the averaged predictions
    imputed_feature = data[col].copy()
    imputed_feature.loc[~not_null] = mean_values.ravel()

    # Collect the imputed column; the frame is assembled once after the loop
    # instead of being re-concatenated on every iteration.
    imputed_columns[col] = imputed_feature

# Assemble the imputed columns (same order as `features_with_Nan`) and write
# them back into `data`.
data_imputed = pd.DataFrame(imputed_columns, index=data.index)
if imputed_columns:
    data[features_with_Nan] = data_imputed[features_with_Nan]

  0%|          | 0/55 [00:00<?, ?it/s]

(785204, 80)
Epoch 1/30

  0%|          | 0/55 [01:03<?, ?it/s]


KeyboardInterrupt: ignored

In [None]:
# One row per imputed feature with its mean validation loss, worst first.
loss_df = (
    pd.DataFrame(loss_per_feature, index=['Validation_RMSE'])
    .transpose()
    .sort_values(by='Validation_RMSE', ascending=False)
)

loss_df

Unnamed: 0,Validation_RMSE
F_1_8,1.095792
F_3_4,1.08637
F_1_0,1.085814
F_3_14,1.082555
F_3_6,1.082257
F_3_3,1.082252
F_3_10,1.081002
F_3_16,1.080721
F_3_0,1.080389
F_3_13,1.079474


In [None]:
# Fill the submission template. Each index entry has the form
# "<row_id>-<column>": the part before the first '-' is the integer row label
# in `data`, the second part is the column name.
id_parts = [key.split('-') for key in sample.index]
row_ids = np.array([int(parts[0]) for parts in id_parts])
col_names = np.array([parts[1] for parts in id_parts])

# Look the values up one feature column at a time instead of issuing one
# `.loc` write per submission row. A row id or column missing from `data`
# raises KeyError.
values = np.empty(len(sample), dtype=float)
for name in np.unique(col_names):
    in_col = col_names == name
    values[in_col] = data.loc[row_ids[in_col], name].to_numpy()
sample['value'] = values

# NOTE(review): the output folder (June_2022_na_imputation) differs from the
# folder the inputs were read from (Imputation_062022) — confirm it is intended.
sample.to_csv("drive/MyDrive/Kaggle/June_2022_na_imputation/src/NN_submission_GPU.csv")
sample

Unnamed: 0_level_0,value
row-col,Unnamed: 1_level_1
0-F_1_14,-0.000210
0-F_3_23,0.008720
1-F_3_24,0.014534
2-F_1_2,-0.002481
2-F_4_2,0.628245
...,...
999993-F_4_2,-0.110552
999994-F_3_10,-0.012313
999994-F_4_9,-0.127469
999997-F_3_14,-0.023711


In [None]:
# Persist the per-feature validation losses next to the submission file.
loss_df.to_csv(path_or_buf='drive/MyDrive/Kaggle/June_2022_na_imputation/src/val_loss_per_feat_NN_GPU.csv')