In [1]:
import numpy as np 
import pandas as pd 

import lightgbm as lgm
from math import sqrt 

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from catboost import CatBoostRegressor,CatBoostClassifier



In [2]:
df1 = pd.read_csv('/kaggle/input/playground-series-s3e15/data.csv')
df2 = pd.read_csv('/kaggle/input/predicting-heat-flux/Data_CHF_Zhao_2020_ATE.csv')
df_sub = pd.read_csv('/kaggle/input/playground-series-s3e15/sample_submission.csv')
df = pd.concat([df1, df2])

results_df = pd.DataFrame(columns= ['num_imputer', 'cat_imputer', 'score_5kf'])

In [3]:
df.isna().sum() / len(df)

id                     0.000000
author                 0.149930
geometry               0.164135
pressure [MPa]         0.132860
mass_flux [kg/m2-s]    0.142977
x_e_out [-]            0.310812
D_e [mm]               0.163777
D_h [mm]               0.136948
length [mm]            0.142022
chf_exp [MW/m2]        0.000000
dtype: float64

There are a lof of missing values in each column (except chf_exp [MW/m2] ), ranging between 13-17% (excluding target variable)

Basic train/test split base on missing target values

In [4]:
df_train = df[~df['x_e_out [-]'].isna()]
y = df_train['x_e_out [-]']

df_train = df_train.drop(columns=['x_e_out [-]', 'id'])
df_test= df[df['x_e_out [-]'].isna()]
print(len(df_train), len(df_test))

23094 10415


Splitting numeriacal and categorical features

In [5]:
num_cols = df_train.select_dtypes(include='float64').columns.tolist()
cat_cols = df_train.select_dtypes(include='object').columns.tolist()
print(num_cols)
print(cat_cols)

['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]']
['author', 'geometry']


# Median / Mode / Mean imputations
The simplest method for imputation is filling missing values with column mean, median or mode.

In [6]:
def create_pipeline(strategy = None, 
                    numeric_transformer =True, 
                    categorical_transformer=True):
    
    if numeric_transformer == True:
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=strategy)),
            ])
    else:
        numeric_transformer = Pipeline(steps=[
            ('Scaler', StandardScaler())])
    
    if categorical_transformer == True:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ohencoder', OneHotEncoder(sparse_output=False))])
    else:
        categorical_transformer = Pipeline(steps=[
            ('ohencoder', OneHotEncoder(sparse_output=False))])
        
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)])
    return preprocessor

Validation is based on trainging simple  LGBMRegressor without any hyperparameters tunning on 5kfold split.

In [7]:
def train(preprocessor):
    losses = []
    kf = KFold(n_splits=5,shuffle=True, random_state=123)
    for fold, (train_idx, val_idx) in enumerate(kf.split(df_train, y)):
        X_train, y_train = df_train.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = df_train.iloc[val_idx], y.iloc[val_idx]
            
        model = lgm.LGBMRegressor()
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
        clf.fit(X_train, y_train)
        preds = clf.predict(X_val)
        loss = sqrt(mean_squared_error(y_val, preds))
        losses.append(loss)
    avg_loss = sum(losses) / len(losses)
    return clf, avg_loss

def train_kfold(preprocessor): 
    model, avg_loss = train(preprocessor)
    print(f'average loss across 5 folds: {avg_loss:.5f}')
    return avg_loss

## None 

In [8]:
preprocessor = create_pipeline(numeric_transformer=None)
tmp = {'num_imputer': 'None', 'cat_imputer' : 'most_frequent', 
       'score_5kf' : train_kfold(preprocessor)}
results_df = pd.concat([results_df, pd.DataFrame(tmp, index=[0])])

average loss across 5 folds: 0.07316


### Mean

In [9]:
strategy = 'mean'
preprocessor = create_pipeline(strategy='mean')
tmp = {'num_imputer': strategy, 'cat_imputer' : 'most_frequent', 
       'score_5kf' : train_kfold(preprocessor)}
results_df = pd.concat([results_df, pd.DataFrame(tmp, index=[0])])

average loss across 5 folds: 0.07314


### Median

In [10]:
strategy = 'median'
preprocessor = create_pipeline(strategy=strategy)
tmp = {'num_imputer': strategy, 'cat_imputer' : 'most_frequent', 
       'score_5kf' : train_kfold(preprocessor)}
results_df = pd.concat([results_df, pd.DataFrame(tmp, index=[0])])

average loss across 5 folds: 0.07315


### Mode

In [11]:
strategy = 'most_frequent'
preprocessor = create_pipeline(strategy=strategy)
tmp = {'num_imputer': strategy, 'cat_imputer' : 'most_frequent', 
       'score_5kf' : train_kfold(preprocessor)}
results_df = pd.concat([results_df, pd.DataFrame(tmp, index=[0])])

average loss across 5 folds: 0.07326


In [12]:
results_df.head()

Unnamed: 0,num_imputer,cat_imputer,score_5kf
0,,most_frequent,0.073162
0,mean,most_frequent,0.073137
0,median,most_frequent,0.073147
0,most_frequent,most_frequent,0.073264


# IterativeImputer
Iterative Imputer is a technique used for imputing missing values in a dataset by modeling each feature with missing values as a function of other features.

In [13]:
preprocessor= create_pipeline(
                              numeric_transformer =  Pipeline(steps=[('iterative', 
                                                        IterativeImputer())]),
                             )
tmp = {'num_imputer': 'iterative', 'cat_imputer' : 'most_frequent', 
       'score_5kf' : train_kfold(preprocessor)}
results_df = pd.concat([results_df, pd.DataFrame(tmp, index=[0])])

average loss across 5 folds: 0.07316


In [14]:
results_df.head()

Unnamed: 0,num_imputer,cat_imputer,score_5kf
0,,most_frequent,0.073162
0,mean,most_frequent,0.073137
0,median,most_frequent,0.073147
0,most_frequent,most_frequent,0.073264
0,iterative,most_frequent,0.073162


# Nearest neighbors imputation
Strategy used to impute missing values in a dataset by finding the most similar instances to the instance with missing values and using their values to fill in the missing ones.

In [15]:
def create_pipeline_nn(n_neigh, num_imputer =True, cat_imputer=True):
    if num_imputer == True:
        numeric_transformer = Pipeline(steps=[
            ('imputer', KNNImputer(n_neighbors=n_neigh, weights="uniform")),
            ('Scaler', StandardScaler())])
     
    if cat_imputer == True:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ohencoder', OneHotEncoder(sparse_output=False))])
    else:
        categorical_transformer = Pipeline(steps=[('ohencoder', OneHotEncoder(sparse_output=False))])
        
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)])
    return preprocessor

In [16]:
n_neighs = [5, 10, 15,20, 30]
for n_neigh in n_neighs:
    preprocessor = create_pipeline_nn(n_neigh=n_neigh)
    tmp = {'num_imputer': 'Nearest neighbors_' + str(n_neigh), 'cat_imputer' : 'most_frequent', 
           'score_5kf' : train_kfold(preprocessor)}
    results_df = pd.concat([results_df, pd.DataFrame(tmp, index=[0])])

average loss across 5 folds: 0.07322
average loss across 5 folds: 0.07327
average loss across 5 folds: 0.07314
average loss across 5 folds: 0.07308
average loss across 5 folds: 0.07317


In [17]:
results_df 

Unnamed: 0,num_imputer,cat_imputer,score_5kf
0,,most_frequent,0.073162
0,mean,most_frequent,0.073137
0,median,most_frequent,0.073147
0,most_frequent,most_frequent,0.073264
0,iterative,most_frequent,0.073162
0,Nearest neighbors_5,most_frequent,0.073216
0,Nearest neighbors_10,most_frequent,0.073266
0,Nearest neighbors_15,most_frequent,0.073136
0,Nearest neighbors_20,most_frequent,0.073083
0,Nearest neighbors_30,most_frequent,0.073175


# Filling NaNs using correlated features

Let's try an idea provided in this discussion [link](https://www.kaggle.com/competitions/playground-series-s3e15/discussion/411353#2267177) 
Described : "Also, there is an easy way to fill nan values in D_e and D_h features! Since they are high-correlated features - 0.81, and, technically, they are almost the same (since there are a lot of sample where D_e value is equal to D_h value), we can just fill, for example, D_e nan value into the 'D_h' value in the current sample, and vice versa!"

In [18]:
df_train.loc[df_train['D_e [mm]'].isna(), 'D_e [mm]'] = df_train.loc[df_train['D_e [mm]'].isna(), 'D_h [mm]']
df_train.loc[df_train['D_h [mm]'].isna(), 'D_h [mm]'] = df_train.loc[df_train['D_h [mm]'].isna(), 'D_e [mm]']

In [19]:
preprocessor= create_pipeline(numeric_transformer=None
                             )
tmp = {'num_imputer': 'None', 'cat_imputer' : 'most_frequent', 
       'score_5kf' : train_kfold(preprocessor)}
results_df = pd.concat([results_df, pd.DataFrame(tmp, index=[1])])

average loss across 5 folds: 0.07291


Even without imputation on other columns, it performs much better than any other methods used before!

Let's combine this with best working filling method, which is Nearest neighbors using n_neigh=20

In [20]:
n_neigh = 20 
preprocessor = create_pipeline_nn(n_neigh = n_neigh)
tmp = {'num_imputer': 'Nearest neighbors_' + str(n_neigh), 'cat_imputer' : 'most_frequent', 
       'score_5kf' : train_kfold(preprocessor)}
results_df = pd.concat([results_df, pd.DataFrame(tmp, index=[1])])

average loss across 5 folds: 0.07279


In [21]:
results_df

Unnamed: 0,num_imputer,cat_imputer,score_5kf
0,,most_frequent,0.073162
0,mean,most_frequent,0.073137
0,median,most_frequent,0.073147
0,most_frequent,most_frequent,0.073264
0,iterative,most_frequent,0.073162
0,Nearest neighbors_5,most_frequent,0.073216
0,Nearest neighbors_10,most_frequent,0.073266
0,Nearest neighbors_15,most_frequent,0.073136
0,Nearest neighbors_20,most_frequent,0.073083
0,Nearest neighbors_30,most_frequent,0.073175


# Filling NaNs using Regressor

Code implementation based on [link](https://www.kaggle.com/competitions/playground-series-s3e15/discussion/410645)

In [22]:
#filling missin values filled in a section above
df_train = df[~df['x_e_out [-]'].isna()]
y = df_train['x_e_out [-]']

df_train = df_train.drop(columns=['x_e_out [-]', 'id'])

In [23]:
for col in num_cols:
    train_data = df_train[df_train[col].notna()]
    test_data = df_train[df_train[col].isna()]
    if len(test_data) > 0:
        X_train = train_data[num_cols].drop(columns=col)
        y_train = train_data[col]
        model = lgm.LGBMRegressor()
        model.fit(np.array(X_train), np.array(y_train))

        X_test = test_data[num_cols].drop(columns=col)
        y_pred = model.predict(X_test)

        df_train.loc[df_train[col].isna(), col] = y_pred

In [24]:
preprocessor= create_pipeline(numeric_transformer=None)
tmp = {'num_imputer': 'None', 'cat_imputer' : 'most_frequent', 
       'score_5kf' : train_kfold(preprocessor)}
results_df = pd.concat([results_df, pd.DataFrame(tmp, index=[2])])

average loss across 5 folds: 0.07280


In [25]:
results_df

Unnamed: 0,num_imputer,cat_imputer,score_5kf
0,,most_frequent,0.073162
0,mean,most_frequent,0.073137
0,median,most_frequent,0.073147
0,most_frequent,most_frequent,0.073264
0,iterative,most_frequent,0.073162
0,Nearest neighbors_5,most_frequent,0.073216
0,Nearest neighbors_10,most_frequent,0.073266
0,Nearest neighbors_15,most_frequent,0.073136
0,Nearest neighbors_20,most_frequent,0.073083
0,Nearest neighbors_30,most_frequent,0.073175


* it looks like filling missing values with some regressor works the best for local cv.
* preprocessing with or without 'most_frequent' as a categorical imputer returns very simillar results
* it looks like Nearest neighbors imputation works the best from standard imputation techniques(filling with _, iterativeimputer).
index value:

0. No NaNs were filled before imputation.
1. Partialy filled.
2. Allmost all filled (in this case, all numeric cols).

# Subsmissin without params tunning

In [26]:
preds = np.zeros(len(df_test))
kf = KFold(n_splits=5,shuffle=True, random_state=123)
for fold, (train_idx, val_idx) in enumerate(kf.split(df_train, y)):
    X_train, y_train = df_train.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = df_train.iloc[val_idx], y.iloc[val_idx]
    
    preprocessor = create_pipeline_nn(n_neigh = 20)
    model = lgm.LGBMRegressor()
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    clf.fit(X_train, y_train)
    predictions = clf.predict(df_test)
    preds += ((predictions) / 5)

In [27]:
df_sub['x_e_out [-]'] = preds
df_sub.to_csv('submission.csv', index=False)
df_sub.head()

Unnamed: 0,id,x_e_out [-]
0,4,0.002371
1,7,-0.092375
2,10,-0.044921
3,12,-0.037171
4,23,0.040613


* submission score using best preproccesing method: : **0.075667**
* submission withour preprocessing method: 0.076528