# Tabular Playground Series - Aug 2022 - Top 2% - 24/1888

In [None]:
# Install feature_engine, which the import cell below needs for WoEEncoder.
!pip install feature_engine
# Clone kuma_utils from GitHub; it is used from the checkout rather than installed,
# so the checkout directory is appended to sys.path to make it importable.
!git clone https://github.com/analokmaus/kuma_utils.git
import sys; sys.path.append("kuma_utils/")

In [None]:
import os
import sys
import joblib
import numpy as np
import pandas as pd
import gc; gc.enable()
from lightgbm import LGBMClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
from feature_engine.encoding import WoEEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from kuma_utils.preprocessing.imputer import LGBMImputer
from sklearn.linear_model import LogisticRegression, HuberRegressor
import warnings; warnings.filterwarnings("ignore")

**Import data**

In [None]:
# Read the three competition files in one pass: training rows, test rows and
# the sample submission.
df_train, df_test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-aug-2022/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2022/test.csv",
        "/kaggle/input/tabular-playground-series-aug-2022/sample_submission.csv",
    )
)

# Keep the label and the product codes aside, then remove the label from the
# training frame so that train and test share the same columns.
groups = df_train['product_code']
target = df_train['failure']
df_train = df_train.drop(columns='failure')

**Preprocessing (with thanks to the other competitors whose ideas are reused here)**

In [None]:
def preprocessing(df_train, df_test):
    """Impute missing measurements and build the feature columns used by the models.

    The two frames are stacked so that every statistic below is computed over
    train and test rows together. The steps are:

    1. Add ``m3_missing`` / ``m5_missing`` indicators and
       ``area = attribute_2 * attribute_3``.
    2. For ``measurement_17`` (fixed predictor lists per product code) and for
       the ten ``measurement_3`` .. ``measurement_16`` columns with the largest
       summed absolute correlation, fit one ``HuberRegressor`` per product
       code and fill the missing values of rows whose predictors are all
       present.
    3. Fill whatever is still missing in ``loading`` and the ``measurement_*``
       columns with a per-product-code ``KNNImputer(n_neighbors=3)``.
    4. Add ``measurement_avg``, the row mean of ``measurement_3`` ..
       ``measurement_16``.
    5. Split the rows back into train and test and weight-of-evidence encode
       ``attribute_0``.

    Note that the encoder is fitted against the module-level ``target`` series
    (it is not a parameter of this function), and that the ranking table is
    shown with the notebook's ``display``.

    Args:
        df_train: Training rows without the ``failure`` column.
        df_test: Test rows with the same columns as ``df_train``.

    Returns:
        A ``(df_train, df_test, features)`` tuple: the processed frames and
        the list of column names the models are trained on.
    """
    n_train = df_train.shape[0]
    data = pd.concat([df_train, df_test])

    # Missing-value indicators and the product of attribute_2 and attribute_3.
    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']

    # measurement_17 is regressed on a fixed predictor list per product code.
    full_fill_dict = {
        'measurement_17': {
            'A': ['measurement_5', 'measurement_6', 'measurement_8'],
            'B': ['measurement_4', 'measurement_5', 'measurement_7'],
            'C': ['measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
            'D': ['measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],
            'E': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8'],
            'F': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_7'],
            'G': ['measurement_4', 'measurement_6', 'measurement_8', 'measurement_9'],
            'H': ['measurement_4', 'measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
            'I': ['measurement_3', 'measurement_7', 'measurement_8'],
        },
    }

    # Columns left out of the correlation analysis.
    # NOTE(review): 'area' is not in this list, so it takes part in the
    # correlation ranking below -- confirm that this is intended.
    col = [name for name in df_test.columns if 'measurement' not in name] + ['loading', 'm3_missing', 'm5_missing']

    # The correlation matrix does not depend on the column being scored, so
    # it is computed once instead of once per candidate column.
    abs_corr = data.drop(col, axis=1).corr().abs()
    candidates = [f'measurement_{x}' for x in range(3, 17)]
    # Position 0 of each sorted series is the column's correlation with
    # itself; the score is the sum of the next three entries.
    totals = [
        np.round(np.sum(abs_corr[name].sort_values(ascending=False).iloc[1:4]), 3)
        for name in candidates
    ]
    c = pd.DataFrame({'Selected columns': candidates, 'correlation total': totals})
    c = c.sort_values(by='correlation total', ascending=False).reset_index(drop=True)
    display(c.head(10))

    # One correlation matrix per product code, shared by all selected columns.
    codes = data.product_code.unique()
    code_masks = {code: data.product_code == code for code in codes}
    abs_corr_by_code = {
        code: data[code_masks[code]].drop(col, axis=1).corr().abs()
        for code in codes
    }

    # For each of the ten best-ranked columns, use the four most correlated
    # columns within each product code as regression predictors.
    for measurement_col in c['Selected columns'].head(10):
        full_fill_dict[measurement_col] = {
            code: abs_corr_by_code[code][measurement_col]
            .sort_values(ascending=False)
            .iloc[1:5]
            .index.tolist()
            for code in codes
        }

    feature = [f for f in data.columns if f.startswith('measurement') or f == 'loading']

    for code in codes:
        code_mask = code_masks[code]
        for measurement_col, predictors_by_code in full_fill_dict.items():
            column = predictors_by_code[code]
            # Rows of this product code whose value is missing while every
            # predictor is present. The masks are rebuilt on each pass because
            # earlier fills change which values are still missing.
            predictors_present = data[column].isnull().sum(axis=1) == 0
            value_missing = data[measurement_col].isnull()
            fill_mask = code_mask & predictors_present & value_missing
            if not fill_mask.any():
                # Nothing to fill: skip the fit, and avoid calling predict()
                # on an empty frame, which scikit-learn rejects.
                continue

            tmp_train = data.loc[code_mask, column + [measurement_col]].dropna(how='any')
            model = HuberRegressor(epsilon=1.9)
            model.fit(tmp_train[column], tmp_train[measurement_col])
            data.loc[fill_mask, measurement_col] = model.predict(data.loc[fill_mask, column])

        # Rows that could not be filled by regression fall through to KNN.
        model1 = KNNImputer(n_neighbors=3)
        data.loc[code_mask, feature] = model1.fit_transform(data.loc[code_mask, feature])

    data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)

    # Undo the concatenation: the first n_train rows are the training rows.
    df_train = data.iloc[:n_train, :]
    df_test = data.iloc[n_train:, :]

    # `target` is the module-level label series defined when the data is loaded.
    woe_encoder = WoEEncoder(variables=['attribute_0'])
    woe_encoder.fit(df_train, target)
    df_train = woe_encoder.transform(df_train)
    df_test = woe_encoder.transform(df_test)

    features = ['loading', 'attribute_0', 'measurement_17', 'measurement_0', 'measurement_1', 'measurement_2', 'area', 'm3_missing', 'm5_missing', 'measurement_avg']

    return df_train, df_test, features

def scale(train_data, val_data, test_data, feats):
    """Standardize the ``feats`` columns of three frames with one scaler.

    The ``StandardScaler`` is fitted on ``train_data[feats]`` only and then
    applied to all three frames. The inputs are left untouched; copies with
    the scaled columns written back are returned.

    Args:
        train_data: Frame the scaler is fitted on.
        val_data: Validation frame, transformed with the fitted scaler.
        test_data: Test frame, transformed with the fitted scaler.
        feats: Names of the columns to standardize.

    Returns:
        A ``(new_train, new_val, new_test)`` tuple of scaled copies.
    """
    scaler = StandardScaler().fit(train_data[feats])

    scaled_copies = []
    for frame in (train_data, val_data, test_data):
        scaled = frame.copy()
        scaled[feats] = scaler.transform(frame[feats])
        scaled_copies.append(scaled)

    new_train, new_val, new_test = scaled_copies
    return new_train, new_val, new_test

# Run the preprocessing on both frames, then put the label column back on the
# processed training frame.
df_train, df_test, features = preprocessing(df_train=df_train, df_test=df_test)
df_train['failure'] = target

**Model**

In [None]:
from sklearn.linear_model import LinearRegression, BayesianRidge, Ridge
import tensorflow as tf

# Stratified K-fold cross-validation of a LogisticRegression / LightGBM blend.
# Produces out-of-fold predictions (`oof`), per-fold AUCs (`result`) and an
# AUC-weighted sum of test predictions (`test_preds`).
N_SPLITS = 5

# LightGBM settings are identical for every fold, so they are defined once.
lgb_params = {
    'seed': 0,
    'n_jobs': -1,
    'lambda_l2': 2,
    'metric': "auc",
    'max_depth': -1,
    'num_leaves': 100,
    'boosting': 'gbdt',
    'bagging_freq': 20,
    'learning_rate': 0.01,
    'objective': 'binary',
    'min_data_in_leaf': 50,
    'num_boost_round': 500,
    'feature_fraction': 0.90,
    'bagging_fraction': 0.90,
}

train_features = df_train[features]
train_labels = df_train["failure"]
test_features = df_test[features]

oof = np.zeros(len(df_train))

result = []
importance_list = []

test_preds = np.zeros(len(df_test))
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
for fold, (train_idx, val_idx) in enumerate(cv.split(df_train, train_labels)):
    # split() yields positional row numbers, so select rows with .iloc; this
    # stays correct even if the frame's index is not 0..n-1.
    x_train, y_train = train_features.iloc[train_idx], train_labels.iloc[train_idx]
    x_val, y_val = train_features.iloc[val_idx], train_labels.iloc[val_idx]

    # Scaling statistics come from the training part of the fold only.
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_val_scaled = scaler.transform(x_val)
    test_scaled = scaler.transform(test_features)

    model = LogisticRegression(max_iter=500, C=0.0001, penalty="l2", solver="newton-cg")
    model1 = LGBMClassifier(**lgb_params)

    # NOTE(review): the logistic regression is fitted on the unscaled
    # features and LightGBM on the standardized ones. This is kept exactly as
    # it was because changing it changes the model -- confirm it is intended.
    model.fit(x_train, y_train)
    model1.fit(x_train_scaled, y_train)

    # Blend: 90% logistic regression, 10% LightGBM.
    y_pred0 = model.predict_proba(x_val)[:, 1]
    y_pred1 = model1.predict_proba(x_val_scaled)[:, 1]
    y_pred = y_pred0*0.90+y_pred1*0.10

    r = roc_auc_score(y_val, y_pred)

    importance_list.append(model.coef_.ravel())
    result.append(r)

    # Each fold's test prediction is weighted by that fold's validation AUC,
    # so test_preds is a weighted sum; divide by sum(result) to average it.
    pred = model.predict_proba(test_features)[:, 1]*r
    pred1 = model1.predict_proba(test_scaled)[:, 1]*r

    test_preds += pred*0.90+pred1*0.10

    oof[val_idx] = y_pred
    print(f"Val score: {r:.7f}")

print(f"Val score: {roc_auc_score(train_labels, oof):.7f}")
# Average over the folds actually run instead of a hard-coded count.
print("Mean", round(sum(result)/len(result),7) )


**Output**

In [None]:
# Build the submission: take the sample file as the template and overwrite
# its `failure` column with the accumulated test predictions divided by the
# sum of the per-fold scores, then write it without the index column.
output = (
    pd.read_csv('../input/tabular-playground-series-aug-2022/sample_submission.csv')
    .assign(failure=test_preds / sum(result))
)
output.to_csv('pred19.csv', index=False)

In [None]:
# Show the submitted `failure` column as the cell output for a quick visual check.
output["failure"]