In [None]:
# Fetch the kuma_utils repository; its LGBMImputer is imported further down
# after "kuma_utils/" is appended to sys.path.
!git clone https://github.com/analokmaus/kuma_utils.git

Cloning into 'kuma_utils'...
remote: Enumerating objects: 915, done.[K
remote: Counting objects: 100% (120/120), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 915 (delta 104), reused 102 (delta 96), pack-reused 795[K
Receiving objects: 100% (915/915), 679.99 KiB | 4.20 MiB/s, done.
Resolving deltas: 100% (592/592), done.


In [None]:
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import yaml

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, RobustScaler, PowerTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_validate, RandomizedSearchCV, GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, plot_confusion_matrix

# Make the cloned kuma_utils checkout importable before pulling in its imputer.
sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

# Project helper module (set_seed, get_feats and get_products are used below).
# NOTE(review): the path suggests a mounted Google Drive - confirm the mount
# step runs before this cell.
from drive.MyDrive.Kaggle_TPS_0822.src import functions
import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Load the run configuration.
# yaml.safe_load is used instead of yaml.load(stream): the latter needs an
# explicit Loader argument on current PyYAML releases, and safe_load only
# builds plain Python objects. The context manager closes the file handle,
# which the bare open() call never did.
with open('drive/MyDrive/Kaggle_TPS_0822/src/config.YAML', 'rb') as config_file:
    CFG = yaml.safe_load(config_file)

# Seed through the project helper so later steps are repeatable.
# NOTE(review): which libraries set_seed covers is not visible here - confirm.
functions.set_seed(CFG['SEED'])

pd.set_option('display.max_rows', CFG['NROWS'])
pd.set_option('display.max_columns', CFG['NCOLS'])

# Both frames are indexed by the competition's `id` column.
# The former `pd.concat([train])` / `pd.concat([test])` lines were dropped:
# concatenating a single frame only produces an identical copy.
train = pd.read_csv(CFG['DATAPATH'] + 'train.csv', index_col='id')
test = pd.read_csv(CFG['DATAPATH'] + 'test.csv', index_col='id')

In [None]:
# Group the training columns via the project helper. Only cat_feats and
# null_feats are used later in this notebook.
# NOTE(review): judging by the names these are categorical / integer /
# continuous / has-nulls / no-nulls column lists - verify in functions.get_feats.
cat_feats, int_feats, cont_feats, null_feats, no_nulls = functions.get_feats(train)

- Since attribute 0 and 1 are the material, it would make sense that attribute 2 and 3 are some form of dimensions
- This competition is based on a hypothetical 'super soaker' which absorbs liquid, so these dimensions are likely volumetric. We will start with a simple area formula as a baseline.

In [None]:
# Replace attribute_2 and attribute_3 with their product in both frames.
# The frames are modified in place, so this cell cannot be re-run without
# reloading the data (the source columns are gone after the first run).
for frame in (train, test):
    frame['attribute_2*3'] = frame['attribute_2'] * frame['attribute_3']
    frame.drop(columns=['attribute_2', 'attribute_3'], inplace=True)

In [None]:
# Add a 0/1 "was missing" indicator column for every feature in null_feats,
# on both train and test, before the gaps are imputed.
for feat in null_feats:
    flag_col = f'na_{feat}'
    for frame in (train, test):
        frame[flag_col] = frame[feat].isna().astype(int)

In [None]:
# Label-encode the two material attributes. The encoder is fitted on train and
# test stacked together so both frames share one integer mapping per column.
n_train = train.shape[0]
combined = pd.concat([train, test])

le = LabelEncoder()
for col in ('attribute_0', 'attribute_1'):
    codes = le.fit_transform(combined[col])
    # The stacked frame lists train rows first, so slice the codes positionally.
    train[col] = codes[:n_train]
    test[col] = codes[n_train:]

In [None]:
# Split the data into one frame per product code. The first five frames are
# treated as train products and the last four as test products below.
dfa, dfb, dfc, dfd, dfe, dff, dfg, dfh, dfi = functions.get_products(train, test)

LGBM_IMP = LGBMImputer(cat_features=cat_feats, n_iter=20)

# Refit the same imputer on each product's null-bearing columns, in a..i order.
product_frames = [dfa, dfb, dfc, dfd, dfe, dff, dfg, dfh, dfi]
imputed_frames = [
    LGBM_IMP.fit_transform(product_df[null_feats])
    for product_df in product_frames
]

# Train data
iter_a, iter_b, iter_c, iter_d, iter_e = imputed_frames[:5]

# Test data
iter_f, iter_g, iter_h, iter_i = imputed_frames[5:]

['A' 'B' 'C' 'D' 'E']
['F' 'G' 'H' 'I']


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
# Stack the per-product imputed frames back together and write them over the
# original null-bearing columns.
# NOTE(review): the assignment aligns on the index, so this assumes the imputer
# output keeps the original `id` index - confirm.
train_parts = [iter_a, iter_b, iter_c, iter_d, iter_e]
test_parts = [iter_f, iter_g, iter_h, iter_i]

train_imp_products = pd.concat(train_parts)
test_imp_products = pd.concat(test_parts)

train[null_feats] = train_imp_products
test[null_feats] = test_imp_products

In [None]:
# Target first, then the design matrices: product_code is removed from both
# frames and the target is removed from the training features.
y_train = train['failure']
X_train = train.drop(columns=['product_code', 'failure'])
X_test = test.drop(columns='product_code')

In [None]:
def score(X, y, model, cv):
    """Cross-validate ``model`` on ROC AUC and tabulate the per-fold results.

    Parameters
    ----------
    X, y
        Features and target, passed straight through to ``cross_validate``.
    model
        Estimator to evaluate.
    cv
        Cross-validation splitter (anything ``cross_validate`` accepts as ``cv``).

    Returns
    -------
    pandas.DataFrame
        One row per entry returned by ``cross_validate`` (timings plus train and
        test ROC AUC), one column per fold, followed by ``mean`` and ``std``
        columns summarising the fold columns.
    """
    scoring = ['roc_auc']
    cv_results = cross_validate(model, X, y, scoring=scoring, cv=cv, return_train_score=True)
    per_fold = pd.DataFrame(cv_results).T
    # Compute both summaries from the fold columns only. Passing lambdas to
    # assign() evaluates `std` after `mean` has already been added, so the mean
    # column was counted as an extra observation and the reported std came out
    # too small.
    return per_fold.assign(
        mean=per_fold.mean(axis=1),
        std=per_fold.std(axis=1),
    )

In [None]:
lr = LogisticRegression(max_iter=1000)
# Pin the shuffle seed so the folds are the same on every run. Without
# random_state each KFold draws a different split, so scores from different
# feature sets are not compared on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=CFG['SEED'])
scores = score(X_train, y_train, lr, kf)
scores

Unnamed: 0,0,1,2,3,4,mean,std
fit_time,0.951551,0.43351,0.453807,0.653307,0.583854,0.615206,0.186883
score_time,0.00759,0.007605,0.007218,0.007135,0.007267,0.007363,0.000196
test_roc_auc,0.590159,0.58904,0.563255,0.595752,0.601322,0.587906,0.013081
train_roc_auc,0.592827,0.593125,0.599792,0.590957,0.590132,0.593367,0.003403


In [None]:
lr.fit(X_train, y_train)
# The model is evaluated on ROC AUC, which ranks rows by predicted score, so
# submit the probability of the positive class rather than hard labels from
# predict().
# NOTE(review): column 1 of predict_proba is lr.classes_[1]; this assumes
# `failure` is encoded 0/1 - confirm.
preds = lr.predict_proba(X_test)[:, 1]
sub = pd.read_csv(CFG['SUBMISSIONS'] + 'sample_submission.csv', index_col='id')
sub['failure'] = preds
# `index` is a boolean flag; the default already writes the `id` index with its
# name as the header, so the former index='id' argument was unnecessary.
sub.to_csv(CFG['SUBMISSIONS'] + 'na_markers.csv')

In [None]:
# Pair each fitted feature name with its logistic-regression coefficient
# (first and only row of coef_ in the binary case).
names, importances = lr.feature_names_in_, lr.coef_[0]
[(name, weight) for name, weight in zip(names, importances)]

[('loading', 0.0075746252855731),
 ('attribute_0', 0.05244459682722128),
 ('attribute_1', 0.011745617841518388),
 ('measurement_0', 0.0027097442498381155),
 ('measurement_1', -0.0043875112959342434),
 ('measurement_2', 0.015747382606103158),
 ('measurement_3', -0.011952269718129372),
 ('measurement_4', -0.052300662854688845),
 ('measurement_5', -0.023304139132790756),
 ('measurement_6', -0.019261219097602124),
 ('measurement_7', -0.0067579171297599255),
 ('measurement_8', -0.03072166210527576),
 ('measurement_9', -0.031047867724435456),
 ('measurement_10', -0.009347549109271193),
 ('measurement_11', -0.012991509656241903),
 ('measurement_12', 0.009044463140686673),
 ('measurement_13', -0.013231216327895983),
 ('measurement_14', 0.0017961905148009833),
 ('measurement_15', -0.005707186101166248),
 ('measurement_16', 0.0021846802645717972),
 ('measurement_17', 0.0010386305686152844),
 ('attribute_2*3', -0.0015239827290295174),
 ('na_loading', -0.05404670206301796),
 ('na_measurement_3', -

- The missing-value indicators for measurement 3 and 5 have comparatively large coefficients; let's see what happens if we keep just those two.

In [None]:
# Keep only the missing-value indicators whose names end in "_3" or "_5";
# every other column starting with "na" is removed from both frames.
drop_cols = [
    col
    for col in train.columns
    if col.startswith('na') and not (col.endswith('_3') or col.endswith('_5'))
]
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [None]:
lr = LogisticRegression(max_iter=1000)
# Pin the shuffle seed so this run is repeatable; an unseeded shuffled KFold
# draws a fresh split every time, which makes score differences of this size
# indistinguishable from fold noise.
kf = KFold(n_splits=5, shuffle=True, random_state=CFG['SEED'])
scores = score(train.drop(['product_code', 'failure'], axis=1), train['failure'], lr, kf)
scores

Unnamed: 0,0,1,2,3,4,mean,std
fit_time,0.982898,3.382465,0.406301,2.59477,0.691581,1.611603,1.165936
score_time,0.013354,0.007081,0.007369,0.007006,0.009898,0.008942,0.002452
test_roc_auc,0.589064,0.595693,0.591062,0.5876,0.579451,0.588574,0.005316
train_roc_auc,0.593296,0.592391,0.592644,0.59459,0.595552,0.593695,0.001201


- Slightly better than without the two engineered features; we could see a bigger boost after more feature selection.

In [None]:
lr.fit(train.drop(['product_code', 'failure'], axis=1), train['failure'])
# The model is evaluated on ROC AUC, so submit positive-class probabilities
# rather than hard labels from predict().
# NOTE(review): column 1 of predict_proba is lr.classes_[1]; this assumes
# `failure` is encoded 0/1 - confirm.
preds = lr.predict_proba(test.drop('product_code', axis=1))[:, 1]
# `sub` is the sample-submission frame loaded in the earlier submission cell.
sub['failure'] = preds
sub.to_csv(CFG['SUBMISSIONS'] + 'with_missing_measure_3+5.csv')