In [1]:
!git clone https://github.com/analokmaus/kuma_utils.git

Cloning into 'kuma_utils'...
remote: Enumerating objects: 915, done.[K
remote: Counting objects: 100% (120/120), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 915 (delta 104), reused 102 (delta 96), pack-reused 795[K
Receiving objects: 100% (915/915), 679.99 KiB | 3.28 MiB/s, done.
Resolving deltas: 100% (592/592), done.


In [33]:
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import yaml

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, RobustScaler, PowerTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_validate, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, plot_confusion_matrix

sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

from drive.MyDrive.Kaggle_TPS_0822.src import functions
import warnings
warnings.filterwarnings("ignore")

In [7]:
CFG = yaml.load(open('drive/MyDrive/Kaggle_TPS_0822/src/config.YAML', 'rb'))

functions.set_seed(CFG['SEED'])

pd.set_option('display.max_rows', CFG['NROWS'])
pd.set_option('display.max_columns', CFG['NCOLS'])

train = pd.read_csv(CFG['DATAPATH'] + 'train.csv', index_col='id')
test = pd.read_csv(CFG['DATAPATH'] +'test.csv', index_col='id')
train = pd.concat([train])
test = pd.concat([test])

cat_feats = train.select_dtypes('object').columns
int_feats = train.select_dtypes('integer').columns
cont_feats = train.select_dtypes('float').columns
null_feats = [col for col in train.columns if train[col].isnull().sum() != 0]

# Missing Value Imputation
- For our first iteration of imputation, we will group based on product codes - this should give us better results than just imputing the mean since each product should be different

In [8]:
# selecting all data from each product code category for:
# train:
print(train['product_code'].unique())
df_a = train[train['product_code'] == 'A']
df_b = train[train['product_code'] == 'B']
df_c = train[train['product_code'] == 'C']
df_d = train[train['product_code'] == 'D']
df_e = train[train['product_code'] == 'E']

# test:
print(test['product_code'].unique())
df_f = test[test['product_code'] == 'F']
df_g = test[test['product_code'] == 'G']
df_h = test[test['product_code'] == 'H']
df_i = test[test['product_code'] == 'I']

product_code_dfs = [df_a, df_b, df_c, df_d, df_e, df_f, df_g, df_h, df_i]

['A' 'B' 'C' 'D' 'E']
['F' 'G' 'H' 'I']


In [9]:
LGBM_IMP = LGBMImputer(cat_features=cat_feats, n_iter=20)

# Train data
iter_a = LGBM_IMP.fit_transform(df_a[null_feats])
iter_b = LGBM_IMP.fit_transform(df_b[null_feats])
iter_c = LGBM_IMP.fit_transform(df_c[null_feats])
iter_d = LGBM_IMP.fit_transform(df_d[null_feats])
iter_e = LGBM_IMP.fit_transform(df_e[null_feats])

# Test data
iter_f = LGBM_IMP.fit_transform(df_f[null_feats])
iter_g = LGBM_IMP.fit_transform(df_g[null_feats])
iter_h = LGBM_IMP.fit_transform(df_h[null_feats])
iter_i = LGBM_IMP.fit_transform(df_i[null_feats])

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [10]:
no_nas = [col for col in train.columns if col not in null_feats and col!='failure']

train_imp = train[no_nas]
train_imp['failure'] = train['failure']
test_imp = test[no_nas]

train_imp_products = pd.concat([iter_a, iter_b, iter_c, iter_d, iter_e], axis=0)
train_imp = pd.concat([train_imp, train_imp_products], axis=1)

test_imp_products = pd.concat([iter_f, iter_g, iter_h, iter_i], axis=0)
test_imp = pd.concat([test_imp, test_imp_products], axis=1)

In [11]:
print(f'Missing Value totals\nTrain: {train_imp.isnull().sum().sum()} \nTest:{test_imp.isnull().sum().sum()}')

Missing Value totals
Train: 0 
Test:0


In [12]:
le = LabelEncoder()

tmp = pd.concat([train_imp, test_imp])
attribute_0 = le.fit_transform(tmp['attribute_0'])
attribute_1 = le.fit_transform(tmp['attribute_1'])

train_imp['attribute_0'] = attribute_0[:train_imp.shape[0]]
train_imp['attribute_1'] = attribute_1[:train_imp.shape[0]]

test_imp['attribute_0'] = attribute_0[train_imp.shape[0]:]
test_imp['attribute_1'] = attribute_1[train_imp.shape[0]:]

In [34]:
X_train = train_imp.drop(['product_code', 'failure'], axis=1)
X_test = test_imp.drop('product_code', axis=1)
y_train = train_imp['failure']

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True)

In [35]:
folds = 10



In [36]:
def score(X, y, model, cv):
     scoring=['roc_auc']
     scores =cross_validate(model, X, y, scoring=scoring, cv=cv, return_train_score=True)
     scores = pd.DataFrame(scores).T
     return scores.assign(
         mean=lambda x: x.mean(axis=1), 
         std= lambda x: x.std(axis=1))

In [37]:
lr = LogisticRegression(max_iter=500)
kf = KFold(n_splits=folds, shuffle=True)
scores=score(X_train, y_train, lr, kf)
scores

# lr.fit(X_train, y_train)
# lr.score(X_train, y_train)
# y_preds = lr.predict(X_val)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std
fit_time,1.576185,1.76982,1.952431,0.964127,1.997108,2.305975,2.305066,1.597449,1.897398,0.641127,1.700669,0.511635
score_time,0.014346,0.021551,0.011257,0.007167,0.007032,0.011975,0.010407,0.011915,0.006958,0.007783,0.011039,0.004264
test_roc_auc,0.592555,0.589492,0.588213,0.585533,0.582344,0.545271,0.589346,0.605263,0.601307,0.601979,0.58813,0.015997
train_roc_auc,0.592751,0.593139,0.593366,0.5934,0.594169,0.598221,0.593283,0.59084,0.592101,0.591672,0.593294,0.001883


In [38]:
lr.fit(X_train, y_train)
preds = lr.predict(X_test)

In [51]:
sub = pd.read_csv(CFG['SUBMISSIONS']+'sample_submission.csv', index_col='id')
sub['failure'] = preds
sub.to_csv(CFG['SUBMISSIONS']+'baseline.csv', index='id')