In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.metrics import roc_auc_score, accuracy_score

Load Model

In [10]:
import pickle

# Deserialize the previously trained model from 'model.pickle' in the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
with open('model.pickle', mode='rb') as model_file:
    model = pickle.load(model_file)

Load Data

In [11]:
# Read the train and test CSV files from the current working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [12]:
# Stack the train and test rows into a single frame so the preprocessing below is
# applied to both at once. Each frame keeps its own row labels (ignore_index is
# not set), so the combined index is not guaranteed to be unique.
data = pd.concat([df_train, df_test])

# Record which rows had measurement_3 / measurement_5 missing as 0/1 int8 flags.
data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)

# Replace 'loading' with log(1 + loading).
data['loading'] = np.log1p(data['loading'])

Feature Extraction

In [13]:
# Columns of df_test named 'loading' or starting with 'measurement', in column order.
feature = [col for col in df_test.columns if col == 'loading' or col.startswith('measurement')]

# Per product code: the numbers of the measurement_<n> columns that fill_dict
# should list for that code.
_fill_measurement_ids = {
    'A': (5, 6, 8),
    'B': (4, 5, 7),
    'C': (5, 7, 8, 9),
    'D': (5, 6, 7, 8),
    'E': (4, 5, 6, 8),
    'F': (4, 5, 6, 7),
    'G': (4, 6, 8, 9),
    'H': (4, 5, 7, 8, 9),
    'I': (3, 7, 8),
}

# Maps product code -> list of measurement column names.
fill_dict = {
    product: ['measurement_{}'.format(number) for number in numbers]
    for product, numbers in _fill_measurement_ids.items()
}

Preprocessing

In [14]:
# Fill missing values, one product code at a time:
#   1. measurement_17 is regressed (HuberRegressor) on the code-specific columns
#      listed in fill_dict and predicted for rows where it is missing while all
#      of those predictor columns are present.
#   2. Every remaining gap in the `feature` columns is filled by a
#      5-nearest-neighbour imputer fitted on that product code's rows only.
# The train/test split of `data` is done in the next cell, after imputation.
for code in data.product_code.unique():
    predictors = fill_dict[code]
    code_mask = data.product_code == code

    # Complete cases (all predictors and measurement_17 present) train the regressor.
    fit_rows = data.loc[code_mask, predictors + ['measurement_17']].dropna(how='any')

    # Rows of this code with measurement_17 missing and every predictor present.
    # The mask is built once and reused for counting, predicting and assigning,
    # so the three cannot drift apart.
    fill_mask = (
        code_mask
        & data[predictors].notnull().all(axis=1)
        & data['measurement_17'].isnull()
    )
    n_to_fill = int(fill_mask.sum())
    print("code:", code, " nan:", n_to_fill)

    # Skip the regression when nothing qualifies: scikit-learn estimators
    # raise on an input with zero rows.
    if n_to_fill:
        huber = HuberRegressor()
        huber.fit(fit_rows[predictors], fit_rows['measurement_17'])
        data.loc[fill_mask, 'measurement_17'] = huber.predict(data.loc[fill_mask, predictors])

    # Impute whatever is still missing in the feature columns for this code.
    knn = KNNImputer(n_neighbors=5)
    data.loc[code_mask, feature] = knn.fit_transform(data.loc[code_mask, feature])

code A has 386 samples to fill nan
code B has 418 samples to fill nan
code C has 391 samples to fill nan
code D has 398 samples to fill nan
code E has 429 samples to fill nan
code F has 420 samples to fill nan
code G has 373 samples to fill nan
code H has 361 samples to fill nan
code I has 377 samples to fill nan


In [15]:
# Rows with a 'failure' value form the training set; rows without one form the test set.
is_labeled = data['failure'].notnull()
train = data[is_labeled]

# Training inputs (everything except the target) and the integer-cast target.
x = train.drop(columns=['failure'])
y = train['failure'].astype(int)

# Test rows, with the (all-null) 'failure' column removed.
test = data[~is_labeled].drop(columns=['failure'])

Scale Features

In [16]:
# Columns that are scaled and passed to the model's predict_proba.
select_feature = [
    'm3_missing',
    'm5_missing',
    'measurement_1',
    'measurement_2',
    'loading',
    'measurement_17',
]

In [17]:
# Standardise the selected columns: the scaler is fitted on the training rows (x)
# and the same fitted transform is then applied to the test rows.
scaler = StandardScaler()
scaled_train = scaler.fit_transform(x[select_feature])
scaled_test = scaler.transform(test[select_feature])

# Write the scaled values into a copy so `test` keeps its unscaled columns.
new_test = test.copy()
new_test[select_feature] = scaled_test

assert len(test) == len(new_test)

Predict Model

In [18]:
# Start from the sample submission file and overwrite its 'failure' column with
# the second column of the model's predict_proba output.
submission = pd.read_csv('sample_submission.csv')
failure_proba = model.predict_proba(new_test[select_feature])
submission['failure'] = failure_proba[:, 1]

# Persist only the id and failure columns, without the row index.
submission[['id', 'failure']].to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,failure
0,26570,0.208202
1,26571,0.199114
2,26572,0.2035
3,26573,0.205523
4,26574,0.243987
