In [135]:
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.metrics import roc_auc_score, accuracy_score

In [136]:
# Read the train and test CSV files from the working directory,
# then report the (rows, columns) shape of each frame.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

for label, frame in (("train", df_train), ("test", df_test)):
    print(label, frame.shape)

train (26570, 26)
test (20775, 25)


In [137]:
# Stack train and test so feature engineering is applied to both at once.
# Note: concat keeps each frame's own row labels (no ignore_index).
data = pd.concat([df_train, df_test]).assign(
    # 1 where the measurement is absent, 0 otherwise (stored as int8).
    m3_missing=lambda frame: frame['measurement_3'].isnull().astype(np.int8),
    m5_missing=lambda frame: frame['measurement_5'].isnull().astype(np.int8),
    # Replace loading with log(1 + loading).
    loading=lambda frame: np.log1p(frame['loading']),
)

In [138]:
# Report how many values are missing in each column of the training frame
# (nothing is filled in this cell).
print(df_train.isnull().sum())

id                   0
product_code         0
loading            250
attribute_0          0
attribute_1          0
attribute_2          0
attribute_3          0
measurement_0        0
measurement_1        0
measurement_2        0
measurement_3      381
measurement_4      538
measurement_5      676
measurement_6      796
measurement_7      937
measurement_8     1048
measurement_9     1227
measurement_10    1300
measurement_11    1468
measurement_12    1601
measurement_13    1774
measurement_14    1874
measurement_15    2009
measurement_16    2110
measurement_17    2284
failure              0
dtype: int64


Filling NA

In [139]:
# Columns subject to imputation: 'loading' plus every column whose name
# starts with 'measurement', in the order they appear in df_test.
feature = [
    col
    for col in df_test.columns
    if col == 'loading' or col.startswith('measurement')
]

# For each product code, the measurement columns used as predictors when
# regressing measurement_17. Written as measurement numbers and expanded
# into full column names.
fill_dict = {
    code: [f'measurement_{number}' for number in numbers]
    for code, numbers in {
        'A': (5, 6, 8),
        'B': (4, 5, 7),
        'C': (5, 7, 8, 9),
        'D': (5, 6, 7, 8),
        'E': (4, 5, 6, 8),
        'F': (4, 5, 6, 7),
        'G': (4, 6, 8, 9),
        'H': (4, 5, 7, 8, 9),
        'I': (3, 7, 8),
    }.items()
}

In [140]:
#preprocessing
# Impute missing values separately for each product code, in two passes:
#   1. measurement_17 is predicted by a HuberRegressor from the code-specific
#      columns in fill_dict, for rows where all of those columns are present;
#   2. every remaining gap in the `feature` columns is filled by a
#      5-nearest-neighbour imputer fitted on that product code's rows.
for code in data.product_code.unique():
    code_mask = data.product_code==code
    tmp = data[code_mask]
    column = fill_dict[code]

    # Rows usable for fitting: predictors and measurement_17 all present.
    tmp_train = tmp[column+['measurement_17']].dropna(how='any')

    # A single mask selects both the rows passed to predict() and the rows
    # written back, so the two can never disagree in length or order.
    fill_mask = code_mask&(data[column].isnull().sum(axis=1)==0)&(data['measurement_17'].isnull())
    tmp_test = data[fill_mask]
    print(f"code {code} has {len(tmp_test)} samples to fill nan")

    # Skip the regression when there is nothing to fill: scikit-learn
    # estimators reject a 0-sample input to predict().
    if len(tmp_test) > 0:
        model1 = HuberRegressor()
        model1.fit(tmp_train[column], tmp_train['measurement_17'])
        data.loc[fill_mask, 'measurement_17'] = model1.predict(tmp_test[column])

    # Fill whatever is still missing in the feature columns for this code.
    model2 = KNNImputer(n_neighbors=5)
    data.loc[code_mask, feature] = model2.fit_transform(data.loc[code_mask, feature])

code A has 386 samples to fill nan
code B has 418 samples to fill nan
code C has 391 samples to fill nan
code D has 398 samples to fill nan
code E has 429 samples to fill nan
code F has 420 samples to fill nan
code G has 373 samples to fill nan
code H has 361 samples to fill nan
code I has 377 samples to fill nan


In [141]:
def scale(train_data, val_data, test_data, feats):
    """Standardize the `feats` columns of three frames using training statistics.

    A StandardScaler is fitted on ``train_data[feats]`` only and then applied
    to the same columns of all three frames. The inputs are not modified;
    scaled copies are returned, with columns outside `feats` left untouched.

    Parameters
    ----------
    train_data, val_data, test_data : DataFrame
        Frames that each contain every column named in `feats`.
    feats : list of str
        Names of the columns to standardize.

    Returns
    -------
    tuple
        ``(new_train, new_val, new_test)`` — copies of the inputs with the
        `feats` columns replaced by their scaled values.
    """
    scaler = StandardScaler().fit(train_data[feats])

    scaled_frames = []
    for frame in (train_data, val_data, test_data):
        scaled = frame.copy()
        scaled[feats] = scaler.transform(frame[feats])
        # Scaling must never change the number of rows.
        assert len(frame) == len(scaled)
        scaled_frames.append(scaled)

    new_train, new_val, new_test = scaled_frames
    return new_train, new_val, new_test

In [142]:
# Split the combined frame back apart: rows with a failure label form the
# training set, rows without one form the test set.
train = data.loc[data['failure'].notnull()]
test = data.loc[data['failure'].isnull()]
print(train.shape, test.shape)

(26570, 28) (20775, 28)


In [143]:
# Separate the predictors (x) from the integer-cast failure label (y).
x = train.drop(['failure'], axis=1)
y = train['failure'].astype(int)
# `test` is rebound to its own result here, so on a second run of this cell
# the 'failure' column is already gone; errors='ignore' keeps the cell
# re-runnable instead of raising KeyError.
test = test.drop(columns=['failure'], errors='ignore')
x.head()

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,m3_missing,m5_missing
0,0,A,4.395683,material_7,material_8,9,5,7.0,8.0,4.0,...,15.859,17.594,15.193,15.029,16.18,13.034,14.684,764.1,0,0
1,1,A,4.453067,material_7,material_8,9,5,14.0,3.0,3.0,...,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0,0
2,2,A,4.424008,material_7,material_8,9,5,12.0,1.0,5.0,...,15.607,19.31,13.798,16.711,18.631,14.094,17.946,663.376,0,0
3,3,A,4.625659,material_7,material_8,9,5,13.0,2.0,6.0,...,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0,0
4,4,A,5.242064,material_7,material_8,9,5,9.0,2.0,8.0,...,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0,0


In [144]:
# Columns fed to the model, in the order they are passed to it.
select_feature = [
    'm3_missing',
    'm5_missing',
    'measurement_1',
    'measurement_2',
    'loading',
    'measurement_17',
]

In [145]:
# Standardize the selected columns. The training frame is passed as both the
# train and the validation input, so x_val is scaled from the same rows as
# x_train. The test frame keeps all of its columns; only select_feature is scaled.
x_train, x_val, x_test = scale(
    train_data=x[select_feature],
    val_data=x[select_feature],
    test_data=test.copy(),
    feats=select_feature,
)

Model

In [146]:
# L2-penalised logistic regression (C=0.0001) fitted on the scaled
# training features with the newton-cg solver.
model = LogisticRegression(
    penalty='l2',
    C=0.0001,
    solver='newton-cg',
    max_iter=1000,
)
model.fit(x_train, y)

LogisticRegression(C=0.0001, max_iter=1000, solver='newton-cg')

In [147]:
# Persist the fitted classifier to model.pickle in the working directory.
# (pickle is imported in the notebook's top import cell.)
# NOTE: only unpickle this file from a trusted source — pickle.load can
# execute arbitrary code.
with open('model.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)