# In [5]:
import pandas as pd 
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# Holds one probability array per saved model; it is filled further down.
test_pred = []

# Both CSV files are indexed by their 'id' column.
# NOTE(review): `train` is not used in the visible part of this script.
train = pd.read_csv('train.csv', index_col='id')
test = pd.read_csv('test.csv', index_col='id')

# Work on a copy so that `test` keeps the raw data. The extra boolean column
# records which rows are missing measurement_5, captured before imputation
# overwrites the missing values.
X_test = test.assign(m_5_missing=test['measurement_5'].isna())

# Impute missing values.
# Columns to impute: 'loading' plus every column whose name starts with
# 'measurement'. The 'm_5_missing' flag matches neither, so it is untouched.
features = [
    column
    for column in X_test.columns
    if column == 'loading' or column.startswith('measurement')
]

# Missing entries in each column are replaced by that column's most frequent
# value.
# NOTE(review): the imputer is fitted on the test set itself; confirm this
# matches how the data used to train the saved models was imputed.
imputer = SimpleImputer(strategy="most_frequent").fit(X_test[features])
X_test[features] = imputer.transform(X_test[features])

# Encode the two categorical attribute columns as boolean material flags.
encode_attributes = ['attribute_0', 'attribute_1']

# A flag is True for a row when either attribute column equals that material.
# The comparison is vectorised, so no per-row Python loop is needed, and both
# flags share one code path instead of two copy-pasted loops.
for material in ('material_5', 'material_7'):
    X_test[material] = (
        (X_test['attribute_0'] == material)
        | (X_test['attribute_1'] == material)
    )

# The raw attribute columns are dropped once the flags exist.
X_test.drop(columns=encode_attributes, inplace=True)

# Columns passed to the saved models at prediction time.
feature_selc = ['loading', 'material_5', 'material_7', 'measurement_2', 'measurement_10', 'measurement_17', 'm_5_missing']

# The selected columns are the same for every model, so slice them once.
X_selected = X_test[feature_selc]

# Models are loaded from the files "model_1" .. "model_3". For each one, keep
# the second column of predict_proba (probability of the class at index 1).
for fold in range(1, 4):
    model = joblib.load(f"model_{fold}")
    fold_proba = model.predict_proba(X_selected)
    test_pred.append(fold_proba[:, 1])
    
# Average the per-model probabilities element-wise and write them to a CSV.
fold_count = len(test_pred)
mean_failure = sum(test_pred) / fold_count

# One row per test id; the DataFrame's own index is not written to the file.
submission = pd.DataFrame({'id': test.index, 'failure': mean_failure})
submission.to_csv('submission.csv', index=False)