In [2]:
import pandas as pd 
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# Train one logistic-regression pipeline per GroupKFold split, report the
# validation AUC of every fold, and write the averaged test predictions of
# folds 1-3 to submission.csv.
auc_list = []
test_pred = []
train = pd.read_csv('train.csv', index_col='id')
test = pd.read_csv('test.csv', index_col='id')

# Columns that get imputed: 'loading' plus every 'measurement_*' column.
features = [col for col in test.columns if col == 'loading' or col.startswith('measurement')]
# Raw categorical columns; replaced by the two boolean material flags below.
encode_attributes = ['attribute_0', 'attribute_1']
# Columns actually fed to the model.
feature_selc = ['loading', 'material_5', 'material_7', 'measurement_2', 'measurement_10', 'measurement_17', 'm_5_missing']

# Splits are grouped by product_code, so a product code never appears on both
# the training and the validation side of the same fold.
kf = GroupKFold(n_splits = 5)
for fold, (idx_train, idx_vaild) in enumerate(kf.split(train, train.failure, train.product_code)):
    # Explicit copies: these frames are modified below, and writing into the
    # result of a chained selection triggers pandas' SettingWithCopyWarning.
    X_train = train.iloc[idx_train][test.columns].copy()
    X_vaild = train.iloc[idx_vaild][test.columns].copy()
    y_train = train.iloc[idx_train].failure
    y_vaild = train.iloc[idx_vaild].failure
    X_test = test.copy()

    # Record whether measurement_5 was missing. This must happen before the
    # imputation step, which fills the missing values in.
    for df in (X_train, X_vaild, X_test):
        df['m_5_missing'] = df.measurement_5.isna()

    # Impute with the most frequent value, fitted on this fold's training rows only.
    imputer = SimpleImputer(strategy = "most_frequent")
    imputer.fit(X_train[features])
    for df in (X_train, X_vaild, X_test):
        df[features] = imputer.transform(df[features])

    # Encode the attributes: a flag is True when either attribute column
    # equals that material name; the raw attribute columns are then dropped.
    for df in (X_train, X_vaild, X_test):
        for material in ('material_5', 'material_7'):
            df[material] = (df['attribute_0'] == material) | (df['attribute_1'] == material)
        df.drop(columns = encode_attributes, inplace = True)

    # Random forest, AdaBoost and L1 logistic regression were also tried here;
    # the L2-penalised logistic regression below is the one in use.
    model = make_pipeline(StandardScaler(), LogisticRegression(penalty = 'l2', C = 0.000005, solver = 'saga', random_state = 0))
    model.fit(X_train[feature_selc], y_train)
    y_vaild_pred = model.predict_proba(X_vaild[feature_selc])[ : , 1]
    score = roc_auc_score(y_vaild, y_vaild_pred)
    print(f"Fold {fold}: auc = {score}")
    auc_list.append(score)

    # Only folds 1, 2 and 3 contribute to the test ensemble and are saved;
    # the reload cell further down loads exactly model_1 .. model_3.
    if 0 < fold < 4:
        test_pred.append(model.predict_proba(X_test[feature_selc])[ : , 1])
        joblib.dump(model, f"model_{fold}")

# Show overall score
print(f"Average auc = {sum(auc_list) / len(auc_list):.5f}")

# Write the mean of the collected fold predictions to csv
submission = pd.DataFrame({'id': test.index, 'failure': sum(test_pred)/len(test_pred)})
submission.to_csv('submission.csv', index=False)

Fold 0: auc = 0.5873615394326317
Fold 1: auc = 0.5848479830444022
Fold 2: auc = 0.5922739327255239
Fold 3: auc = 0.5992151528776978
Fold 4: auc = 0.5920760904094672
Average auc = 0.59115


If the code above cannot reproduce the result, please download the saved models (`model_1`, `model_2` and `model_3`) and run the following cell instead.

In [3]:
import pandas as pd 
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
# Stand-alone inference: rebuild the per-fold preprocessing from the CSV files
# and average the predictions of the saved fold models (model_1 .. model_3).
# Nothing here depends on variables left behind by another cell, so it runs on
# a fresh kernel as long as train.csv, test.csv and the model files are present.
train = pd.read_csv('train.csv', index_col='id')
test = pd.read_csv('test.csv', index_col='id')
test_pred = []

# Columns that get imputed: 'loading' plus every 'measurement_*' column.
features = [col for col in test.columns if col == 'loading' or col.startswith('measurement')]
# Raw categorical columns; replaced by the two boolean material flags below.
encode_attributes = ['attribute_0', 'attribute_1']
# Columns the saved models are fed.
feature_selc = ['loading', 'material_5', 'material_7', 'measurement_2', 'measurement_10', 'measurement_17', 'm_5_missing']

# The engineered flags do not depend on the fold, so build them once.
X_test_base = test.copy()
# Record whether measurement_5 was missing (taken before any imputation).
X_test_base['m_5_missing'] = X_test_base.measurement_5.isna()
# A flag is True when either attribute column equals that material name.
for material in ('material_5', 'material_7'):
    X_test_base[material] = (X_test_base['attribute_0'] == material) | (X_test_base['attribute_1'] == material)
X_test_base = X_test_base.drop(columns = encode_attributes)

# Each saved model was trained on data imputed with the most frequent values of
# its own training fold. GroupKFold without shuffling is deterministic, so
# re-creating the splits lets the matching imputer be refitted for every model.
kf = GroupKFold(n_splits = 5)
for fold, (idx_train, _) in enumerate(kf.split(train, train.failure, train.product_code)):
    if not 0 < fold < 4:
        continue  # only model_1, model_2 and model_3 are loaded
    imputer = SimpleImputer(strategy = "most_frequent")
    imputer.fit(train.iloc[idx_train][features])
    X_test = X_test_base.copy()
    X_test[features] = imputer.transform(X_test[features])
    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files obtained from a trusted source.
    model = joblib.load(f"model_{fold}")
    test_pred.append(model.predict_proba(X_test[feature_selc])[ : , 1])

# Write the mean of the collected fold predictions to csv
submission = pd.DataFrame({'id': test.index, 'failure': sum(test_pred)/len(test_pred)})
submission.to_csv('submission.csv', index=False)