In [1]:
import csv
import pandas as pd
from sklearn.model_selection import GroupKFold, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pickle

root_path = '..'

# read data
train_df = pd.read_csv(root_path + '/tabular-playground-series-aug-2022/train.csv')
test_df = pd.read_csv(root_path + '/tabular-playground-series-aug-2022/test.csv')

# encode string features to numbers
# The encoder is fitted on the union of the train and test values of each
# column so that a given string maps to the same integer in both frames
# (fitting separately on each frame can assign different codes to one string).
labelencoder = LabelEncoder()
for col in ['product_code', 'attribute_0', 'attribute_1']:
    labelencoder.fit(pd.concat([train_df[col], test_df[col]]))
    train_df[col] = labelencoder.transform(train_df[col])
    test_df[col] = labelencoder.transform(test_df[col])

# feature engineering on the RAW values
# Both steps must run before standardization: clipping z-scores at a lower
# bound of 11 would turn the whole column into the constant 11, and the
# product of two z-scores is not the product of the two attributes.
for df in [train_df, test_df]:
    df['measurement_2'] = df['measurement_2'].clip(11, None)
    # add feature: area -> attribute_2 multiplied by attribute_3
    df['area'] = df['attribute_2'] * df['attribute_3']

# standardization
# Identifier columns are left untouched; every other column shared with the
# test frame is scaled with the TRAINING mean/std so that train and test live
# on the same scale the model is fitted on.
identifier_cols = ['id', 'product_code']
scale_cols = [col for col in test_df.columns if col not in identifier_cols]
for col in scale_cols:
    col_mean = train_df[col].mean()
    col_std = train_df[col].std()
    train_df[col] = (train_df[col] - col_mean) / col_std
    test_df[col] = (test_df[col] - col_mean) / col_std

# add feature: avg -> average of the standardized measurement columns from
# measurement_3 onward (the filter below also picks up measurement_17).
# NOTE: this list must be built before the *_missing flags are added, because
# their names also start with 'measurement'.
measurements = [f for f in test_df.columns if f.startswith('measurement') and f != 'measurement_0' and f != 'measurement_1' and f != 'measurement_2' and f != 'measurement_3']
avg_cols = ['measurement_3'] + measurements
# mean(axis=1) skips missing values, so a row only gets a NaN average when all
# of its measurements are missing (a chained `+` would yield NaN as soon as a
# single measurement is missing).
train_avg = train_df[avg_cols].mean(axis=1)
test_avg = test_df[avg_cols].mean(axis=1)
avg_mean = train_avg.mean()
avg_std = train_avg.std()
train_df['avg'] = (train_avg - avg_mean) / avg_std
test_df['avg'] = (test_avg - avg_mean) / avg_std

# add feature: measurement_3_missing & measurement_5_missing -> boolean flags
# that are True where measurement_3 / measurement_5 is missing
for df in [train_df, test_df]:
    df['measurement_3_missing'] = df['measurement_3'].isna()
    df['measurement_5_missing'] = df['measurement_5'].isna()

FileNotFoundError: [Errno 2] No such file or directory: '../tabular-playground-series-aug-2022/train.csv'

In [None]:
# select logistic regression as model
model = LogisticRegression(penalty='l1', C=0.01, solver='liblinear', random_state=1)

# select features
select_features = ['loading', 'measurement_17', 'measurement_0', 'measurement_1', 'measurement_2', 'attribute_0', 'measurement_3_missing', 'measurement_5_missing', 'area', 'avg']
auc_list = []

# cross validation, grouped by product_code so that all rows sharing a
# product_code fall into the same fold
kf = GroupKFold(n_splits=5)
for fold, (train_index, valid_index) in enumerate(kf.split(train_df, train_df.failure, train_df.product_code)):
    x_train = train_df.iloc[train_index][select_features]
    x_valid = train_df.iloc[valid_index][select_features]
    y_train = train_df.iloc[train_index]['failure']
    y_valid = train_df.iloc[valid_index]['failure']

    # fill missing values: the imputer is fitted on the training fold only and
    # then applied unchanged to the validation fold
    imputer = KNNImputer(n_neighbors=3)
    x_train = imputer.fit_transform(x_train)
    x_valid = imputer.transform(x_valid)

    # fit and predict
    model.fit(x_train, y_train)
    y_pred = model.predict_proba(x_valid)[:,1]
    score = roc_auc_score(y_valid, y_pred)
    print(f"Fold {fold}: auc = {score:.5f}")
    auc_list.append(score)

# print out average AUC(area under curve)
print()
print(f"Average auc = {sum(auc_list) / len(auc_list):.5f}")

# refit on the full training set before saving; otherwise the saved model is
# whatever the last fold produced, trained on only 4/5 of the data
imputer = KNNImputer(n_neighbors=3)
x_full = imputer.fit_transform(train_df[select_features])
model.fit(x_full, train_df['failure'])

# save the model (the context manager guarantees the file is flushed and closed)
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# load the model
# NOTE: pickle.load can execute arbitrary code; only load a model file that
# this notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

x_test = test_df.loc[:, select_features]

# fill missing values in test with an imputer fitted on the training features,
# so the test rows are preprocessed consistently with the data the model was
# trained on
imputer = KNNImputer(n_neighbors=3)
imputer.fit(train_df[select_features])
x_test = imputer.transform(x_test)

# predict test data
y_pred = model.predict_proba(x_test)[:,1]

# write predictions to file
# Submission ids are generated as consecutive integers starting at this offset.
first_test_id = 26570
with open('submission.csv', 'w', newline='') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(["id", "failure"])

    for i in range(len(y_pred)):
        csv_writer.writerow([i + first_test_id, float(y_pred[i])])

print("Finsh testing")