In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the paths used later in this
# notebook start with this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import csv
import pandas as pd
from sklearn.model_selection import GroupKFold, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pickle

# Base folder on the mounted Drive that holds the dataset, the pickled model
# and the submission file written later.
root_path = '/content/drive/MyDrive/ML/Final/'

# Load the test split.
test_csv_path = root_path + 'tabular-playground-series-aug-2022/test.csv'
test_df = pd.read_csv(test_csv_path)

# Replace each string-valued column with integer codes. One encoder object is
# refit per column, so afterwards it only remembers the last column's classes.
# NOTE(review): the codes are learned from the test data alone; presumably the
# training notebook produced the same mapping for the columns the model uses --
# confirm.
labelencoder = LabelEncoder()
for categorical_column in ('product_code', 'attribute_0', 'attribute_1'):
    test_df[categorical_column] = labelencoder.fit_transform(test_df[categorical_column])

# Standardise every column of the frame to zero mean / unit standard
# deviation, using statistics computed from the test data itself.
# NOTE(review): the loop does not skip any column, so an id column and the
# label-encoded categoricals are rescaled as well -- confirm this is intended.
for column_name in list(test_df.columns):
    column = test_df[column_name]
    centered = column - column.mean()
    test_df[column_name] = centered / column.std()

# Feature engineering: floor measurement_2 at 11.
# NOTE(review): measurement_2 has already been standardised above, so a lower
# bound of 11 presumably overwrites almost every non-missing value with 11.
# Left untouched because the pickled model was presumably trained with the
# same preprocessing -- confirm against the training notebook.
test_df['measurement_2'] = test_df['measurement_2'].clip(lower=11)

# New feature 'avg': the standardised sum of measurement_3 and every other
# column whose name starts with 'measurement', except measurement_0/1/2.
# That includes measurement_17. The sum is never divided by the number of
# columns; it is only z-scored afterwards. A row with a missing value in any
# of these columns gets NaN.
excluded_measurements = ('measurement_0', 'measurement_1', 'measurement_2', 'measurement_3')
measurements = [
    name
    for name in test_df.columns
    if name.startswith('measurement') and name not in excluded_measurements
]
running_total = test_df['measurement_3']
for name in measurements:
    # Accumulate left to right, in column order.
    running_total = running_total + test_df[name]
test_df['avg'] = (running_total - running_total.mean()) / running_total.std()

# New features: boolean flags that are True where measurement_3 /
# measurement_5 is missing in that row.
for source_column in ('measurement_3', 'measurement_5'):
    test_df[source_column + '_missing'] = test_df[source_column].isnull()

# New feature 'area': element-wise product of attribute_2 and attribute_3.
test_df['area'] = test_df['attribute_2'].mul(test_df['attribute_3'])

In [3]:
# Columns fed to the model, in this order.
select_features = ['loading', 'measurement_17', 'measurement_0', 'measurement_1', 'measurement_2', 'attribute_0', 'measurement_3_missing', 'measurement_5_missing', 'area', 'avg']

# Load the trained model. The file is opened in a `with` block so the handle
# is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code while loading; only use a
# model.pkl that you produced yourself or otherwise trust.
with open(root_path + 'model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

x_test = test_df.loc[:, select_features]

# Fill missing values with a 3-nearest-neighbour imputer fitted on the test
# features themselves; the result is a plain NumPy array.
imputer = KNNImputer(n_neighbors=3)
x_test = imputer.fit_transform(x_test)

# Probability of the second class (column index 1) for every test row.
y_pred = model.predict_proba(x_test)[:,1]

# Write predictions to the submission file.
# Ids are generated as consecutive integers starting at this offset, one per
# prediction in order.
# NOTE(review): presumably 26570 is the first id of the test split -- confirm
# against test.csv.
FIRST_TEST_ID = 26570

# The context manager guarantees the file is flushed and closed even if
# writing a row raises.
with open(root_path + '109550018.csv', 'w', newline='') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(["id", "failure"])

    for i, probability in enumerate(y_pred):
        # float() turns the NumPy scalar into a plain Python float before it
        # is written.
        csv_writer.writerow([i + FIRST_TEST_ID, float(probability)])

print("Finsh inference")

Finsh inference
