#1 DATA LOADING

In [1]:
# Bagian 1: Memuat Data
import pandas as pd

# Load the datasets
train_data = pd.read_csv('../dataset/train.csv')
test_data = pd.read_csv('../dataset/test.csv')
sample_submission = pd.read_csv('../dataset/sample_submission.csv')

# Display the first few rows of the training data
train_data.head()


Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


#2 DATA CHECKING

In [2]:
# Bagian 2: Memeriksa Data
# Check for missing values
missing_values_train = train_data.isnull().sum()
missing_values_test = test_data.isnull().sum()

print("Missing values in training data:\n", missing_values_train)
print("\nMissing values in test data:\n", missing_values_test)


Missing values in training data:
 id                   0
product_code         0
loading            250
attribute_0          0
attribute_1          0
attribute_2          0
attribute_3          0
measurement_0        0
measurement_1        0
measurement_2        0
measurement_3      381
measurement_4      538
measurement_5      676
measurement_6      796
measurement_7      937
measurement_8     1048
measurement_9     1227
measurement_10    1300
measurement_11    1468
measurement_12    1601
measurement_13    1774
measurement_14    1874
measurement_15    2009
measurement_16    2110
measurement_17    2284
failure              0
dtype: int64

Missing values in test data:
 id                   0
product_code         0
loading            223
attribute_0          0
attribute_1          0
attribute_2          0
attribute_3          0
measurement_0        0
measurement_1        0
measurement_2        0
measurement_3      329
measurement_4      409
measurement_5      508
measurement_6      624
me

#3 PREPROCESSING - IMPUTASI

In [3]:
from sklearn.impute import SimpleImputer

# Impute missing values
imputer = SimpleImputer(strategy='mean')
train_data_imputed = train_data.copy()
test_data_imputed = test_data.copy()

# Columns to impute
cols_to_impute = train_data.columns[train_data.isnull().sum() > 0]

train_data_imputed[cols_to_impute] = imputer.fit_transform(train_data[cols_to_impute])
test_data_imputed[cols_to_impute] = imputer.transform(test_data[cols_to_impute])

# Display the first few rows of the imputed training data
train_data_imputed.head()


Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,16.048444,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,19.172085,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


#4 PREPROCESSING - ONE-HOT ENCODING

In [4]:
# One-Hot Encode categorical variables
train_data_encoded = pd.get_dummies(train_data_imputed, columns=['product_code', 'attribute_0', 'attribute_1'])
test_data_encoded = pd.get_dummies(test_data_imputed, columns=['product_code', 'attribute_0', 'attribute_1'])

# Ensure the test set has the same columns as the train set
test_data_encoded = test_data_encoded.reindex(columns=train_data_encoded.columns, fill_value=0)

# Display the first few rows of the encoded training data
train_data_encoded.head()


Unnamed: 0,id,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,...,product_code_A,product_code_B,product_code_C,product_code_D,product_code_E,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_8
0,0,80.1,9,5,7,8,4,18.04,12.518,15.748,...,True,False,False,False,False,False,True,False,False,True
1,1,84.89,9,5,14,3,3,18.213,11.54,17.717,...,True,False,False,False,False,False,True,False,False,True
2,2,82.43,9,5,12,1,5,18.057,11.652,16.738,...,True,False,False,False,False,False,True,False,False,True
3,3,101.07,9,5,13,2,6,17.295,11.188,18.576,...,True,False,False,False,False,False,True,False,False,True
4,4,188.06,9,5,9,2,8,19.346,12.95,16.99,...,True,False,False,False,False,False,True,False,False,True


#5 MEMISAHKAN FITUR DAN TARGET

In [5]:
# Separate features and target variable from the training data
X = train_data_encoded.drop(columns=['id', 'failure'])
y = train_data_encoded['failure']
X_test = test_data_encoded.drop(columns=['id', 'failure'])

# Display the shape of the features and target
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
print("Shape of X_test:", X_test.shape)

Shape of X: (26570, 31)
Shape of y: (26570,)
Shape of X_test: (20775, 31)


#6 STANDARISASI FITUR

In [6]:
from sklearn.preprocessing import StandardScaler

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Display the first few rows of the scaled features
pd.DataFrame(X_scaled, columns=X.columns).head()


Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,...,product_code_A,product_code_B,product_code_C,product_code_D,product_code_E,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_8
0,-1.228625,1.525966,-1.538285,-0.101025,-0.05537,-0.681939,0.249978,0.797229,-1.402756,1.815874,...,2.051781,-0.496233,-0.5264,-0.488091,-0.501705,-0.496233,0.496233,-0.799571,-0.501705,1.202276
1,-1.105315,1.525966,-1.538285,1.599402,-1.246039,-0.984141,0.424027,-0.194728,0.598997,0.389673,...,2.051781,-0.496233,-0.5264,-0.488091,-0.501705,-0.496233,0.496233,-0.799571,-0.501705,1.202276
2,-1.168644,1.525966,-1.538285,1.113565,-1.722306,-0.379737,0.267081,-0.081129,-0.396288,0.74342,...,2.051781,-0.496233,-0.5264,-0.488091,-0.501705,-0.496233,0.496233,-0.799571,-0.501705,1.202276
3,-0.688791,1.525966,-1.538285,1.356484,-1.484172,-0.077535,-0.499536,-0.551751,1.472286,0.844345,...,2.051781,-0.496233,-0.5264,-0.488091,-0.501705,-0.496233,0.496233,-0.799571,-0.501705,1.202276
4,1.550609,1.525966,-1.538285,0.384811,-1.484172,0.526868,1.563892,1.235394,-0.140096,-1.799071,...,2.051781,-0.496233,-0.5264,-0.488091,-0.501705,-0.496233,0.496233,-0.799571,-0.501705,1.202276


#7 MEMBAGI DATA LATIH DAN VALIDASI

In [7]:
from sklearn.model_selection import train_test_split

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Display the shape of the training and validation sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)


Shape of X_train: (21256, 31)
Shape of X_val: (5314, 31)
Shape of y_train: (21256,)
Shape of y_val: (5314,)


#8 MODELING

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier

# Define the base classifiers
log_reg = LogisticRegression(max_iter=1000, random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)

# Define the ensemble model
ensemble_model = VotingClassifier(
    estimators=[('log_reg', log_reg), ('gb_clf', gb_clf)],
    voting='soft'
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

#9 EVALUASI MODEL

In [9]:
from sklearn.metrics import roc_auc_score

# Evaluate the model
y_val_pred = ensemble_model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_pred)
print(f'ROC AUC Score on Validation Set: {roc_auc}')


ROC AUC Score on Validation Set: 0.5935089603655751


#10 SUBMISSION

In [10]:
# Bagian 10: Prediksi dan Menyimpan Hasil
# Make predictions on the test data
y_test_pred = ensemble_model.predict_proba(X_test_scaled)[:, 1]

# Prepare the submission file
submission = pd.DataFrame({'id': test_data['id'], 'failure': y_test_pred})
#submission.to_csv('submission_lr-gb.csv', index=False)

# Display the first few rows of the submission file
submission.head()


Unnamed: 0,id,failure
0,26570,0.22006
1,26571,0.169587
2,26572,0.173415
3,26573,0.176588
4,26574,0.314013
