In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd

# Load the datasets
train_data = pd.read_csv('../dataset/train.csv')
test_data = pd.read_csv('../dataset/test.csv')
sample_submission = pd.read_csv('../dataset/sample_submission.csv')

# Impute missing values
imputer = SimpleImputer(strategy='mean')
train_data_imputed = train_data.copy()
test_data_imputed = test_data.copy()

# Columns to impute
cols_to_impute = train_data.columns[train_data.isnull().sum() > 0]

train_data_imputed[cols_to_impute] = imputer.fit_transform(train_data[cols_to_impute])
test_data_imputed[cols_to_impute] = imputer.transform(test_data[cols_to_impute])

# One-Hot Encode categorical variables
train_data_encoded = pd.get_dummies(train_data_imputed, columns=['product_code', 'attribute_0', 'attribute_1'])
test_data_encoded = pd.get_dummies(test_data_imputed, columns=['product_code', 'attribute_0', 'attribute_1'])

# Ensure the test set has the same columns as the train set
test_data_encoded = test_data_encoded.reindex(columns=train_data_encoded.columns, fill_value=0)

# Separate features and target variable from the training data
X = train_data_encoded.drop(columns=['id', 'failure'])
y = train_data_encoded['failure']
X_test = test_data_encoded.drop(columns=['id', 'failure'])

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the base classifiers
log_reg = LogisticRegression(max_iter=1000, random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)

# Define the ensemble model
ensemble_model = VotingClassifier(
    estimators=[('log_reg', log_reg), ('gb_clf', gb_clf)],
    voting='soft'
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the model
y_val_pred = ensemble_model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_pred)
print(f'ROC AUC Score on Validation Set: {roc_auc}')

# Make predictions on the test data
y_test_pred = ensemble_model.predict_proba(X_test_scaled)[:, 1]

# Prepare the submission file
submission = pd.DataFrame({'id': test_data['id'], 'failure': y_test_pred})
submission.to_csv('submission_lr-gb.csv', index=False)

ROC AUC Score on Validation Set: 0.5935089603655751
