In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score

# Load the datasets
train_data = pd.read_csv('../dataset/train.csv')
test_data = pd.read_csv('../dataset/test.csv')
sample_submission = pd.read_csv('../dataset/sample_submission.csv')

# Impute missing values
imputer = SimpleImputer(strategy='mean')
train_data_imputed = train_data.copy()
test_data_imputed = test_data.copy()

# Columns to impute
cols_to_impute = train_data.columns[train_data.isnull().sum() > 0]

train_data_imputed[cols_to_impute] = imputer.fit_transform(train_data[cols_to_impute])
test_data_imputed[cols_to_impute] = imputer.transform(test_data[cols_to_impute])

# One-Hot Encode categorical variables
train_data_encoded = pd.get_dummies(train_data_imputed, columns=['product_code', 'attribute_0', 'attribute_1'])
test_data_encoded = pd.get_dummies(test_data_imputed, columns=['product_code', 'attribute_0', 'attribute_1'])

# Ensure the test set has the same columns as the train set
test_data_encoded = test_data_encoded.reindex(columns=train_data_encoded.columns, fill_value=0)

# Separate features and target variable from the training data
X = train_data_encoded.drop(columns=['id', 'failure'])
y = train_data_encoded['failure']
X_test = test_data_encoded.drop(columns=['id', 'failure'])

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the Logistic Regression model and the parameter grid for GridSearchCV
model = LogisticRegression(max_iter=1000, random_state=42)
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],  # SAGA supports all penalties including 'elasticnet'
    'l1_ratio': [0, 0.5, 1]  # l1_ratio is used only for elasticnet
}

# Implement GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and evaluate on the validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_pred)
print(f'Best ROC AUC Score on Validation Set: {roc_auc}')

# Perform cross-validation on the entire training data
cv_scores = cross_val_score(best_model, X_scaled, y, cv=5, scoring='roc_auc')
print(f'Cross-Validation ROC AUC Scores: {cv_scores}')
print(f'Mean Cross-Validation ROC AUC Score: {cv_scores.mean()}')

# Make predictions on the test data
y_test_pred = best_model.predict_proba(X_test_scaled)[:, 1]

# Prepare the submission file
submission = pd.DataFrame({'id': test_data['id'], 'failure': y_test_pred})
submission.to_csv('submission_lr_tuned.csv', index=False)

Best ROC AUC Score on Validation Set: 0.5944416997859165
Cross-Validation ROC AUC Scores: [0.60898036 0.58636419 0.58799726 0.59647075 0.5833045 ]
Mean Cross-Validation ROC AUC Score: 0.5926234130445159
