In [4]:
import time

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from xgboost import XGBClassifier

# Load datasets
train_data_path = 'data/train.csv'
test_data_path = 'data/test.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Fill missing values.
# The column means are computed on the training set only and reused for the
# test set, so no test-set statistics enter the preprocessing.
# numeric_only=True restricts the means to numeric columns; without it,
# pandas >= 2.0 raises TypeError as soon as one column holds text
# (possibly 'Gender' here, which is label-encoded further down).
train_means = train_data.mean(numeric_only=True)
train_data = train_data.fillna(train_means)
test_data = test_data.fillna(train_means)

# Encode categorical columns.
# The encoder learns its label-to-integer mapping from the training column
# only; the same mapping is then applied to the train and the test frame.
if 'Gender' in train_data.columns:
    le = LabelEncoder().fit(train_data['Gender'])
    for frame in (train_data, test_data):
        frame['Gender'] = le.transform(frame['Gender'])

# Feature engineering: add the BMI-to-Age ratio to both frames whenever the
# training data provides both source columns.
# NOTE(review): a row with Age == 0 would yield inf here - confirm the data
# cannot contain such rows.
if {'BMI', 'Age'}.issubset(train_data.columns):
    train_data['BMI_Age'] = train_data['BMI'].div(train_data['Age'])
    test_data['BMI_Age'] = test_data['BMI'].div(test_data['Age'])

# Interaction features.
# With interaction_only=True and include_bias=False, PolynomialFeatures returns
# the original columns followed by every pairwise product, i.e. its output
# already is the "original + interaction" matrix. Stacking the raw columns next
# to it again would duplicate every original feature.
feature_columns = train_data.drop(columns=['Target', 'Id']).columns
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interaction = poly.fit_transform(train_data[feature_columns])
# Select the test columns through the training column list so that both
# matrices are guaranteed to share the same columns in the same order.
X_test_interaction = poly.transform(test_data[feature_columns])

# Model inputs (original + interaction features) and target
X = X_interaction
y = train_data['Target']
X_test = X_test_interaction

# Train-validation split (80 % / 20 %).
# stratify=y keeps the class proportions of the full training set in both
# parts, so the held-out 20 % stays representative of the imbalanced target
# that is later scored with balanced accuracy.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features.
# The scaler is fitted on the real training rows only and is applied before
# resampling: SMOTE builds synthetic rows from nearest neighbours, so the
# features should be on a comparable scale when those neighbours are searched.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Handle imbalanced data: oversample the training split only; the validation
# and test rows are left untouched.
# (The class name must be spelled `SMOTE`, matching the import; `SMote` raises
# NameError.)
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Search space handed to the randomized hyperparameter search.
# Every entry is a finite collection of candidate values.
param_dist = dict(
    max_depth=range(3, 7),              # 3, 4, 5, 6
    n_estimators=range(100, 300, 100),  # 100, 200
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 1.0],
    colsample_bytree=[0.7, 1.0],
    gamma=[0, 0.1],
    reg_lambda=[0.1, 1.0],
    reg_alpha=[0.1, 1.0],
    min_child_weight=range(1, 4),       # 1, 2, 3
)

# Measure the time taken for a single model fit (default hyperparameters,
# full resampled training set)
xgb_model = XGBClassifier(random_state=42)
start_time = time.time()
xgb_model.fit(X_train, y_train)
end_time = time.time()
single_fit_time = end_time - start_time
print(f"Time for a single fit: {single_fit_time:.2f} seconds")

# Estimate total time for RandomizedSearchCV.
# This is only a rough figure: the fits inside the search use other
# hyperparameters and only part of the rows per fold.
n_candidates = 50  # keep equal to n_iter of the RandomizedSearchCV call
n_cv_folds = 3     # keep equal to cv of the RandomizedSearchCV call
total_fits = n_candidates * n_cv_folds
# The search is started with n_jobs=-1 (one worker per core), so the estimate
# uses the machine's actual core count instead of an assumed value.
number_of_jobs = max(1, joblib.cpu_count())

estimated_total_time = (single_fit_time * total_fits) / number_of_jobs
print(f"Estimated total time for RandomizedSearchCV: {estimated_total_time:.2f} seconds or {estimated_total_time/60:.2f} minutes")

# Perform RandomizedSearchCV: 50 sampled candidates, each scored with 3-fold
# cross-validation on balanced accuracy, using all available cores.
search_settings = dict(
    param_distributions=param_dist,
    n_iter=50,
    scoring='balanced_accuracy',
    cv=3,
    verbose=3,  # per-fit progress messages
    random_state=42,
    n_jobs=-1,
)
xgb_random = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42),
    **search_settings,
)

# Run the search and report the wall-clock duration
start_time = time.time()
xgb_random.fit(X_train, y_train)
end_time = time.time()
search_seconds = end_time - start_time
print(f"Total time taken for RandomizedSearchCV: {search_seconds:.2f} seconds")

# Save the search results
joblib.dump(xgb_random, 'xgb_random_search.pkl')

best_params = xgb_random.best_params_
print(f"Best parameters found by RandomizedSearchCV: {best_params}")

# The search is created without a `refit` argument, so the default refit=True
# applies: best_estimator_ has already been retrained on all of X_train with
# the best parameters. Fitting it a second time would only repeat that work.
xgb_best_model = xgb_random.best_estimator_

# Score the tuned model on the held-out validation split
y_val_pred = xgb_best_model.predict(X_val)
balanced_acc_score = balanced_accuracy_score(y_val, y_val_pred)
print(f"XGBoost Balanced Accuracy Score on Validation Set: {balanced_acc_score}")

# If satisfied with XGBoost, you can then proceed with the other models
# Ensemble voting
# RandomForestClassifier, GradientBoostingClassifier and VotingClassifier are
# imported from sklearn.ensemble together with the other dependencies at the
# top of the cell.

# Base models of the ensemble: two fresh tree ensembles plus the tuned XGBoost
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb_best_model,
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}

# voting='soft' combines the models through their predicted class probabilities
voting_model = VotingClassifier(estimators=[
    ('rf', models['RandomForest']),
    ('xgb', models['XGBoost']),
    ('gbc', models['GradientBoosting'])
], voting='soft')

# Train all three base models on the resampled, scaled training split
voting_model.fit(X_train, y_train)

# Final predictions on test data
test_predictions = voting_model.predict(X_test)

# Create submission file: the test ids paired with the predicted target,
# written without the DataFrame index.
submission = test_data[['Id']].assign(Target=test_predictions)
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")


Fitting 3 folds for each of 50 candidates, totalling 150 fits


KeyboardInterrupt: 