In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from xgboost import XGBClassifier

# Load datasets
# Paths of the labelled training file and the unlabelled test file.
train_data_path, test_data_path = 'data/train.csv', 'data/test.csv'

# Read both CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

# Fill missing values
# Column means are computed once, from the training set only, and reused for
# the test set so that no test-set statistics enter the preprocessing.
# numeric_only=True is required: on pandas >= 2.0, DataFrame.mean() raises a
# TypeError if the frame still contains non-numeric columns (for example if
# 'Gender', which is label-encoded only later, is stored as text).
train_means = train_data.mean(numeric_only=True)
train_data.fillna(train_means, inplace=True)
test_data.fillna(train_means, inplace=True)

# Encode categorical columns
# The encoder learns its label-to-integer mapping from the training column
# and the same mapping is then applied to both frames.
if 'Gender' in train_data.columns:
    le = LabelEncoder().fit(train_data['Gender'])
    train_data['Gender'] = le.transform(train_data['Gender'])
    test_data['Gender'] = le.transform(test_data['Gender'])

# Feature engineering
# Add the BMI-to-Age ratio to both frames when the training data provides
# both source columns.
if {'BMI', 'Age'}.issubset(train_data.columns):
    train_data['BMI_Age'] = train_data['BMI'].div(train_data['Age'])
    test_data['BMI_Age'] = test_data['BMI'].div(test_data['Age'])

# Interaction features
# The feature columns are taken from the training frame and applied to the
# test frame by name, so both matrices are guaranteed to share the same
# column order (the target and the row identifier are not features).
feature_cols = train_data.columns.drop(['Target', 'Id'])
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interaction = poly.fit_transform(train_data[feature_cols])
X_test_interaction = poly.transform(test_data[feature_cols])

# Final feature matrices
# With interaction_only=True and include_bias=False the transformer output
# already begins with the original (degree-1) columns, followed by their
# pairwise products. Stacking the raw columns next to it again would
# duplicate every original feature, so the transformer output is used as is.
X = X_interaction
y = train_data['Target']
X_test = X_test_interaction

# Train-validation split
# stratify=y keeps the class proportions of the full data in both parts, so a
# rare class cannot end up under- or over-represented in the hold-out set.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features
# The scaler is fitted on the training rows only and then applied unchanged
# to the validation and test matrices.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Handle imbalanced data
# SMOTE selects nearest neighbours by distance, so it runs on the scaled
# features; on unscaled data the neighbour search would be dominated by the
# columns with the largest magnitude. Only the training part is resampled;
# the validation part keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Hyperparameter tuning
# Candidate values for each XGBoost hyperparameter explored by the search.
param_dist = dict(
    max_depth=range(3, 10),
    n_estimators=range(100, 1000, 100),
    learning_rate=[0.01, 0.05, 0.1, 0.3],
    subsample=[0.5, 0.7, 1.0],
    colsample_bytree=[0.5, 0.7, 1.0],
    gamma=[0, 0.1, 0.2, 0.5],
    reg_lambda=[1e-3, 0.1, 1.0],
    reg_alpha=[1e-3, 0.1, 1.0],
    min_child_weight=range(1, 10),
)

# Randomized search: 50 sampled configurations, each scored by 3-fold
# cross-validated balanced accuracy, run on all available cores.
# NOTE(review): X_train has already been resampled with SMOTE at this point,
# so the search's internal folds contain synthetic rows; the reported CV
# scores may therefore be optimistic — confirm whether that is acceptable.
xgb_random = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_distributions=param_dist,
    scoring='balanced_accuracy',
    n_iter=50,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# fit() returns the fitted search object, so the winning configuration can be
# read off directly.
best_params = xgb_random.fit(X_train, y_train).best_params_
print(f"Best parameters found by RandomizedSearchCV: {best_params}")

# Train models with best parameters
# Three classifiers are compared; XGBoost uses the configuration selected by
# the randomized search, the other two use fixed settings.
models = {
    'RandomForest': RandomForestClassifier(
        random_state=42,
        n_estimators=200,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=1,
    ),
    'XGBoost': XGBClassifier(random_state=42, **best_params),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
}

for model_name, model in models.items():
    # Fit on the prepared training data and score on the hold-out set.
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    balanced_acc_score = balanced_accuracy_score(y_val, y_val_pred)
    print(f"{model_name} Balanced Accuracy Score on Validation Set: {balanced_acc_score}")

    # Cross-validation scores
    # X and y are the full, unscaled and un-resampled data, so scoring the
    # bare model on them would evaluate a different procedure than the one
    # used for training above. Wrapping scaling, SMOTE and the model in an
    # imbalanced-learn pipeline applies the same preprocessing inside every
    # fold: the scaler and SMOTE are fitted on the fold's training part only,
    # and the fold's test part is never resampled. cross_val_score clones the
    # pipeline for each fold, so the fitted `model` above is left untouched.
    cv_pipeline = ImbPipeline(steps=[
        ('scale', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('model', model),
    ])
    cv_scores = cross_val_score(cv_pipeline, X, y, cv=3, scoring='balanced_accuracy')
    print(f"{model_name} Mean Balanced Accuracy Score with Cross-Validation: {cv_scores.mean()}")

# Ensemble voting prediction
from sklearn.ensemble import VotingClassifier

# Combine the three classifiers from `models` into one soft-voting ensemble;
# each member is registered under a short alias.
voting_model = VotingClassifier(
    voting='soft',
    estimators=[
        (alias, models[model_key])
        for alias, model_key in (
            ('rf', 'RandomForest'),
            ('xgb', 'XGBoost'),
            ('gbc', 'GradientBoosting'),
        )
    ],
)

# Final predictions on test data
# fit() returns the fitted ensemble, so training and prediction are chained.
test_predictions = voting_model.fit(X_train, y_train).predict(X_test)

# Create submission file
# One row per test record: its Id next to the ensemble's predicted Target.
submission = test_data[['Id']].assign(Target=test_predictions)
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully")


Fitting 3 folds for each of 50 candidates, totalling 150 fits
