In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier

# Load the labelled training data and the unlabelled test data.
train_df = pd.read_csv('trainee_train.csv')
test_df = pd.read_csv('trainee_test_fish.csv')

# Features are every column except 'Unnamed: 0' and the target 'im'.
# NOTE(review): 'Unnamed: 0' is presumably a saved row index -- confirm.
X = train_df.drop(columns=['Unnamed: 0', 'im'])
y = train_df['im']

# Split BEFORE fitting any preprocessing. Fitting the scaler / PCA on the
# full matrix would let the hold-out rows influence the feature statistics
# and the projection basis, which biases the validation score upwards.
# The split arguments are unchanged, so the same rows land in each part.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training part only; the
# hold-out part is transformed with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_val_scaled = scaler.transform(X_val_raw)

# A fractional n_components asks PCA to keep enough components to explain
# that share (95%) of the variance. Again fitted on the training part only.
pca = PCA(n_components=0.95)
X_train = pca.fit_transform(X_train_scaled)
X_val = pca.transform(X_val_scaled)

# Base learners for the ensemble, each built with a fixed random_state.
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)

# (name, estimator) pairs; the names are the prefixes used when addressing
# each member's hyperparameters (e.g. 'xgb__max_depth').
models = [
    ('xgb', xgb_clf),
    ('rf', rf_clf),
    ('gb', gb_clf),
]

# Soft voting combines the members' predicted class probabilities rather
# than their hard class votes.
ensemble = VotingClassifier(estimators=models, voting='soft')

# Candidate hyperparameter values sampled by RandomizedSearchCV.
# Each key has the form '<member name>__<parameter>', where the member name
# is one of the ensemble's estimator labels: 'xgb', 'rf' or 'gb'.
params = dict(
    # XGBoost member
    xgb__n_estimators=[50, 100, 200],
    xgb__learning_rate=[0.1, 0.01, 0.001],
    xgb__max_depth=[3, 4, 5],
    # Random forest member
    rf__n_estimators=[100, 200, 300],
    rf__max_depth=[5, 10, 15],
    # Gradient boosting member
    gb__n_estimators=[100, 200, 300],
    gb__learning_rate=[0.1, 0.01, 0.001],
    gb__max_depth=[3, 4, 5],
)

# Hyperparameter tuning: draw 10 random configurations from `params` and
# score each one with 5-fold cross-validated ROC-AUC, using all CPU cores.
random_search = RandomizedSearchCV(
    estimator=ensemble,
    param_distributions=params,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# The ensemble configured with the best-scoring parameters found above.
best_ensemble = random_search.best_estimator_

# Evaluate the tuned ensemble on the hold-out split. ROC-AUC is computed
# from column 1 of predict_proba (the probability of the second class).
val_proba = best_ensemble.predict_proba(X_val)
y_val_pred = val_proba[:, 1]
roc_auc = roc_auc_score(y_true=y_val, y_score=y_val_pred)
print(f'Validation ROC-AUC Score: {roc_auc}')

# Prepare the test features with the already-fitted scaler and PCA
# (transform only, nothing is refitted on the test data).
X_test = test_df.drop('Unnamed: 0', axis=1)
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)

# Keep column 1 of predict_proba, matching what was scored on validation.
test_proba = best_ensemble.predict_proba(X_test_pca)
test_predictions = test_proba[:, 1]

# Turn probabilities into hard labels with a 0.5 cut-off; the labels are
# stored as floats, so the CSV contains 0.0 / 1.0.
is_positive = test_predictions >= 0.5
binary_predictions = is_positive.astype(float)

# Build the submission table: an id column with an empty header plus the
# predicted 'im' label.
# NOTE(review): the ids come from test_df's positional index, not from its
# 'Unnamed: 0' column -- confirm the two agree for this dataset.
submission_columns = {
    '': test_df.index,
    'im': binary_predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Write with a header row and without pandas' own index column.
submission_df.to_csv('submission_ensemble.csv', index=False, header=True)
