In [23]:
import pandas as pd
import os

# Build an equal-weight ensemble from every per-model prediction file found
# under the experiments directory and save the combined table.
experiments_dir = 'experiments'

# Folders whose names start with this prefix hold ensemble outputs written by
# this notebook. Some of those outputs also end in 'prediction.csv', so without
# pruning a second run would average earlier ensembles back in as if they were
# independent models.
ensemble_dir_prefix = 'exp_5_ensemble'

ensemble_path = 'experiments/exp_5_ensemble/ensemble_predictions.csv'

prediction_files = []

for root, dirs, files in os.walk(experiments_dir):
    # Editing `dirs` in place controls which sub-directories os.walk visits
    # (top-down mode). Sorting makes the traversal, and therefore the column
    # order of the merged table, independent of filesystem listing order.
    dirs[:] = sorted(d for d in dirs if not d.startswith(ensemble_dir_prefix))
    for file in sorted(files):
        if file.endswith('prediction.csv'):
            prediction_files.append(os.path.join(root, file))

print(f"Found {len(prediction_files)} prediction files")

if not prediction_files:
    raise FileNotFoundError(
        f"No '*prediction.csv' files found under '{experiments_dir}'"
    )

# Load each file as an (id, <model name>) frame. The model name is the file
# name with the '_prediction.csv' suffix removed.
dfs = []
for f in prediction_files:
    model_name = os.path.basename(f).replace('_prediction.csv', '')
    df = pd.read_csv(f)
    missing_columns = {'id', 'probability'} - set(df.columns)
    if missing_columns:
        raise ValueError(
            f"{f} is missing required column(s): {sorted(missing_columns)}"
        )
    # Keep only the key and the score so that any extra column in a file can
    # never be averaged into the ensemble by accident.
    dfs.append(df[['id', 'probability']].rename(columns={'probability': model_name}))

# Outer-join on 'id' so an id missing from some model is kept (as NaN there).
merged = dfs[0]
for df in dfs[1:]:
    merged = merged.merge(df, on='id', how='outer')

# Compute ensemble as average; mean() skips NaN, so each row is averaged over
# the models that actually scored it.
model_columns = [column for column in merged.columns if column != 'id']
merged['ensemble'] = merged[model_columns].mean(axis=1)

# Save the combined predictions
os.makedirs(os.path.dirname(ensemble_path), exist_ok=True)
merged.to_csv(ensemble_path, index=False)

print(f"Ensemble predictions saved to {ensemble_path}")

Found 20 prediction files
Ensemble predictions saved to experiments/exp_5_ensemble/ensemble_predictions.csv


In [24]:
# Experiment with simple and weighted averaging of the best models.
# Relies on `prediction_files` collected by the earlier discovery cell.
best_models = ['exp_1_combined_linear_svc_optuna',
               'exp_2_catboost_optuna',
               'exp_3_hybrid',
               'exp_4_deberta_url_features']

# Blend weights for the weighted ensemble: DeBERTa 70%, others 10% each
model_weights = {
    'exp_4_deberta_url_features': 0.7,
    'exp_1_combined_linear_svc_optuna': 0.1,
    'exp_2_catboost_optuna': 0.1,
    'exp_3_hybrid': 0.1,
}

output_dir = 'experiments/exp_5_ensemble_best_models'

# Load only the best models' predictions
best_dfs = []
loaded_models = []
for f in prediction_files:
    model_name = os.path.basename(f).replace('_prediction.csv', '')
    if model_name not in best_models:
        continue
    if model_name in loaded_models:
        # Two files mapping to the same column name would be suffixed _x/_y by
        # merge and break the column selection below; fail with a clear message.
        raise ValueError(f"More than one prediction file found for model '{model_name}': {f}")
    df = pd.read_csv(f)
    best_dfs.append(df.rename(columns={'probability': model_name}))
    loaded_models.append(model_name)

missing_models = [name for name in best_models if name not in loaded_models]
if missing_models:
    raise FileNotFoundError(f"No prediction file found for model(s): {missing_models}")

# Merge the best models (outer join keeps ids that some model did not score)
merged = best_dfs[0]
for df in best_dfs[1:]:
    merged = merged.merge(df, on='id', how='outer')

# Compute simple ensemble as average (mean() skips NaN)
merged['simple_ensemble'] = merged[best_models].mean(axis=1)

# Compute weighted ensemble. A plain weighted sum would turn a row into NaN as
# soon as one model is missing for that id, while the simple average above
# skips missing values. To keep the two consistent, the weights are
# renormalised per row over the models that are present; rows with every model
# present get the ordinary weighted sum, up to floating-point rounding.
weights = pd.Series(model_weights)[best_models]
best_predictions = merged[best_models]
weighted_sum = best_predictions.mul(weights, axis=1).sum(axis=1)
available_weight = best_predictions.notna().astype(float).mul(weights, axis=1).sum(axis=1)
# Rows where no model is present give 0 / 0 and therefore stay NaN.
merged['weighted_ensemble'] = weighted_sum / available_weight

# Save the combined predictions
os.makedirs(output_dir, exist_ok=True)
merged.to_csv(f'{output_dir}/ensemble_predictions.csv', index=False)

print(f"Ensemble predictions saved to {output_dir}/ensemble_predictions.csv")

# Save the simple average predictions
simple_submission = merged[['id', 'simple_ensemble']].rename(columns={'simple_ensemble': 'probability'})
simple_submission.to_csv(f'{output_dir}/exp_5_simple_avg_prediction.csv', index=False)

# Save the weighted average predictions
weighted_submission = merged[['id', 'weighted_ensemble']].rename(columns={'weighted_ensemble': 'probability'})
weighted_submission.to_csv(f'{output_dir}/exp_5_weighted_avg_prediction.csv', index=False)

print(f"Simple average predictions saved to {output_dir}/exp_5_simple_avg_prediction.csv")
print(f"Weighted average predictions saved to {output_dir}/exp_5_weighted_avg_prediction.csv")

Ensemble predictions saved to experiments/exp_5_ensemble_best_models/ensemble_predictions.csv
Simple average predictions saved to experiments/exp_5_ensemble_best_models/exp_5_simple_avg_prediction.csv
Weighted average predictions saved to experiments/exp_5_ensemble_best_models/exp_5_weighted_avg_prediction.csv
