In [None]:
import pandas as pd
import numpy as np
from collections import Counter


def weighted_ensemble_predictions(file1_path, file2_path, file3_path, file4_path,
                                  *, weights=None):
    """
    Combine the predictions of four models by weighted voting.

    Each CSV file holds one model's predictions, one row per sample. The
    column read from every file is 'prediction' when the first file has a
    column with that name, otherwise the first column of the first file.

    For every sample, each model adds its normalized weight to the label it
    predicted; the label with the largest total wins. On an exact tie the
    label voted for by the earliest model (file1 first) is kept.

    Args:
        file1_path, file2_path, file3_path, file4_path: Paths of the four
            prediction CSV files, in model order.
        weights: Optional mapping with the keys 'model1' .. 'model4' giving
            the non-negative raw weight of each model. Defaults to the
            accuracy scores 0.73, 0.73, 0.76 and 0.75.

    Returns:
        A tuple ``(ensemble_predictions, normalized_weights)``: the list of
        winning labels (one per sample) and the dict of weights rescaled to
        sum to 1.

    Raises:
        ValueError: If ``weights`` lacks one of the four model keys, contains
            a negative value, or does not have a positive sum.
    """
    import warnings

    model_keys = ('model1', 'model2', 'model3', 'model4')

    if weights is None:
        # Default weights are the accuracy scores of the four models.
        weights = {
            'model1': 0.73,  # 73% accuracy
            'model2': 0.73,  # 73% accuracy
            'model3': 0.76,  # 76% accuracy
            'model4': 0.75,  # 75% accuracy
        }

    missing = [key for key in model_keys if key not in weights]
    if missing:
        raise ValueError(f"weights is missing entries for: {', '.join(missing)}")
    if any(weights[key] < 0 for key in model_keys):
        raise ValueError("weights must be non-negative")

    # Normalize weights to sum to 1
    total_weight = sum(weights[key] for key in model_keys)
    if total_weight <= 0:
        raise ValueError("weights must have a positive sum")
    normalized_weights = {key: weights[key] / total_weight for key in model_keys}

    # Read the CSV files
    frames = [pd.read_csv(path)
              for path in (file1_path, file2_path, file3_path, file4_path)]

    # The prediction column is chosen from the first file and used for all.
    first_columns = frames[0].columns
    pred_col = 'prediction' if 'prediction' in first_columns else first_columns[0]
    columns = [frame[pred_col].values for frame in frames]

    # Rows are matched by position, so a length mismatch most likely means
    # the files are not aligned; keep going but make it visible.
    lengths = [len(column) for column in columns]
    if len(set(lengths)) > 1:
        warnings.warn(
            f"Prediction files have different lengths {lengths}; "
            f"only the first {min(lengths)} rows are used.",
            stacklevel=2,
        )

    model_weights = [normalized_weights[key] for key in model_keys]
    ensemble_predictions = []

    # zip stops at the shortest column, which truncates to the common length.
    for sample_preds in zip(*columns):
        votes = {}
        for pred, weight in zip(sample_preds, model_weights):
            votes[pred] = votes.get(pred, 0.0) + weight

        # max returns the first key with the highest total, so ties go to
        # the label seen first (earliest model).
        ensemble_predictions.append(max(votes, key=votes.get))

    return ensemble_predictions, normalized_weights


def simple_majority_voting(file1_path, file2_path, file3_path, file4_path):
    """
    Unweighted alternative: every one of the 4 models casts one vote per sample.

    The column read from each CSV is 'prediction' when the first file has
    such a column, otherwise the first column of the first file. Only as many
    samples as the shortest file contains are voted on.

    Returns a list with the most frequent label per sample; when several
    labels share the top count, the one voted for by the earliest model wins.
    """
    frames = [pd.read_csv(path)
              for path in (file1_path, file2_path, file3_path, file4_path)]

    first_columns = frames[0].columns
    if 'prediction' in first_columns:
        column = 'prediction'
    else:
        column = first_columns[0]

    per_model = [frame[column].values for frame in frames]

    # zip ends with the shortest column, so extra rows in longer files are
    # ignored. Counter keeps insertion order, hence most_common(1) yields the
    # first-seen label among tied ones.
    return [Counter(ballot).most_common(1)[0][0] for ballot in zip(*per_model)]


# Usage example:
if __name__ == "__main__":
    # Prediction files of the four models; replace with your actual paths.
    file1 = "73_predictions_subtask1_test.csv"
    file2 = "73_2_predictions_subtask1_test.csv"
    file3 = "76_predictions_subtask1_test.csv"
    file4 = "75_predictions_subtask1_test.csv"  # Add your fourth model file

    # Method 1: Weighted ensemble (recommended)
    ensemble_preds, weights = weighted_ensemble_predictions(file1, file2, file3, file4)

    print("Normalized weights used:")
    for model, weight in weights.items():
        print(f"{model}: {weight:.3f}")

    print(f"\nFirst 10 ensemble predictions: {ensemble_preds[:10]}")

    # Save the ensemble predictions (all records) as a single-column CSV.
    ensemble_df = pd.DataFrame({'prediction': ensemble_preds})
    ensemble_df.to_csv('ensemble_predictions.csv', index=False, encoding='utf-8')

    # Method 2: Simple majority voting
    majority_preds = simple_majority_voting(file1, file2, file3, file4)

    # Compare both methods sample by sample. The ratio is undefined when
    # there are no predictions, so skip it instead of dividing by zero.
    if ensemble_preds:
        agreement = sum(1 for weighted, majority in zip(ensemble_preds, majority_preds)
                        if weighted == majority)
        print(f"\nAgreement between weighted and majority voting: {agreement/len(ensemble_preds):.3f}")
    else:
        print("\nNo predictions to compare.")