In [None]:
import pandas as pd
import numpy as np

def create_weighted_average_predictions():
    """Blend three models' per-sentence predictions into one confidence-weighted prediction.

    Each input CSV is read by relative path and reduced to the columns
    ``Sentence_ID``, ``Prediction`` and ``Confidence_Score`` (``ID`` /
    ``Sentence ID`` and ``Predicted_Level`` are accepted as aliases).
    For every sentence predicted by at least two models the result is
    ``round(sum(pred * conf) / sum(conf))``; when the confidences do not sum
    to a positive value the plain mean of the predictions is rounded instead.

    The combined table is written to ``Average Weighted Predictions3.csv`` and
    progress/summary information is printed along the way.

    Returns:
        pandas.DataFrame | None: One row per sentence with the columns
        ``Sentence_ID`` (as str), ``Prediction`` and ``Highest_Conf_Value``,
        sorted by sentence ID; ``None`` when no sentence produced a result.

    Raises:
        FileNotFoundError: If one of the input CSV files is missing.
        KeyError: If an input file lacks one of the required columns.
    """
    print("Loading the three prediction files...")

    # (path, label) for every model taking part in the ensemble.
    # NOTE(review): the labels do not appear to match the model named in each
    # file name; they only show up in the "Sample data" printout - confirm.
    prediction_files = [
        (r"open_72,000_marbert_regression_weighted_detailed_predictions.csv", 'AraBERTv2'),
        (r"coral_arabertv2_OPEN_detailed_predictions (1).csv", 'AraELECTRA'),
        (r"open_arabertv2_regression_weighted_detailed_predictions.csv", 'MaraBERT'),
    ]
    required_columns = ['Sentence_ID', 'Prediction', 'Confidence_Score']
    column_aliases = {
        'ID': 'Sentence_ID',
        'Sentence ID': 'Sentence_ID',
        'Predicted_Level': 'Prediction',
    }

    # Load every file and bring it to the shared three-column schema.
    loaded_frames = []
    for file_number, (path, model_name) in enumerate(prediction_files, start=1):
        frame = pd.read_csv(path)
        print(f"Columns in file {file_number}:", frame.columns.tolist())
        # rename() ignores aliases that are absent, so no per-column checks are needed.
        frame = frame.rename(columns=column_aliases)[required_columns].copy()
        frame['Model'] = model_name
        loaded_frames.append(frame)

    # Drop rows that are missing an ID, a prediction or a confidence.
    cleaned_frames = []
    for file_number, frame in enumerate(loaded_frames, start=1):
        print(f"File {file_number} shape before cleaning: {frame.shape}")
        frame = frame.dropna(subset=required_columns)
        print(f"File {file_number} shape after cleaning: {frame.shape}")
        cleaned_frames.append(frame)

    all_predictions = pd.concat(cleaned_frames, ignore_index=True)

    print(f"Total records loaded: {len(all_predictions)}")
    print(f"Unique sentence IDs: {all_predictions['Sentence_ID'].nunique()}")
    print("\nSample data:")
    print(all_predictions.head(10))

    # One pass over the groups instead of re-filtering the whole frame per sentence.
    # sort=False keeps the sentences in order of first appearance.
    weighted_results = []
    for sentence_id, sentence_data in all_predictions.groupby('Sentence_ID', sort=False):
        if len(sentence_data) < 2:
            print(f"Warning: Sentence {sentence_id} has only {len(sentence_data)} predictions")
            continue

        # NaNs were already removed above, so the arrays can be used directly.
        predictions = sentence_data['Prediction'].to_numpy()
        confidences = sentence_data['Confidence_Score'].to_numpy()

        confidence_sum = np.sum(confidences)
        if confidence_sum > 0:
            combined = np.sum(predictions * confidences) / confidence_sum
        else:
            # No usable weights: fall back to the unweighted mean.
            combined = np.mean(predictions)

        weighted_results.append({
            'Sentence_ID': sentence_id,
            # Built-in round() resolves exact .5 ties to the nearest even integer.
            'Prediction': round(combined),
            'Highest_Conf_Value': confidences.max(),
        })

    if not weighted_results:
        print("❌ ERROR: No valid results generated!")
        return None

    # Sort while the IDs still have their original dtype, then store them as text.
    result_df = pd.DataFrame(weighted_results).sort_values('Sentence_ID')
    result_df['Sentence_ID'] = result_df['Sentence_ID'].astype(str)

    output_file = "Average Weighted Predictions3.csv"
    result_df.to_csv(output_file, index=False)

    print("\n✅ SUCCESS!")
    print(f"📁 File saved as: {output_file}")
    print(f"📊 Total predictions: {len(result_df)}")
    print(f"📋 Columns: {result_df.columns.tolist()}")
    print("\nFirst 10 predictions:")
    print(result_df.head(10))
    print("\nLast 10 predictions:")
    print(result_df.tail(10))

    print("\nPrediction statistics:")
    print(f"Min prediction: {result_df['Prediction'].min()}")
    print(f"Max prediction: {result_df['Prediction'].max()}")
    print(f"Mean prediction: {result_df['Prediction'].mean():.2f}")

    # Hand the table back so the caller's `result` is usable (previously always None).
    return result_df
    


# Run the three-model ensemble: the input CSVs are read by relative path and
# the combined predictions are written to "Average Weighted Predictions3.csv".
result = create_weighted_average_predictions()

Loading the three prediction files...
Columns in file 1: ['ID', 'Sentence', 'Predicted_Level', 'Confidence_Score', 'Model_Type', 'Prediction_Method']
Columns in file 2: ['ID', 'Sentence', 'Predicted_Level', 'Raw_Prediction', 'Confidence_Score']
Columns in file 3: ['ID', 'Sentence', 'Predicted_Level', 'Raw_Prediction', 'Confidence_Score']
File 1 shape before cleaning: (3420, 4)
File 1 shape after cleaning: (3420, 4)
File 2 shape before cleaning: (3420, 4)
File 2 shape after cleaning: (3420, 4)
File 3 shape before cleaning: (3420, 4)
File 3 shape after cleaning: (3420, 4)
Total records loaded: 10260
Unique sentence IDs: 3420

Sample data:
   Sentence_ID  Prediction  Confidence_Score      Model
0  10102950001           7          0.538000  AraBERTv2
1  10102950002           3          0.484714  AraBERTv2
2  10102950003          12          0.228546  AraBERTv2
3  10102950004          12          0.294010  AraBERTv2
4  10102950005           5          0.374268  AraBERTv2
5  10102950006     

In [None]:
import pandas as pd
import numpy as np


def create_weighted_average_predictions():
    """Blend five models' per-sentence predictions into one confidence-weighted prediction.

    Each input CSV is read by relative path and reduced to the columns
    ``Sentence_ID``, ``Prediction`` and ``Confidence_Score`` (``ID`` /
    ``Sentence ID`` and ``Predicted_Level`` are accepted as aliases).
    For every sentence predicted by at least two models the result is
    ``round(sum(pred * conf) / sum(conf))``; when the confidences do not sum
    to a positive value the plain mean of the predictions is rounded instead.

    The combined table is written to
    ``Average Weighted Predictions_5models.csv`` and progress/summary
    information is printed along the way.

    Returns:
        pandas.DataFrame | None: One row per sentence with the columns
        ``Sentence_ID`` (as str), ``Prediction`` and ``Highest_Conf_Value``,
        sorted by sentence ID; ``None`` when no sentence produced a result.

    Raises:
        FileNotFoundError: If one of the input CSV files is missing.
        KeyError: If an input file lacks one of the required columns.
    """
    print("Loading the five prediction files...")

    # (path, label) for every model taking part in the ensemble.
    # NOTE(review): the labels do not appear to match the model named in each
    # file name ('Model4' / 'Model5' are placeholders); they only show up in
    # the sample and model-distribution printouts - confirm.
    prediction_files = [
        (r"arabertv2_regression_weighted_detailed_predictions.csv", 'AraBERTv2'),
        (r"Samer_coral_arabertv2_d3tok_detailed_predictions.csv", 'AraELECTRA'),
        (r"coral_arabertv2_d3tok_detailed_predictions.csv", 'MaraBERT'),
        (r"araelctra_regression_weighted_detailed_predictions.csv", 'Model4'),
        (r"Samer_camelbert_msa_classification_detailed_predictions.csv", 'Model5'),
    ]
    required_columns = ['Sentence_ID', 'Prediction', 'Confidence_Score']
    column_aliases = {
        'ID': 'Sentence_ID',
        'Sentence ID': 'Sentence_ID',
        'Predicted_Level': 'Prediction',
    }

    # Load every file and bring it to the shared three-column schema.
    loaded_frames = []
    for file_number, (path, model_name) in enumerate(prediction_files, start=1):
        frame = pd.read_csv(path)
        print(f"Columns in file {file_number}:", frame.columns.tolist())
        # rename() ignores aliases that are absent, so no per-column checks are needed.
        frame = frame.rename(columns=column_aliases)[required_columns].copy()
        frame['Model'] = model_name
        loaded_frames.append(frame)

    # Drop rows that are missing an ID, a prediction or a confidence.
    cleaned_frames = []
    for file_number, frame in enumerate(loaded_frames, start=1):
        print(f"File {file_number} shape before cleaning: {frame.shape}")
        frame = frame.dropna(subset=required_columns)
        print(f"File {file_number} shape after cleaning: {frame.shape}")
        cleaned_frames.append(frame)

    all_predictions = pd.concat(cleaned_frames, ignore_index=True)

    print(f"Total records loaded: {len(all_predictions)}")
    print(f"Unique sentence IDs: {all_predictions['Sentence_ID'].nunique()}")
    print("\nSample data:")
    print(all_predictions.head(10))

    # One pass over the groups instead of re-filtering the whole frame per sentence.
    # sort=False keeps the sentences in order of first appearance.
    weighted_results = []
    for sentence_id, sentence_data in all_predictions.groupby('Sentence_ID', sort=False):
        if len(sentence_data) < 2:
            print(f"Warning: Sentence {sentence_id} has only {len(sentence_data)} predictions")
            continue

        # NaNs were already removed above, so the arrays can be used directly.
        predictions = sentence_data['Prediction'].to_numpy()
        confidences = sentence_data['Confidence_Score'].to_numpy()

        confidence_sum = np.sum(confidences)
        if confidence_sum > 0:
            combined = np.sum(predictions * confidences) / confidence_sum
        else:
            # No usable weights: fall back to the unweighted mean.
            combined = np.mean(predictions)

        weighted_results.append({
            'Sentence_ID': sentence_id,
            # Built-in round() resolves exact .5 ties to the nearest even integer.
            'Prediction': round(combined),
            'Highest_Conf_Value': confidences.max(),
        })

    if not weighted_results:
        print("❌ ERROR: No valid results generated!")
        return None

    # Sort while the IDs still have their original dtype, then store them as text.
    result_df = pd.DataFrame(weighted_results).sort_values('Sentence_ID')
    result_df['Sentence_ID'] = result_df['Sentence_ID'].astype(str)

    output_file = "Average Weighted Predictions_5models.csv"
    result_df.to_csv(output_file, index=False)

    print("\n✅ SUCCESS!")
    print(f"📁 File saved as: {output_file}")
    print(f"📊 Total predictions: {len(result_df)}")
    print(f"📋 Columns: {result_df.columns.tolist()}")
    print("\nFirst 10 predictions:")
    print(result_df.head(10))
    print("\nLast 10 predictions:")
    print(result_df.tail(10))

    print("\nPrediction statistics:")
    print(f"Min prediction: {result_df['Prediction'].min()}")
    print(f"Max prediction: {result_df['Prediction'].max()}")
    print(f"Mean prediction: {result_df['Prediction'].mean():.2f}")

    print("\nModel distribution in data:")
    print(all_predictions['Model'].value_counts())

    # Hand the table back so the caller's `result` is usable (previously always None).
    return result_df



# Run the five-model ensemble: the input CSVs are read by relative path and the
# combined predictions are written to "Average Weighted Predictions_5models.csv".
result = create_weighted_average_predictions()


Loading the five prediction files...
Columns in file 1: ['ID', 'Sentence', 'Predicted_Level', 'Raw_Prediction', 'Confidence_Score']
Columns in file 2: ['ID', 'Sentence', 'Predicted_Level', 'Raw_Prediction', 'Confidence_Score']
Columns in file 3: ['ID', 'Sentence', 'Predicted_Level', 'Raw_Prediction', 'Confidence_Score']
Columns in file 4: ['ID', 'Sentence', 'Predicted_Level', 'Confidence_Score', 'Model_Type', 'Prediction_Method']
Columns in file 5: ['ID', 'Sentence', 'Predicted_Level', 'Raw_Prediction', 'Confidence_Score']
File 1 shape before cleaning: (3420, 4)
File 1 shape after cleaning: (3420, 4)
File 2 shape before cleaning: (3420, 4)
File 2 shape after cleaning: (3420, 4)
File 3 shape before cleaning: (3420, 4)
File 3 shape after cleaning: (3420, 4)
File 4 shape before cleaning: (3420, 4)
File 4 shape after cleaning: (3420, 4)
File 5 shape before cleaning: (3420, 4)
File 5 shape after cleaning: (3420, 4)
Total records loaded: 17100
Unique sentence IDs: 3420

Sample data:
   Sente

In [None]:
import pandas as pd
import numpy as np

def create_weighted_average_predictions():
    """Blend four models' per-sentence predictions into one confidence-weighted prediction.

    Each input CSV is read by relative path and reduced to the columns
    ``Sentence_ID``, ``Prediction`` and ``Confidence_Score`` (``ID`` /
    ``Sentence ID`` and ``Predicted_Level`` are accepted as aliases).
    For every sentence predicted by at least two models the result is
    ``round(sum(pred * conf) / sum(conf))``; when the confidences do not sum
    to a positive value the plain mean of the predictions is rounded instead.

    The combined table is written to
    ``Average Weighted Predictions_4models.csv`` and progress/summary
    information is printed along the way.

    Returns:
        pandas.DataFrame | None: One row per sentence with the columns
        ``Sentence ID`` (as str; note the space), ``Prediction`` and
        ``Highest_Conf_Value``, sorted by sentence ID; ``None`` when no
        sentence produced a result.

    Raises:
        FileNotFoundError: If one of the input CSV files is missing.
        KeyError: If an input file lacks one of the required columns.
    """
    print("Loading the four prediction files...")

    # (path, label) for every model taking part in the ensemble.
    # NOTE(review): the labels do not appear to match the model named in each
    # file name; they only show up in the sample and model-distribution
    # printouts - confirm.
    prediction_files = [
        (r"open_72,000_marbert_regression_weighted_detailed_predictions.csv", 'AraBERTv2'),
        (r"coral_arabertv2_OPEN_detailed_predictions (1).csv", 'AraELECTRA'),
        (r"open_arabertv2_regression_weighted_detailed_predictions.csv", 'MaraBERT'),
        (r"72,000_araelctra_regression_weighted_detailed_predictions (1).csv", 'MaraBERTv2'),
    ]
    required_columns = ['Sentence_ID', 'Prediction', 'Confidence_Score']
    column_aliases = {
        'ID': 'Sentence_ID',
        'Sentence ID': 'Sentence_ID',
        'Predicted_Level': 'Prediction',
    }

    # Load every file and bring it to the shared three-column schema.
    loaded_frames = []
    for file_number, (path, model_name) in enumerate(prediction_files, start=1):
        frame = pd.read_csv(path)
        print(f"Columns in file {file_number}:", frame.columns.tolist())
        # rename() ignores aliases that are absent, so no per-column checks are needed.
        frame = frame.rename(columns=column_aliases)[required_columns].copy()
        frame['Model'] = model_name
        loaded_frames.append(frame)

    # Drop rows that are missing an ID, a prediction or a confidence.
    cleaned_frames = []
    for file_number, frame in enumerate(loaded_frames, start=1):
        print(f"File {file_number} shape before cleaning: {frame.shape}")
        frame = frame.dropna(subset=required_columns)
        print(f"File {file_number} shape after cleaning: {frame.shape}")
        cleaned_frames.append(frame)

    all_predictions = pd.concat(cleaned_frames, ignore_index=True)

    print(f"Total records loaded: {len(all_predictions)}")
    print(f"Unique sentence IDs: {all_predictions['Sentence_ID'].nunique()}")
    print("\nSample data:")
    print(all_predictions.head(10))

    # One pass over the groups instead of re-filtering the whole frame per sentence.
    # sort=False keeps the sentences in order of first appearance.
    weighted_results = []
    for sentence_id, sentence_data in all_predictions.groupby('Sentence_ID', sort=False):
        if len(sentence_data) < 2:
            print(f"Warning: Sentence {sentence_id} has only {len(sentence_data)} predictions")
            continue

        # NaNs were already removed above, so the arrays can be used directly.
        predictions = sentence_data['Prediction'].to_numpy()
        confidences = sentence_data['Confidence_Score'].to_numpy()

        confidence_sum = np.sum(confidences)
        if confidence_sum > 0:
            combined = np.sum(predictions * confidences) / confidence_sum
        else:
            # No usable weights: fall back to the unweighted mean.
            combined = np.mean(predictions)

        weighted_results.append({
            # The saved file uses 'Sentence ID' (with a space) as its ID header.
            'Sentence ID': sentence_id,
            # Built-in round() resolves exact .5 ties to the nearest even integer.
            'Prediction': round(combined),
            'Highest_Conf_Value': confidences.max(),
        })

    if not weighted_results:
        print("❌ ERROR: No valid results generated!")
        return None

    # Sort while the IDs still have their original dtype, then store them as text.
    result_df = pd.DataFrame(weighted_results).sort_values('Sentence ID')
    result_df['Sentence ID'] = result_df['Sentence ID'].astype(str)

    output_file = "Average Weighted Predictions_4models.csv"
    result_df.to_csv(output_file, index=False)

    print("\n✅ SUCCESS!")
    print(f"📁 File saved as: {output_file}")
    print(f"📊 Total predictions: {len(result_df)}")
    print(f"📋 Columns: {result_df.columns.tolist()}")
    print("\nFirst 10 predictions:")
    print(result_df.head(10))
    print("\nLast 10 predictions:")
    print(result_df.tail(10))

    print("\nPrediction statistics:")
    print(f"Min prediction: {result_df['Prediction'].min()}")
    print(f"Max prediction: {result_df['Prediction'].max()}")
    print(f"Mean prediction: {result_df['Prediction'].mean():.2f}")

    print("\nModel distribution in data:")
    print(all_predictions['Model'].value_counts())

    # Hand the table back so the caller's `result` is usable (previously always None).
    return result_df

# Run the four-model ensemble: the input CSVs are read by relative path and the
# combined predictions are written to "Average Weighted Predictions_4models.csv".
result = create_weighted_average_predictions()


Loading the four prediction files...
Columns in file 1: ['ID', 'Sentence', 'Predicted_Level', 'Raw_Prediction', 'Confidence_Score']
Columns in file 2: ['ID', 'Sentence', 'Predicted_Level', 'Confidence_Score', 'Model_Type', 'Prediction_Method']
Columns in file 3: ['ID', 'Sentence', 'Predicted_Level', 'Raw_Prediction', 'Confidence_Score']
Columns in file 4: ['ID', 'Sentence', 'Predicted_Level', 'Raw_Prediction', 'Confidence_Score']
File 1 shape before cleaning: (3420, 4)
File 1 shape after cleaning: (3420, 4)
File 2 shape before cleaning: (3420, 4)
File 2 shape after cleaning: (3420, 4)
File 3 shape before cleaning: (3420, 4)
File 3 shape after cleaning: (3420, 4)
File 4 shape before cleaning: (3420, 4)
File 4 shape after cleaning: (3420, 4)
Total records loaded: 13680
Unique sentence IDs: 3420

Sample data:
   Sentence_ID  Prediction  Confidence_Score      Model
0  10102950001           7          0.988350  AraBERTv2
1  10102950002           5          0.760760  AraBERTv2
2  10102950003

In [7]:
# Sanity-check a previously written ensemble output file: confirm it exists,
# then print its size, columns, first/last rows and summary statistics.
# Relies on `pd` (pandas) having been imported by an earlier cell.
# NOTE(review): the file name checked here differs from the names written by
# the generator cells visible in this notebook ("Average Weighted Predictions3.csv",
# "Average Weighted Predictions_5models.csv", "Average Weighted Predictions_4models.csv"),
# and those cells save only an ID column, 'Prediction' and 'Highest_Conf_Value'.
# The 'Weighted_Confidence' / 'Highest_Conf_Model' lookups below would raise
# KeyError on such a file - confirm which output this cell is meant to verify.
import os

# Check if the file exists (path is relative to the current working directory)
output_file = "Average Weighted Predictions.csv"
if os.path.exists(output_file):
    print(f"✅ File '{output_file}' created successfully!")
    
    # Read and display the file info
    verify_df = pd.read_csv(output_file)
    print(f"📊 File contains {len(verify_df)} predictions")
    print(f"📋 Columns: {verify_df.columns.tolist()}")
    
    # Show first and last few rows
    print(f"\n🔍 First 10 rows:")
    print(verify_df.head(10))
    
    print(f"\n🔍 Last 5 rows:")
    print(verify_df.tail(5))
    
    # Show statistics of the final (rounded) predictions
    print(f"\n📈 Prediction Statistics:")
    print(f"   Min prediction: {verify_df['Prediction'].min()}")
    print(f"   Max prediction: {verify_df['Prediction'].max()}")
    print(f"   Mean prediction: {verify_df['Prediction'].mean():.2f}")
    print(f"   Unique predictions: {verify_df['Prediction'].nunique()}")
    
    # Requires a 'Weighted_Confidence' column in the file
    print(f"\n📊 Confidence Statistics:")
    print(f"   Min weighted confidence: {verify_df['Weighted_Confidence'].min():.4f}")
    print(f"   Max weighted confidence: {verify_df['Weighted_Confidence'].max():.4f}")
    print(f"   Mean weighted confidence: {verify_df['Weighted_Confidence'].mean():.4f}")
    
    # Count how often each value of 'Highest_Conf_Model' occurs and report it
    # as a share of all rows.
    print(f"\n🏆 Model Performance Analysis:")
    model_counts = verify_df['Highest_Conf_Model'].value_counts()
    for model, count in model_counts.items():
        percentage = (count / len(verify_df)) * 100
        print(f"   {model}: {count} times most confident ({percentage:.1f}%)")
    
    # Check for any missing values
    print(f"\n🔍 Data Quality Check:")
    print(f"   Missing Sentence_ID: {verify_df['Sentence_ID'].isna().sum()}")
    print(f"   Missing Prediction: {verify_df['Prediction'].isna().sum()}")
    print(f"   Missing Weighted_Confidence: {verify_df['Weighted_Confidence'].isna().sum()}")
    
    # Show the three rows with the largest and the three with the smallest
    # 'Weighted_Confidence'
    print(f"\n📊 Confidence Level Examples:")
    high_conf = verify_df.nlargest(3, 'Weighted_Confidence')[['Sentence_ID', 'Prediction', 'Weighted_Confidence', 'Highest_Conf_Model']]
    print("Highest confidence predictions:")
    print(high_conf)
    
    low_conf = verify_df.nsmallest(3, 'Weighted_Confidence')[['Sentence_ID', 'Prediction', 'Weighted_Confidence', 'Highest_Conf_Model']]
    print("\nLowest confidence predictions:")
    print(low_conf)
    
else:
    # Nothing to inspect: the expected file is absent from the working directory.
    print(f"❌ File '{output_file}' was not created!")

✅ File 'Average Weighted Predictions.csv' created successfully!
📊 File contains 3420 predictions
📋 Columns: ['Sentence_ID', 'Prediction', 'Weighted_Confidence', 'Highest_Conf_Model', 'Highest_Conf_Value', 'Highest_Conf_Prediction', 'Num_Models']

🔍 First 10 rows:
   Sentence_ID  Prediction  Weighted_Confidence Highest_Conf_Model  \
0  10102950001           7             0.678624           MaraBERT   
1  10102950002           4             0.600945         AraELECTRA   
2  10102950003          11             0.698403         AraELECTRA   
3  10102950004           9             0.592355           MaraBERT   
4  10102950005           7             0.757594         AraELECTRA   
5  10102950006           6             0.561651         AraELECTRA   
6  10102950007           6             0.743168         AraELECTRA   
7  10102950008           6             0.675883         AraELECTRA   
8  10102950009           6             0.592698           MaraBERT   
9  10102950010           7          