In [50]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from itertools import product
from functools import reduce

In [51]:
def load_data(path):
    df = pd.read_parquet(path)
    df["labels"] = df["winner"].map({"model_a": 0, "model_b": 1})
    df = df[['id', 'logits', 'logits_tta', 'labels']].copy()
    df = df.sort_values(by='id').reset_index(drop=True)
    return df

In [52]:
def merge_model_dataframes(dfs):
    result = None
    for model_name, df in dfs.items():
        temp_df = df.copy()

        new_columns = {
            'logits': f'logits_{model_name}',
            'logits_tta': f'logits_tta_{model_name}',
            'labels': f'labels_{model_name}'  # Including labels if needed
        }
        
        temp_df = temp_df.rename(columns=new_columns)
        
        if result is None:
            result = temp_df
        else:
            # Merge on 'id' column
            result = pd.merge(result, temp_df, on='id', how='outer')

    # After all merges are done, keep only one labels column and rename it
    labels_columns = [col for col in result.columns if col.startswith('labels_')]
    if labels_columns:
        # Keep the first labels column and rename it
        result = result.rename(columns={labels_columns[0]: 'labels'})
        # Drop all other labels columns
        result = result.drop(columns=labels_columns[1:])    
    
    return result

In [53]:
# -> weight for logits and (1 - weight for logits_tta)

# - (0.7179169249845009, 0.6891919818144244, 0.6931184128952262) -> 0.5
# - (0.7154370737755734, 0.6898119446166563, 0.6949783013019218) -> 0.6
# - (0.7164703451126265, 0.6896052903492457, 0.6978714610456705) -> 0.4
# - (0.7141971481711097, 0.689398636081835, 0.7013845835916511) -> 0.3
# - (0.712957222566646, 0.69105187022112, 0.6931184128952262) -> 0.7

# (0.7057243232072742, 0.6881587104773713, 0.6846455879313907) -> without tta

In [54]:
data_dir = Path('../data/pseudo_labeled_final_data/eval_data')

In [55]:
dfs = {
    'gemma9b': load_data(data_dir / "eval_data_gemma2_9b_stage1_init_stage0_combined.parquet"),
    'llama': load_data(data_dir / "eval_data_deepseek_llama3.18B_stage1_init_stage0_combined.parquet"),
    'qwen': load_data(data_dir / "eval_data_qwen_2.5_7B_stage1_init_stage0_combined.parquet"),
    'gemma27b': load_data(data_dir / "eval_data_gemma2_27B_stage1_lora_707_combined.parquet"),
    'phi4': load_data(data_dir / "eval_data_phi4_stage1_lora_700_combined.parquet")
}

In [56]:
for model_name, d in dfs.items():
    logits = 0.5 * np.array(d["logits"].tolist()) + 0.5 * np.array(d["logits_tta"].tolist())
    print(model_name, accuracy_score(d["labels"], np.argmax(logits, axis=1)))

gemma9b 0.7179169249845009
llama 0.6891919818144244
qwen 0.6933250671626369
gemma27b 0.7131638768340566
phi4 0.703657780533168


In [57]:
renamed_dfs = {}
for model_name, df in dfs.items():
    # Create a copy and rename all columns except the ID column (assuming there's an ID column)
    temp_df = df.copy()
    # Get columns that need to be renamed (exclude 'id' if it exists)
    cols_to_rename = [col for col in temp_df.columns if col != 'id']
    # Create rename mapping
    rename_dict = {col: f"{col}_{model_name}" for col in cols_to_rename}
    temp_df = temp_df.rename(columns=rename_dict)
    renamed_dfs[model_name] = temp_df


merged_df = reduce(lambda left, right: pd.merge(left, right, on='id', how='outer'), 
                  renamed_dfs.values())

columns_to_drop = [
    'labels_llama',
    'labels_qwen',
    'labels_gemma27b',
    'labels_phi4'
]
merged_df = merged_df.drop(columns=columns_to_drop)
merged_df = merged_df.rename(columns={'labels_gemma9b': 'labels'})
merged_df

Unnamed: 0,id,logits_gemma9b,logits_tta_gemma9b,labels,logits_llama,logits_tta_llama,logits_qwen,logits_tta_qwen,logits_gemma27b,logits_tta_gemma27b,logits_phi4,logits_tta_phi4
0,00010ed04b536f56ebe43eef1100c13906abea12bf9855...,"[1.390625, -0.46875]","[0.68359375, -0.1435546875]",0,"[1.0078125, -0.453125]","[0.39453125, -0.024658203125]","[0.84765625, -1.5390625]","[0.380859375, -0.7109375]","[-0.08837890625, -2.640625]","[-0.98828125, -1.515625]","[1.265625, -0.55859375]","[0.76171875, -0.50390625]"
1,00172aab8af10cc0648041c94a41eeab7d9caaea7717a3...,"[0.625, 0.1416015625]","[0.419921875, 0.228515625]",1,"[0.3515625, -0.055908203125]","[0.4296875, -0.1298828125]","[-0.134765625, -0.333984375]","[-0.58984375, 0.040771484375]","[-0.9140625, -1.515625]","[-1.5859375, -0.91796875]","[0.142578125, 0.1728515625]","[-0.1865234375, 0.58984375]"
2,00314ee979ffc9e4e4dd3716d02c401ba117d00640a3e1...,"[1.2265625, -0.5]","[1.5859375, -1.078125]",0,"[1.234375, -0.72265625]","[1.328125, -0.98046875]","[0.609375, -1.25]","[0.7734375, -1.0234375]","[0.023681640625, -2.28125]","[-0.181640625, -1.8125]","[1.2265625, -0.494140625]","[1.109375, -1.0]"
3,003aa0a76eb58c06adce6e9db59ad1da73929a431f3f23...,"[0.484375, 0.1904296875]","[0.32421875, 0.34375]",0,"[0.26171875, 0.046630859375]","[0.076171875, 0.298828125]","[-0.1240234375, -0.345703125]","[-0.515625, -0.003753662109375]","[-0.90234375, -1.546875]","[-1.6171875, -0.91796875]","[0.24609375, 0.1337890625]","[0.060791015625, 0.345703125]"
4,00448e4160ceb9578584293b6aee5d680c8e6dbbcd1d13...,"[0.75390625, -0.0167236328125]","[0.5859375, 0.09228515625]",0,"[0.15625, 0.12451171875]","[0.291015625, 0.06005859375]","[0.061279296875, -0.5703125]","[-0.06201171875, -0.37890625]","[-0.6015625, -2.078125]","[-1.34375, -1.2109375]","[0.66015625, -0.2421875]","[0.546875, -0.185546875]"
...,...,...,...,...,...,...,...,...,...,...,...,...
4834,ffc2a1a209a1053658645fc60dd29249392c7e0040ae5d...,"[-1.7890625, 2.3125]","[-1.34375, 2.0625]",1,"[-1.859375, 2.109375]","[-1.125, 1.8359375]","[-1.4921875, 1.4296875]","[-2.140625, 1.3828125]","[-2.796875, 0.396484375]","[-3.109375, 0.337890625]","[-1.7890625, 1.875]","[-1.109375, 1.6953125]"
4835,ffd929efef61e5b353180a6e790d35896363ecf02973ee...,"[0.921875, -0.2080078125]","[0.5625, -0.00811767578125]",0,"[0.87109375, -0.443359375]","[0.68359375, -0.365234375]","[0.28125, -0.8671875]","[-0.076171875, -0.361328125]","[-0.78125, -1.703125]","[-1.34375, -0.9921875]","[0.0927734375, 0.30078125]","[0.796875, -0.482421875]"
4836,ffe2645c0cdb2cbbb755ad0766cfc14663726619968c4e...,"[0.00250244140625, 0.490234375]","[-0.06787109375, 0.859375]",1,"[-0.0042724609375, 0.375]","[0.05322265625, 0.271484375]","[-0.359375, -0.09716796875]","[-0.7734375, 0.212890625]","[-1.2421875, -1.265625]","[-1.71875, -0.86328125]","[-0.369140625, 0.68359375]","[0.197265625, 0.10400390625]"
4837,ffe2a8b3cf149dc4ffd040623f4c8e4e9e87b98bd41b14...,"[0.0025177001953125, 0.60546875]","[-0.039306640625, 0.75]",0,"[-0.31640625, 0.60546875]","[-0.09765625, 0.486328125]","[-0.546875, 0.171875]","[-0.984375, 0.39453125]","[-1.2578125, -1.0625]","[-1.9609375, -0.6875]","[-0.072265625, 0.4140625]","[-0.103515625, 0.56640625]"


In [58]:
model_logits = {
    'gemma9b': 0.5 * np.array(merged_df["logits_gemma9b"].tolist()) + 0.5 * np.array(merged_df["logits_tta_gemma9b"].tolist()),
    'llama': 0.5 * np.array(merged_df["logits_llama"].tolist()) + 0.5 * np.array(merged_df["logits_tta_llama"].tolist()),
    'qwen': 0.5 * np.array(merged_df["logits_qwen"].tolist()) + 0.5 * np.array(merged_df["logits_tta_qwen"].tolist()),
    'gemma27b': 0.5 * np.array(merged_df["logits_gemma27b"].tolist()) + 0.5 * np.array(merged_df["logits_tta_gemma27b"].tolist()),
    'phi4': 0.5 * np.array(merged_df["logits_phi4"].tolist()) + 0.5 * np.array(merged_df["logits_tta_phi4"].tolist())
}
true_labels = np.array(merged_df["labels"].values)

In [59]:
for m, l in model_logits.items():
    acc = accuracy_score(true_labels, l.argmax(-1))
    print(f"{m}: {acc}")

gemma9b: 0.7179169249845009
llama: 0.6891919818144244
qwen: 0.6933250671626369
gemma27b: 0.7131638768340566
phi4: 0.703657780533168


In [60]:
import numpy as np
from itertools import combinations
from typing import List, Tuple, Dict

class EnsembleOptimizer:
    def __init__(
        self,
        model_logits: Dict[str, np.ndarray],
        true_labels: np.ndarray,
        max_models: int = 6,
        weight_range: Tuple[float, float] = (0.0, 1.0),
        weight_steps: int = 10
    ):
        """
        Initialize the ensemble optimizer.
        
        Args:
            model_logits: Dictionary of model names to logits arrays
            true_labels: Ground truth labels
            max_models: Maximum number of models to include in ensemble
            weight_range: Range of weights to try
            weight_steps: Number of weight values to try in the range
        """
        self.model_logits = model_logits
        self.true_labels = true_labels
        self.max_models = max_models
        self.weight_range = weight_range
        self.weight_steps = weight_steps
        
    def compute_accuracy(self, combined_logits: np.ndarray) -> float:
        """Compute accuracy from combined logits"""
        predictions = (combined_logits[:, 1] > combined_logits[:, 0]).astype(int)
        return np.mean(predictions == self.true_labels)
    
    def combine_logits(self, models: List[str], weights: List[float]) -> np.ndarray:
        """Combine logits from multiple models using given weights"""
        combined = np.zeros_like(self.model_logits[models[0]])
        for model, weight in zip(models, weights):
            combined += weight * self.model_logits[model]
        return combined
    
    def optimize_weights(self, current_models: List[str], new_model: str) -> Tuple[List[float], float]:
        """Find optimal weights for combining current models with a new model"""
        n_models = len(current_models) + 1
        best_accuracy = 0
        best_weights = None
        
        # Generate weight combinations
        weight_values = np.linspace(self.weight_range[0], self.weight_range[1], self.weight_steps)
        
        for weights in combinations(weight_values, n_models):
            # Normalize weights to sum to 1
            weights = np.array(weights) / sum(weights)
            
            # Combine logits
            combined = self.combine_logits(current_models + [new_model], weights)
            accuracy = self.compute_accuracy(combined)
            
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_weights = weights
                
        return best_weights, best_accuracy
    
    def hill_climb(self) -> Tuple[List[str], List[float], float]:
        """
        Perform hill climbing to find optimal model combination.
        
        Returns:
            Tuple of (best models, best weights, best accuracy)
        """
        # Start with best single model
        best_models = []
        best_weights = []
        best_accuracy = 0
        
        for model in self.model_logits.keys():
            accuracy = self.compute_accuracy(self.model_logits[model])
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_models = [model]
                best_weights = [1.0]
        
        # Iteratively add models
        available_models = set(self.model_logits.keys()) - set(best_models)
        
        while len(best_models) < self.max_models and available_models:
            improved = False
            best_new_accuracy = best_accuracy
            best_new_model = None
            best_new_weights = None
            
            # Try adding each available model
            for model in available_models:
                weights, accuracy = self.optimize_weights(best_models, model)
                
                if accuracy > best_new_accuracy:
                    improved = True
                    best_new_accuracy = accuracy
                    best_new_model = model
                    best_new_weights = weights
            
            if not improved:
                break
                
            # Update best ensemble
            best_accuracy = best_new_accuracy
            best_models.append(best_new_model)
            best_weights = best_new_weights
            available_models.remove(best_new_model)
            
        return best_models, best_weights, best_accuracy

# Example usage:
def optimize_ensemble(all_logits: Dict[str, np.ndarray], true_labels: np.ndarray) -> Tuple[List[str], List[float], float]:
    """
    Optimize ensemble weights using hill climbing.
    
    Args:
        all_logits: Dictionary mapping model names to their logit outputs
        true_labels: Ground truth labels
        
    Returns:
        Tuple of (best model names, best weights, best accuracy)
    """
    optimizer = EnsembleOptimizer(
        model_logits=all_logits,
        true_labels=true_labels,
        max_models=10,  # Maximum models to include
        weight_range=(0.0, 1.0),  # Range of weights to try
        weight_steps=10  # Number of weight values to try
    )
    
    return optimizer.hill_climb()



def brute_force_weights(model_logits, true_labels, step_size=0.1):
    """
    Find optimal weights for ensemble models using grid search.
    """
    # Get model names
    models = list(model_logits.keys())
    
    # Generate weight combinations that sum to 1
    weights = np.arange(0, 1 + step_size, step_size)
    combinations = []
    
    # Generate all possible weight combinations
    for w in product(weights, repeat=len(models)):
        if abs(sum(w) - 1.0) < 1e-10:  # Check if weights sum to 1
            combinations.append(w)
    
    best_accuracy = 0
    best_weights = None
    
    # Try each weight combination
    for weights in combinations:
        # Compute weighted ensemble logits
        ensemble_logits = sum(w * model_logits[m] for w, m in zip(weights, models))
        
        # Get predictions and compute accuracy
        predictions = np.argmax(ensemble_logits, axis=-1)
        accuracy = accuracy_score(true_labels, predictions)
        
        # Update best weights if accuracy improves
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_weights = dict(zip(models, weights))
            
    return best_weights, best_accuracy

In [115]:
model_logits = {
    'gemma9b': (np.array(merged_df["logits_gemma9b"].to_list()) + np.array(merged_df["logits_tta_gemma9b"].to_list())) / 2,
    'llama': (np.array(merged_df["logits_llama"].to_list()) + np.array(merged_df["logits_tta_llama"].to_list())) / 2,
    'qwen': (np.array(merged_df["logits_qwen"].to_list()) + np.array(merged_df["logits_tta_qwen"].to_list())) / 2,
    'gemma27b': (np.array(merged_df["logits_gemma27b"].to_list()) + np.array(merged_df["logits_tta_gemma27b"].to_list())) / 2,
    'phi4': (np.array(merged_df["logits_phi4"].to_list()) + np.array(merged_df["logits_tta_phi4"].to_list())) / 2,
}
true_labels = np.array(merged_df["labels"].values)

In [116]:
for model, logits in model_logits.items():
    acc = accuracy_score(true_labels, np.argmax(logits, axis=-1))
    print(f"Model {model} accuracy: {acc}")

Model gemma9b accuracy: 0.7179169249845009
Model llama accuracy: 0.6891919818144244
Model qwen accuracy: 0.6933250671626369
Model gemma27b accuracy: 0.7131638768340566
Model phi4 accuracy: 0.703657780533168


In [117]:
optimize_ensemble(model_logits, true_labels)

(['gemma9b', 'gemma27b'], array([0.45454545, 0.54545455]), 0.7201901219260177)

In [118]:
best_weights, best_accuracy = brute_force_weights(model_logits, true_labels, step_size=0.05)
best_weights, round(best_accuracy, 4)

({'gemma9b': 0.55, 'llama': 0.0, 'qwen': 0.0, 'gemma27b': 0.2, 'phi4': 0.25},
 0.7237)

In [None]:
# ({'gemma9b': 0.4, 'llama': 0.0, 'qwen': 0.2, 'gemma27b': 0.4, 'phi4': 0.0} -> step_size = 0.1
# ({'gemma9b': 0.55, 'llama': 0.0, 'qwen': 0.0, 'gemma27b': 0.2, 'phi4': 0.25} -> step_size = 0.05

In [108]:
best_weights = {'gemma9b': 0.55, 'gemma27b': 0.2, 'phi4': 0.25} # score: 0.7237
# best_weights = {'gemma9b': 0.333, 'gemma27b': 0.33333, 'phi4': 0.33333} # average: 0.7179

In [110]:
ensemble_logits = 0
for model, weight in best_weights.items():
    if weight > 0:
        ensemble_logits += model_logits[model] * weight
acc = round(accuracy_score(true_labels, np.argmax(ensemble_logits, axis=-1)), 4)
acc

0.7237

In [153]:
# ddd = pd.read_parquet("../data/pseudo_labeled_final_data/pl_gemma2_27B_stage1_lora_707_merged_combined.parquet")
ddd = pd.read_parquet("../data/pseudo_labeled_final_data/pl_phi4_stage1_lora_700_merged.parquet")
ddd = ddd[ddd['split'] == 'train']
ddd = ddd.dropna(subset=['winner'])
ddd['labels'] = ddd['winner'].map({'model_a': 0, 'model_b': 1})
ddd

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language,split,source,logits,logits_tta,labels
0,41b149ae0c7e8c40d72b26312d4e40a32b903d83e5432c...,شوند. سناریوی زیر را در نظر 􀍬 پردازنده اجرا م ...,بیایید سناریوی مطرح شده را به تفصیل تحلیل کنیم...,برای تحلیل سناریوی ارائه شده در مورد رویدادهای...,model_a,gemini-1.5-flash-002,gpt-4o-mini-2024-07-18,Persian,train,current_comp,"[0.55859375, -0.1015625]","[0.67578125, -0.345703125]",0
1,72551638,using UnityEngine;\nusing System.Text;\n\npubl...,Here are a few ways to improve the console com...,This script appears to be a simple in-game con...,model_b,claude-2.1,gpt-4-1106-preview,English,train,prev_comp,"[0.03662109375, 0.392578125]","[-0.125, 0.640625]",1
2,24d45b90e7dd908f35f9af57a264aee2e9982d14960104...,"A point mass P, mass = 6 * 10^24kg, is fixed a...","To solve the problem, we'll analyze the motion...","To solve this problem, we need to calculate th...",model_a,o1-mini,o1-preview,English,train,current_comp,"[0.54296875, -0.051025390625]","[0.4609375, -0.08984375]",0
3,097c490a923a5987947089da7f4ae79ceaedc598d14941...,Упрости текст для легкого запоминания : Античн...,### Античная философия и медицина\n\nНатурфило...,Antičная философия и медицина - это две민Startu...,model_b,mistral-large-2407,llama-3.2-1b-instruct,Russian,train,current_comp,"[2.125, -1.4296875]","[1.765625, -1.75]",1
4,5184bb73881a2fc67211b6090dc50b99a9dfa2e9782bdc...,Подготовить программу наставничества . 1. Инди...,**Программа Наставничества**\n\n**Цель:** Сокр...,### Программа наставничества\n\n#### Цель прог...,model_b,llama-3.1-8b-instruct,yi-lightning,Russian,train,current_comp,"[-0.447265625, 0.75]","[-0.2177734375, 0.73828125]",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,3274457708,"dog -> puppy, animal, canine\nocean ->","marine, aquatic, sea, sea creature","dog -> puppy, animal, canine\nocean -> ocean a...",model_a,mpt-7b-chat,dolly-v2-12b,English,train,prev_comp,"[2.296875, -1.671875]","[2.28125, -2.359375]",0
991,b9b6c538eb1f4357a81049ac22c5e3ad,what's 3 times 20100?,"3 times 20100 is 60,300.",3 times 20100 is equal to 60200.,model_a,gpt-3.5-turbo,chatglm-6b,English,train,lmsys_33k,"[1.7578125, -1.0703125]","[1.0703125, -0.98828125]",0
997,27c0475986a9af98659979ef6be5c1b2980aba11c37565...,Who you one short sentence,I am an AI assistant.,I am an AI assistant here to help you.,model_b,claude-3-haiku-20240307,qwen-plus-0828,English,train,current_comp,"[-0.1787109375, 0.41796875]","[0.07080078125, 0.318359375]",1
998,3765590473,hey!,Hey there! How's it going?,Hey there! How's it going?,model_a,llama-2-13b-chat,codellama-34b-instruct,English,train,prev_comp,"[0.26171875, 0.103515625]","[0.10009765625, 0.26171875]",0


In [154]:
logits = np.array(ddd['logits'].to_list())
logits_tta  = np.array(ddd['logits_tta'].to_list())
l = (logits + logits_tta) / 2
acc = accuracy_score(ddd['labels'], l.argmax(-1))
acc

0.7954971857410882

In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet("../data/pseudo_labeled_final_data/oof_preds.parquet")
df

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language,split,source,labels,prob_0,prob_1,prediction
0,00010ed04b536f56ebe43eef1100c13906abea12bf9855...,You will be given a piece of news. Analyze it ...,Let's break down the news and analyze it accor...,"```json\n{\n ""contains_orgs"": true,\n ""orgs""...",model_a,gemma-2-27b-it,gemini-1.5-flash-002,Russian,valid,current_comp,0,0.865793,0.134208,0
1,00172aab8af10cc0648041c94a41eeab7d9caaea7717a3...,kısaca dopamin detoksu,"Dopamin detoksu, beyninizin dopamin seviyeleri...","Dopamin detoksunun amacı, aşırı uyarılmaya ned...",model_b,grok-2-2024-08-13,gemini-1.5-pro-002,Finnish,valid,current_comp,1,0.679179,0.320821,0
2,00314ee979ffc9e4e4dd3716d02c401ba117d00640a3e1...,"In Webgpu/WGSL, what are override variables?",In **WebGPU** using **WGSL** (WebGPU Shading L...,"As of my last update in 2023, WebGPU and WGSL ...",model_a,chatgpt-4o-latest-20240903,gpt-4-0125-preview,English,valid,current_comp,0,0.540924,0.459076,0
3,003aa0a76eb58c06adce6e9db59ad1da73929a431f3f23...,"nêu lý do chọn đây là đề tài nghiên cứu ""Đánh...",Dưới đây là một số lý do có thể được sử dụng đ...,"Chọn đề tài ""Đánh giá tác động của Social Medi...",model_a,llama-3.1-405b-instruct-bf16,reka-core-20240904,Vietnamese,valid,current_comp,0,0.418697,0.581303,1
4,00448e4160ceb9578584293b6aee5d680c8e6dbbcd1d13...,package ex3_Excercise;\n//버블정렬\npublic class T...,```java\npackage ex3_Excercise;\n\npublic clas...,```java\npackage ex3_Excercise;\n\npublic clas...,model_a,gemma-2-9b-it,command-r-08-2024,English,valid,current_comp,0,0.375706,0.624294,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4834,ffc2a1a209a1053658645fc60dd29249392c7e0040ae5d...,Are there decent AI therapy options yet? If so...,"As an AI, I cannot provide medical advice, and...",The fascinating intersection of AI and mental ...,model_b,gemma-2-9b-it,llama-3.1-nemotron-70b-instruct,English,valid,current_comp,1,0.933869,0.066131,0
4835,ffd929efef61e5b353180a6e790d35896363ecf02973ee...,Imagine you have to get two goats and a cabbag...,To solve the problem of getting two goats and ...,"To solve this classic problem, we'll break it ...",model_a,o1-mini,llama-3.1-405b-instruct-bf16,English,valid,current_comp,0,0.513668,0.486332,0
4836,ffe2645c0cdb2cbbb755ad0766cfc14663726619968c4e...,"翻译一下这句话：Builder Journey Experience, 优化开发者在网站及各...",Builder Journey Experience optimizes developer...,Here is a translation of the sentence:\n\n**Bu...,model_b,glm-4-plus,llama-3.1-nemotron-70b-instruct,Chinese,valid,current_comp,1,0.028114,0.971886,1
4837,ffe2a8b3cf149dc4ffd040623f4c8e4e9e87b98bd41b14...,Какое программное обеспечение развернуто на Уп...,На управляющих узлах OpenShift развернуто неск...,На Управляющих узлах (Master Nodes) OpenShift ...,model_a,gemini-1.5-flash-8b-001,yi-lightning,unknown,valid,current_comp,0,0.993996,0.006004,0


In [8]:
prediction = df['prediction'].values
labels = df['labels'].values
acc = accuracy_score(labels, prediction)
acc

0.49121719363504857