In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer
import torch


In [None]:

# Location of the training split on the mounted Google Drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/better-output/train.csv'

# Read the training rows and preview the first few of them.
data = pd.read_csv(TRAIN_CSV_PATH)
data.head()


Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner
0,29468,RWKV-4-Raven-14B,gpt4all-13b-snoozy,what is the best way to determine the number o...,"The number of neurons, input neurons, and outp...",The best way to determine the number of neuron...,model_a
1,51285,zephyr-7b-beta,llama-2-70b-chat,\\nAssistant: Hello!\\nUser: Hello? Whats goin...,\\nHow may I assist you today? Whether it's an...,"Sure, I'd be happy to help! Is there something...",model_b
2,1552,vicuna-13b,oasst-pythia-12b,explain why all startups of the world will fail,"It is unlikely that all startups will fail, as...","There are many reasons why startups fail, and ...",model_a
3,28023,mpt-7b-chat,vicuna-7b,Why is the sun orange?,The sun appears to be orange because the light...,The sun appears orange because of the way that...,model_b
4,67406,gpt-4-1106-preview,gpt-3.5-turbo-0613,"tell me about thalassery,who are some famous r...","Thalassery, also known as Tellicherry, is a to...",Thalassery is a coastal town located in the Ka...,model_a


In [None]:
import re

def clean_text(text):
    """
    Normalize the whitespace of a piece of text.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space and leading/trailing whitespace is stripped. No other
    characters are removed.

    Args:
        text: The value to clean. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The whitespace-normalized text.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'\s+', ' ', text).strip()


# Normalize the three text columns in place.
# Missing cells are replaced with '' *before* the string cast: `astype(str)`
# on its own turns NaN into the literal text "nan", which would then be
# treated as real content by everything downstream.
for col in ['prompt', 'response_a', 'response_b']:
    data[col] = data[col].fillna('').astype(str).apply(clean_text)

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two numpy vectors.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or ``0.0`` when either
        vector has zero norm (the ratio is undefined in that case).
    """
    # Each norm is needed for both the zero check and the denominator,
    # so compute it once.
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(vec1, vec2) / (norm1 * norm2)


In [None]:

# Sentence-embedding model used by get_embedding().
embedder = SentenceTransformer('all-MiniLM-L6-v2')


def get_embedding(text):
    """Encode a single text with ``embedder`` and return its embedding."""
    # The encoder is handed a one-element batch, so unwrap the only row.
    batch = embedder.encode([text], convert_to_numpy=True)
    vector = batch[0]
    return vector


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:

# Integer class id for each possible value of the `winner` column.
label_mapping = {'model_a': 0, 'model_b': 1, 'tie': 2}

# Encode each text column with a single batched call instead of three
# one-text calls per row; the model processes whole batches at once, which is
# far faster than looping over rows with iterrows().
encode_kwargs = dict(batch_size=64, convert_to_numpy=True, show_progress_bar=True)

emb_prompt = embedder.encode(data['prompt'].tolist(), **encode_kwargs)

emb_resp_a = embedder.encode(data['response_a'].tolist(), **encode_kwargs)
sims_a = [cosine_similarity(p, r) for p, r in zip(emb_prompt, emb_resp_a)]
del emb_resp_a  # release the matrix before encoding the next column

emb_resp_b = embedder.encode(data['response_b'].tolist(), **encode_kwargs)
sims_b = [cosine_similarity(p, r) for p, r in zip(emb_prompt, emb_resp_b)]
del emb_resp_b, emb_prompt

# One [sim_a, sim_b] pair per row, in the original row order.
features = [[sim_a, sim_b] for sim_a, sim_b in zip(sims_a, sims_b)]

# Direct indexing (not .get) so an unexpected `winner` value raises KeyError
# instead of silently producing a bad label.
labels = [label_mapping[winner] for winner in data['winner']]


X = np.array(features)
y = np.array(labels)

print("Feature shape:", X.shape)
print("Labels distribution:", np.unique(y, return_counts=True))


Feature shape: (56291, 2)
Labels distribution: (array([0, 1, 2]), array([19660, 19391, 17240]))


In [None]:

# Attach the two similarity features as columns; each entry of `features`
# is a [sim_a, sim_b] pair.
for position, column in enumerate(('sim_a', 'sim_b')):
    data[column] = [pair[position] for pair in features]

# Attach the integer-encoded target.
data['winner_label'] = labels

# Persist the enriched frame (without the index) so later cells can reload it.
FEATURES_CSV = 'data_with_features.csv'
data.to_csv(FEATURES_CSV, index=False)
print("CSV file 'data_with_features.csv' saved successfully!")


CSV file 'data_with_features.csv' saved successfully!


In [None]:
# Offer the feature CSV as a browser download when running inside Colab.
# The import is guarded so the notebook still runs top to bottom in
# environments where `google.colab` is not installed.
try:
    from google.colab import files
    files.download('data_with_features.csv')
except ImportError:
    print("Not running in Google Colab.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Reload the frame that was written out together with the similarity features.
data = pd.read_csv('data_with_features.csv')

# Show the first rows as plain text.
preview = data.head()
print(preview)


      id             model_a             model_b  \
0  29468    RWKV-4-Raven-14B  gpt4all-13b-snoozy   
1  51285      zephyr-7b-beta    llama-2-70b-chat   
2   1552          vicuna-13b    oasst-pythia-12b   
3  28023         mpt-7b-chat           vicuna-7b   
4  67406  gpt-4-1106-preview  gpt-3.5-turbo-0613   

                                              prompt  \
0  what is the best way to determine the number o...   
1  \\nAssistant: Hello!\\nUser: Hello? Whats goin...   
2    explain why all startups of the world will fail   
3                             Why is the sun orange?   
4  tell me about thalassery,who are some famous r...   

                                          response_a  \
0  The number of neurons, input neurons, and outp...   
1  \\nHow may I assist you today? Whether it's an...   
2  It is unlikely that all startups will fail, as...   
3  The sun appears to be orange because the light...   
4  Thalassery, also known as Tellicherry, is a to...   

             

In [None]:

text_columns = ['prompt', 'response_a', 'response_b']

# Replace missing text with empty strings in all three columns at once.
data[text_columns] = data[text_columns].fillna('')

# Sanity check: number of nulls left in each text column.
remaining_nulls = data[text_columns].isnull().sum()
print(remaining_nulls)


prompt        0
response_a    0
response_b    0
dtype: int64


In [None]:
def extract_additional_features(X):
    """Build length-based features from the three text columns.

    Args:
        X: DataFrame with string columns 'prompt', 'response_a' and
            'response_b'.

    Returns:
        2-D array with one row per input row and six columns, in order:
        prompt word count, response_a word count, response_b word count,
        response_a character count, response_b character count, and
        response_a word count minus response_b word count.
    """
    def count_words(text):
        # Whitespace-delimited token count.
        return len(text.split())

    words_a = X['response_a'].map(count_words)
    words_b = X['response_b'].map(count_words)

    table = pd.DataFrame({
        'prompt_word_count': X['prompt'].map(count_words),
        'response_a_word_count': words_a,
        'response_b_word_count': words_b,
        'response_a_char_count': X['response_a'].map(len),
        'response_b_char_count': X['response_b'].map(len),
        'word_count_diff': words_a - words_b,
    })
    return table.values


In [None]:
from sklearn.preprocessing import FunctionTransformer


def combine_text_columns(X):
    """Join prompt, response_a and response_b into one space-separated Series."""
    prompt, resp_a, resp_b = (X[name] for name in ('prompt', 'response_a', 'response_b'))
    combined = prompt
    for piece in (resp_a, resp_b):
        combined = combined + " " + piece
    return combined

# Wrap the column-joining helper so it can be used as a pipeline step.
combine_text_transformer = FunctionTransformer(combine_text_columns, validate=False)

# Text branch: join the three text columns, then TF-IDF capped at 1000 features.
text_pipeline = Pipeline(steps=[
    ('combine_text', combine_text_transformer),
    ('tfidf', TfidfVectorizer(max_features=1000)),
])

# Categorical branch: one-hot encode, ignoring categories unseen during fit.
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: standardize the similarity scores.
numeric_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# Extra branch: derive length features from the text, then standardize them.
additional_features_pipeline = Pipeline(steps=[
    ('extract', FunctionTransformer(extract_additional_features, validate=False)),
    ('scaler', StandardScaler()),
])

# (name, transformer, input columns) for every branch of the preprocessor.
branches = [
    ('text', text_pipeline, ['prompt', 'response_a', 'response_b']),
    ('cat', categorical_pipeline, ['model_a', 'model_b']),
    ('num', numeric_pipeline, ['sim_a', 'sim_b']),
    ('add', additional_features_pipeline, ['prompt', 'response_a', 'response_b']),
]
preprocessor = ColumnTransformer(transformers=branches)


In [None]:

# Model inputs: raw text, the two model names, and the similarity scores.
feature_columns = [
    'prompt', 'response_a', 'response_b',
    'model_a', 'model_b',
    'sim_a', 'sim_b',
]
X = data[feature_columns]
y = data['winner_label']

# 80/20 split, stratified on the label, with a fixed seed.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split

for title, frame in (("Training set shape:", X_train), ("Validation set shape:", X_val)):
    print(title, frame.shape)


Training set shape: (45032, 7)
Validation set shape: (11259, 7)


In [None]:

# Full model: shared preprocessing followed by a LightGBM classifier.
classifier = lgb.LGBMClassifier(random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', classifier),
])

# Hyperparameters to search; the `clf__` prefix targets the classifier step.
param_grid = {
    'clf__num_leaves': [31, 50],
    'clf__learning_rate': [0.1, 0.01],
    'clf__n_estimators': [100, 200],
}

# 3-fold cross-validated search scored by accuracy.
search_settings = dict(cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search = GridSearchCV(pipeline, param_grid, **search_settings)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_cv_accuracy = grid_search.best_score_
print("Best parameters found:", best_params)
print("Best cross-validation accuracy: {:.4f}".format(best_cv_accuracy))


Fitting 3 folds for each of 8 candidates, totalling 24 fits




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.573987 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250866
[LightGBM] [Info] Number of data points in the train set: 45032, number of used features: 1144
[LightGBM] [Info] Start training from score -1.051931
[LightGBM] [Info] Start training from score -1.065759
[LightGBM] [Info] Start training from score -1.183285
Best parameters found: {'clf__learning_rate': 0.1, 'clf__n_estimators': 100, 'clf__num_leaves': 50}
Best cross-validation accuracy: 0.5141


In [None]:

# Predict the held-out validation rows with the fitted search object.
y_pred = grid_search.predict(X_val)

# Summary metrics on the validation split.
accuracy, report, cm = (
    metric(y_val, y_pred)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print("Validation Accuracy after tuning: {:.4f}".format(accuracy))
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", cm)




Validation Accuracy after tuning: 0.5133

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.60      0.56      3932
           1       0.53      0.58      0.55      3879
           2       0.46      0.34      0.39      3448

    accuracy                           0.51     11259
   macro avg       0.51      0.51      0.50     11259
weighted avg       0.51      0.51      0.51     11259


Confusion Matrix:
 [[2350  890  692]
 [ 947 2247  685]
 [1135 1131 1182]]


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer


# Sentence-embedding model used by get_embedding().
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(text):
    """Encode ``text`` with ``embedder`` and return the result."""
    vector = embedder.encode(text)
    return vector

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two numpy vectors.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or ``0.0`` when either
        vector has zero norm (the ratio is undefined in that case).
    """
    # Each norm is needed for both the zero check and the denominator,
    # so compute it once.
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(vec1, vec2) / (norm1 * norm2)


test_data = pd.read_csv('test.csv')

# Normalize the text the same way the training text was normalized before it
# was embedded and saved: missing -> '', runs of whitespace collapsed to one
# space, ends stripped. Done inline so this cell does not depend on a helper
# defined in another cell. Without this the model would see differently
# formatted text at prediction time than it was trained on.
for col in ['prompt', 'response_a', 'response_b']:
    test_data[col] = (
        test_data[col]
        .fillna('')
        .astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

# Compute the similarity features only when the file does not already have them.
if 'sim_a' not in test_data.columns or 'sim_b' not in test_data.columns:
    # One batched encode call per column instead of three one-text calls per
    # row; the encoder shows its own progress bar.
    encode_kwargs = dict(batch_size=64, convert_to_numpy=True, show_progress_bar=True)

    emb_prompt = embedder.encode(test_data['prompt'].tolist(), **encode_kwargs)

    emb_resp_a = embedder.encode(test_data['response_a'].tolist(), **encode_kwargs)
    test_data['sim_a'] = [cosine_similarity(p, r) for p, r in zip(emb_prompt, emb_resp_a)]
    del emb_resp_a  # release the matrix before encoding the next column

    emb_resp_b = embedder.encode(test_data['response_b'].tolist(), **encode_kwargs)
    test_data['sim_b'] = [cosine_similarity(p, r) for p, r in zip(emb_prompt, emb_resp_b)]
    del emb_resp_b, emb_prompt

# Same input columns, in the same order, as were used for training.
X_test = test_data[['prompt', 'response_a', 'response_b', 'model_a', 'model_b', 'sim_a', 'sim_b']]

# Predict with the best pipeline found by the grid search.
y_test_pred = grid_search.best_estimator_.predict(X_test)

# Translate integer class ids back to the original label strings.
inv_label_mapping = {0: 'model_a', 1: 'model_b', 2: 'tie'}
test_data['winner_prediction'] = [inv_label_mapping[pred] for pred in y_test_pred]

submission = test_data[['id', 'winner_prediction']]

submission.to_csv('submission.csv', index=False)
print("submission.csv file created successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing similarity features: 100%|██████████| 30312/30312 [11:23<00:00, 44.37it/s]


submission.csv file created successfully!


In [None]:

submission_path = 'submission.csv'

# In Colab, trigger a browser download; elsewhere just report that it was skipped.
try:
    from google.colab import files
    files.download(submission_path)
except ImportError:
    print("Not running in Google Colab.")

# Also render a link to the file in the notebook output.
from IPython.display import FileLink, display
display(FileLink(submission_path))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>