In [None]:
from utils import load_json, use_tf_idf_vectorizer, use_count_vectorizer
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import os

In [None]:
# Read the raw training table. Later cells rely on its `id`, `model_*`,
# `winner_*`, `prompt` and `response_*` columns.
df = pd.read_csv(filepath_or_buffer="../data/train.csv")
df.head(n=5)

In [None]:
# Load the per-row predictions stored next to the notebook and give the three
# columns a `_pred_prob` suffix. `df` already has `winner_model_a`,
# `winner_model_b` and `winner_tie` columns, so the suffix keeps the names
# distinct when both tables are merged on `id`.
# NOTE(review): presumably these are probabilities from an upstream model;
# confirm they are out-of-fold before using them as features.
train_pred = pd.read_csv("train_pred.csv").rename(
    columns={
        "winner_model_a": "winner_model_a_pred_prob",
        "winner_model_b": "winner_model_b_pred_prob",
        "winner_tie": "winner_tie_pred_prob",
    }
)
print(train_pred.head())
print(train_pred.shape)

In [None]:
# Left join on `id`: every training row is kept, and rows without a matching
# prediction end up with NaN in the `*_pred_prob` columns.
df = pd.merge(df, train_pred, how="left", on="id")
df.head(n=5)

In [None]:
# The model identity columns are removed and never reach the feature matrix.
df = df.drop(columns=["model_a", "model_b"])

# Collapse the three indicator columns into one class id:
#   0 -> `winner_model_a` is truthy
#   1 -> otherwise, `winner_model_b` is truthy
#   2 -> neither of the above (tie)
# np.select evaluates the conditions in order, so it reproduces the
# if / elif / else chain without a Python-level call per row.
df["label"] = np.select(
    [
        df["winner_model_a"].to_numpy(dtype=bool),
        df["winner_model_b"].to_numpy(dtype=bool),
    ],
    [0, 1],
    default=2,
)

# The indicator columns are now redundant with `label`; leaving them in would
# hand the target to the model as a feature.
df = df.drop(columns=["winner_model_a", "winner_model_b", "winner_tie"])
df.head()

In [None]:
# `load_json` is a project helper imported from `utils`. The following cells
# iterate over `prompt` / `response_*` as sequences whose items are strings or
# None, so presumably it decodes those columns into Python lists.
# TODO confirm against utils.load_json.
df = load_json(df)
df.head(n=5)

In [None]:
def calculate_length_sum(response):
    """Return the total number of characters over all entries of ``response``.

    Args:
        response: Iterable whose items are strings or ``None``.

    Returns:
        Sum of ``len(item)`` over the non-``None`` items; ``None`` items
        contribute 0, and an empty iterable yields 0.
    """
    char_total = 0
    for turn in response:
        if turn is None:
            continue
        char_total += len(turn)
    return char_total

def calculate_total_words(response):
    """Return the total number of whitespace-separated words in ``response``.

    Args:
        response: Iterable whose items are strings or ``None``.

    Returns:
        Sum of ``len(item.split())`` over the non-``None`` items; ``None``
        items contribute 0, and an empty iterable yields 0.
    """
    word_total = 0
    for turn in response:
        if turn is None:
            continue
        word_total += len(turn.split())
    return word_total

# Size features for both responses: character counts first, then word counts.
# The keyword order below fixes the order in which the new columns are added.
df = df.assign(
    len_a=df["response_a"].map(calculate_length_sum),
    len_b=df["response_b"].map(calculate_length_sum),
    word_a=df["response_a"].map(calculate_total_words),
    word_b=df["response_b"].map(calculate_total_words),
)

df.head(n=5)

In [None]:
# Sanity check: show the Python type stored in the `prompt` column (row label 0).
type(df.loc[0, "prompt"])

In [None]:
def join_strings(string_list):
    """Flatten a sequence of optional strings into one space-separated string.

    Falsy entries (``None`` or ``""``) are replaced by the literal token
    ``"NULL"`` so that every position of the sequence stays visible in the
    joined text.

    A plain ``str`` is returned unchanged. Without this guard, applying the
    function a second time (for example by re-running the notebook cell that
    overwrites the text columns) would iterate over the characters of the
    already-joined string and silently turn ``"hello"`` into ``"h e l l o"``.

    Args:
        string_list: Iterable of strings / ``None``, or an already-joined
            ``str``.

    Returns:
        The joined string (``""`` for an empty iterable).
    """
    if isinstance(string_list, str):
        # Already flattened; joining again would split it into characters.
        return string_list
    return " ".join(part if part else "NULL" for part in string_list)

# Replace each conversation column by its single-string form (see join_strings)
# so the vectorisers below receive plain text.
for text_column in ("prompt", "response_a", "response_b"):
    df[text_column] = df[text_column].map(join_strings)
print(df.dtypes)

In [None]:
# Fit one TF-IDF vectoriser per text column with the project helper from
# `utils`. Each call returns a (vectoriser, matrix) pair; the matrices are
# merged into `df` on `id` in a later cell.
# NOTE(review): presumably each matrix is a DataFrame that carries an `id`
# column. Confirm in utils.use_tf_idf_vectorizer.
prompt_tf_idf_vectorizer, prompt_tf_idf_matrix = use_tf_idf_vectorizer(
    column_name="prompt", train=df
)
response_a_tf_idf_vectorizer, response_a_tf_idf_matrix = use_tf_idf_vectorizer(
    column_name="response_a", train=df
)
response_b_tf_idf_vectorizer, response_b_tf_idf_matrix = use_tf_idf_vectorizer(
    column_name="response_b", train=df
)

print("The shape of prompt_tf_idf_matrix is:", prompt_tf_idf_matrix.shape)
print("The shape of response_a_tf_idf_matrix is:", response_a_tf_idf_matrix.shape)
print("The shape of response_b_tf_idf_matrix is:", response_b_tf_idf_matrix.shape)

In [None]:
# prompt_count_vectorizer, prompt_count_matrix = use_count_vectorizer(train=df, column_name="prompt")
# response_a_count_vectorizer, response_a_count_matrix = use_count_vectorizer(train=df, column_name="response_a")
# response_b_count_vectorizer, response_b_count_matrix = use_count_vectorizer(train=df, column_name="response_b")
# print("The shape of prompt_count_matrix is:", prompt_count_matrix.shape)
# print("The shape of response_a_count_matrix is:", response_a_count_matrix.shape)
# print("The shape of response_b_count_matrix is:", response_b_count_matrix.shape)

In [None]:
# Attach the three TF-IDF frames to the training table, one left join per
# frame, in the order prompt -> response_a -> response_b.
# NOTE(review): assumes the frames use distinct feature-column names; columns
# that overlap would be renamed with pandas' merge suffixes. Verify in utils.
for tf_idf_frame in (
    prompt_tf_idf_matrix,
    response_a_tf_idf_matrix,
    response_b_tf_idf_matrix,
):
    df = df.merge(tf_idf_frame, on="id", how="left")
print("The shape of df after merging all tf_idf matrix is:", df.shape)

In [None]:
# df = df.merge(prompt_count_matrix, on="id", how="left")
# df = df.merge(response_a_count_matrix, on="id", how="left")
# df = df.merge(response_b_count_matrix, on="id", how="left")
# print("The shape of df after merging all count matrix is:", df.shape)

In [None]:
# Preview the fully assembled feature table.
df.head(n=5)

#### Use LGBM

In [None]:
# Output directory for the per-fold boosters. `exist_ok=True` makes the call
# idempotent without a separate existence check, so there is no window between
# checking and creating, and a re-run of the notebook is a no-op.
os.makedirs("lgbm_models", exist_ok=True)

In [None]:
# Store the size features as floats: the training loop writes min-max scaled
# values back into these columns.
for count_column in ("len_a", "len_b", "word_a", "word_b"):
    df[count_column] = df[count_column].astype(float)

In [None]:
# Feature matrix: every column except the identifier, the raw text and the target.
X = df.drop(columns=["id", "prompt", "response_a", "response_b", "label"])
y = df["label"]

n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

columns_to_normalize = ["len_a", "len_b", "word_a", "word_b"]
scaler = MinMaxScaler()

# The hyper-parameters are the same for every fold, so the dict is built once.
params = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "n_estimators": 2048,
    "learning_rate": 0.005,
    "colsample_bytree": 0.75,
    "num_leaves": 18,
    "max_depth": 12,
    "verbose": 1
}

log_loss_scores = []

for idx, (train_index, val_index) in tqdm(enumerate(kf.split(X)), total=n_splits):
    print(f"---------- Fold {idx + 1} ----------")

    # Take explicit copies: both fold frames are modified below, and writing
    # into a frame obtained by indexing `X` triggers SettingWithCopyWarning.
    X_train = X.iloc[train_index].copy()
    X_val = X.iloc[val_index].copy()
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit the scaler on the training fold only and reuse its statistics for the
    # validation fold, so no validation information leaks into the scaling.
    # NOTE(review): the fitted scaler is not saved in this cell; a booster
    # reloaded from disk needs its inputs scaled the same way.
    X_train.loc[:, columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_val.loc[:, columns_to_normalize] = scaler.transform(X_val[columns_to_normalize])

    model = lgb.LGBMClassifier(**params)
    # A fresh callback per fold: stop once the first metric has not improved
    # on the validation set for 50 consecutive rounds.
    early_stopping_callback = lgb.early_stopping(50, first_metric_only=True, verbose=True)

    # NOTE(review): early stopping monitors the same fold that is scored
    # below, so the reported loss is somewhat optimistic.
    model.fit(
        X=X_train, y=y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping_callback]
    )

    y_pred = model.predict_proba(X_val, num_iteration=model.best_iteration_)

    model.booster_.save_model(f"./lgbm_models/lgbm_model_fold_{idx + 1}.txt")

    # To load the model
    # model = lgb.Booster(model_file=f"./lgbm_models/lgbm_model_fold_{idx + 1}.txt")

    # Pass the label set explicitly: without it log_loss infers the classes
    # from `y_val` and fails when a validation fold happens to miss a class.
    score = log_loss(y_val, y_pred, labels=model.classes_)
    log_loss_scores.append(score)

    print(f"Fold {idx + 1} log_loss: {score}")

mean_log_loss = np.mean(log_loss_scores)
print(f"Mean log_loss: {mean_log_loss}")