In [None]:
from utils_prime import load_json, use_tf_idf_vectorizer, use_count_vectorizer, use_tf_idf_vectorizer_for_test
from tqdm import tqdm
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import os
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Row cap: the cells below slice both the training table and the prediction
# table down to their first `sample` rows.
sample = 10_000

In [None]:
# Read the training CSV and keep only its first `sample` rows (positional slice).
df = pd.read_csv("../data/train.csv").iloc[:sample]
df.head()

In [None]:
# Read the stored predictions and keep only the first `sample` rows.
# NOTE(review): slicing before the later merge on `id` assumes train_pred.csv
# lists rows in the same order as train.csv — confirm, otherwise some training
# rows end up with NaN probabilities after the left join.
train_pred = pd.read_csv("train_pred.csv").iloc[:sample]

# Suffix the three probability columns with `_pred_prob` so they stay distinct
# from the same-named winner columns of `df` once the two frames are merged.
train_pred = train_pred.rename(
    columns={
        column: f"{column}_pred_prob"
        for column in ("winner_model_a", "winner_model_b", "winner_tie")
    }
)
print(train_pred.head())
print(train_pred.shape)

In [None]:
# Left-join the prediction columns onto the training rows by `id`; training
# rows without a matching prediction row are kept.
df = pd.merge(df, train_pred, how="left", on="id")
df.head()

In [None]:
# Collapse the three winner flag columns into a single integer class label:
#   0 -> `winner_model_a` is truthy
#   1 -> otherwise, `winner_model_b` is truthy
#   2 -> neither flag is set
# np.select checks the conditions in order and takes the first match, which
# reproduces the if/else chain without a Python-level call per row.
df["label"] = np.select(
    [df["winner_model_a"].astype(bool), df["winner_model_b"].astype(bool)],
    [0, 1],
    default=2,
)

# `model_a` / `model_b` and the raw winner flags are not used as features.
df = df.drop(
    columns=["model_a", "model_b", "winner_model_a", "winner_model_b", "winner_tie"]
)
df.head()

In [None]:
# Pass the frame through the project helper `load_json` (from utils_prime).
# NOTE(review): presumably this decodes the JSON-encoded prompt/response
# strings into Python lists — later cells iterate over each cell's entries —
# confirm against utils_prime.
df = load_json(df)
df.head()

In [None]:
def calculate_length_sum(response):
    """Return the combined character count of all entries in ``response``.

    ``response`` is an iterable of strings; ``None`` entries count as 0.
    """
    return sum(len(entry) for entry in response if entry is not None)

def calculate_total_words(response):
    """Return the combined whitespace-separated word count of ``response``.

    ``response`` is an iterable of strings; ``None`` entries count as 0.
    """
    return sum(len(entry.split()) for entry in response if entry is not None)

# Add character-count and word-count features for both responses.
# The nesting order keeps the columns in the order len_a, len_b, word_a, word_b.
for feature_prefix, measure in (
    ("len", calculate_length_sum),
    ("word", calculate_total_words),
):
    for side in ("a", "b"):
        df[f"{feature_prefix}_{side}"] = df[f"response_{side}"].apply(measure)

df.head()

In [None]:
# Inspect the Python type stored in the `prompt` column (row labelled 0).
type(df.loc[0, "prompt"])

In [None]:
def join_strings(string_list):
    """Join the entries of ``string_list`` with single spaces.

    Falsy entries (``None`` or the empty string) are replaced by the literal
    placeholder ``"NULL"`` before joining.
    """
    return " ".join(item or "NULL" for item in string_list)

# Replace each text column's per-row sequence with one space-joined string.
for text_column in ("prompt", "response_a", "response_b"):
    df[text_column] = df[text_column].apply(join_strings)
print(df.dtypes)

In [None]:
# Preview the frame after the text columns have been flattened to strings.
df.head()

#### Train LightGBM models with K-fold cross-validation

In [None]:
# Directory that receives one saved LightGBM model file per fold.
# `exist_ok=True` makes the call safe to re-run and avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs("lgbm_models", exist_ok=True)

In [None]:
# K-fold training of LightGBM multiclass models.
#
# For every fold this cell:
#   1. fits a TF-IDF vectorizer on the training rows' concatenated text (via
#      the project helpers) and merges the resulting features in by `id`,
#   2. min-max scales the four size features (fit on train, applied to val),
#   3. trains an LGBMClassifier with early stopping on the validation fold,
#   4. saves the scaler and the booster, and records the validation log loss.

columns_to_normalize = ["len_a", "len_b", "word_a", "word_b"]

# Cast the size features to float before the loop so the scaled (float) values
# can be written back into the same columns.
for column in columns_to_normalize:
    df[column] = df[column].astype(float)

n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Re-fitted on every fold by `fit_transform`; each fold's fitted state is pickled.
scaler = MinMaxScaler()

# Output directories. `scalers` is written to below and must exist, otherwise
# `open(..., "wb")` raises FileNotFoundError on a fresh run.
os.makedirs("scalers", exist_ok=True)
os.makedirs("lgbm_models", exist_ok=True)

# The text used for TF-IDF is identical for every fold, so build it once.
# NOTE(review): the three fields are joined without a separator, so the last
# token of one field fuses with the first token of the next. Left unchanged
# here because any inference code must build the text in exactly the same way
# as training — confirm and change both sides together if desired.
copy_df = df.copy()
copy_df["concat"] = copy_df["prompt"] + copy_df["response_a"] + copy_df["response_b"]

# Columns that are identifiers / raw text rather than model features.
non_feature_columns = ["id", "prompt", "response_a", "response_b", "concat"]

# LightGBM hyper-parameters (the same for every fold).
params = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "n_estimators": 3000,
    "learning_rate": 0.02,
    "colsample_bytree": 0.7,
    "num_leaves": 127,
    "max_depth": 12,
    "verbose": 1,
    "n_jobs": -1
}

log_loss_scores = []

for idx, (train_index, val_index) in tqdm(enumerate(kf.split(df)), total=n_splits):
    fold = idx + 1  # 1-based fold number used in log lines and file names
    print(f"---------- Fold {fold} ----------")

    # Integer-array `iloc` returns copies, so the shared frame is not modified.
    X_train = copy_df.iloc[train_index]
    X_val = copy_df.iloc[val_index]

    # Fit the TF-IDF vectorizer on the training rows only; the helper returns
    # the vectorizer plus a frame that is merged back on `id`.
    concat_tf_idf_vectorizer, concat_tf_idf_matrix = use_tf_idf_vectorizer(train=X_train, column_name="concat", idx=fold)
    print("The shape of concat_tf_idf_matrix is:", concat_tf_idf_matrix.shape)
    X_train = X_train.merge(concat_tf_idf_matrix, on="id", how="left")

    # Pass the vectorizer returned for the training rows, together with the
    # validation rows, to the companion helper and merge its result on `id`.
    concat_tf_idf_matrix_val = use_tf_idf_vectorizer_for_test(concat_tf_idf_vectorizer, X_val, column_name="concat", idx=fold)
    X_val = X_val.merge(concat_tf_idf_matrix_val, on="id", how="left")

    # Split off the target and drop everything that is not a feature.
    y_train = X_train["label"]
    X_train = X_train.drop(columns=non_feature_columns + ["label"])

    y_val = X_val["label"]
    X_val = X_val.drop(columns=non_feature_columns + ["label"])

    # Scale the size features: fit on the training fold, reuse on validation.
    X_train.loc[:, columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_val.loc[:, columns_to_normalize] = scaler.transform(X_val[columns_to_normalize])
    with open(f"scalers/scaler_{fold}.pkl", "wb") as f:
        pickle.dump(scaler, f)

    model = lgb.LGBMClassifier(**params)
    # A fresh callback per fold: stop after 50 rounds without improvement.
    early_stopping_callback = lgb.early_stopping(50, first_metric_only=True, verbose=True)

    model.fit(
        X=X_train, y=y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping_callback]
    )

    y_pred = model.predict_proba(X_val, num_iteration=model.best_iteration_)

    model.booster_.save_model(f"./lgbm_models/lgbm_model_fold_{fold}.txt")

    # To load the model
    # model = lgb.Booster(model_file=f"./lgbm_models/lgbm_model_fold_{idx + 1}.txt")

    # Pass the class labels explicitly so the probability columns are matched
    # to the right classes even if a validation fold happens to miss a class.
    score = log_loss(y_val, y_pred, labels=model.classes_)
    log_loss_scores.append(score)

    print(f"Fold {fold} log_loss: {score}")

mean_log_loss = np.mean(log_loss_scores)
print(f"Mean log_loss: {mean_log_loss}")