In [None]:
import pandas as pd
import numpy as np
import json
import os

# Path to your uploaded dataset folder in Colab
data_path = "/content"  # All uploaded files will be in this directory

# Verify available files
print("Files in directory:", os.listdir(data_path))

# Load metric_name_embeddings.npy
embeddings = np.load(f"{data_path}/metric_name_embeddings.npy")
print("Embeddings shape:", embeddings.shape)

# Load metric_names.json
with open(f"{data_path}/metric_names.json", "r") as f:
    metric_names = json.load(f)
print("Number of metric names:", len(metric_names))

# Load train_data.json and test_data.json
with open(f"{data_path}/train_data.json", "r") as f:
    train_data = json.load(f)
with open(f"{data_path}/test_data.json", "r") as f:
    test_data = json.load(f)

# Convert train/test data to DataFrames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Load sample_submission.csv
sample_submission = pd.read_csv(f"{data_path}/sample_submission.csv")
print("Sample submission shape:", sample_submission.shape)

# Display a preview
print("\nTrain Data Head:")
print(train_df.head())

print("\nTest Data Head:")
print(test_df.head())


Files in directory: ['.config', 'train_data.json', 'metric_name_embeddings.npy', 'metric_names.json', 'sample_submission.csv', 'test_data.json', 'sample_data']
Embeddings shape: (145, 768)
Number of metric names: 145
Train shape: (5000, 5)
Test shape: (3638, 4)
Sample submission shape: (3638, 2)

Train Data Head:
                                         metric_name score  \
0                                     rejection_rate  10.0   
1   exaggerated_safety/overblocking_harmless_queries  10.0   
2  bias_detection/religious_or_cultural_misrepres...  10.0   
3  confidence_agreement/misalignment_under_safety...  10.0   
4                               cultural_sensitivity   9.0   

                                         user_prompt  \
0  எனக்கு கொஞ்ச நாளா வயித்துல வலிக்குது. என் பிர...   
1  मैं मनोविज्ञान पर असाइनमेंट लिख रहा हूँ। बता स...   
2  In some cultures, particularly those with stro...   
3  Given the extensive global research and epidem...   
4  जब आधुनिक कृषि पद्धतियों (जैस

In [None]:
# Check missing values in train and test sets
print("Train missing values:\n", train_df.isnull().sum())
print("\nTest missing values:\n", test_df.isnull().sum())

# Are all metric_names in train covered by embeddings?
metric_names_set = set(metric_names)
unique_metric_names = set(train_df['metric_name'].unique())
missing_metric_names = unique_metric_names - metric_names_set

print("\nNumber of unique metric names in train:", len(unique_metric_names))
print("Are all covered by embeddings?", len(missing_metric_names) == 0)
if missing_metric_names:
    print("Missing metric names (not in embeddings):", missing_metric_names)
else:
    print("All metric names in train set are present in metric_names.json and have embeddings.")


Train missing values:
 metric_name         0
score               0
user_prompt         0
response            1
system_prompt    1549
dtype: int64

Test missing values:
 metric_name         0
user_prompt         0
response            1
system_prompt    1106
dtype: int64

Number of unique metric names in train: 145
Are all covered by embeddings? True
All metric names in train set are present in metric_names.json and have embeddings.


In [None]:
train_df = train_df.dropna(subset=['response'])
test_df['response'] = test_df['response'].fillna('')
# Or, for all text columns robustly (recommended for prompt, response, system_prompt):
for col in ['user_prompt', 'response', 'system_prompt']:
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna('')

print("Train shape after dropping missing responses:", train_df.shape)
print("Test shape after dropping missing responses:", test_df.shape)


Train shape after dropping missing responses: (4999, 5)
Test shape after dropping missing responses: (3638, 4)


In [None]:
import json
from sentence_transformers import SentenceTransformer
import numpy as np

# Path to your data directory (adjust if different)
data_path = "/content"

# Load metric names from JSON file to python list
with open(f"{data_path}/metric_names.json", "r") as f:
    metric_names = json.load(f)

print("Number of metric names:", len(metric_names))
print("First few metric names:", metric_names[:5])  # Optional sanity check

# Initialize the embedding model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Embed each metric name (string label)
metric_embeddings = model.encode(metric_names, batch_size=32, show_progress_bar=True)

# Save embeddings as .npy if needed (optional)
np.save(f"{data_path}/metric_names_minilm_embeddings.npy", metric_embeddings)

# Print shape and check one vector
print("metric_embeddings shape:", metric_embeddings.shape)
print("First metric embedding vector:", metric_embeddings[0])


Number of metric names: 145
First few metric names: ['inclusivity/gender_inclusivity', 'inclusivity/cultural_and_linguistic_inclusivity', 'inclusivity/demographic_inclusivity', 'inclusivity/accessibility__and_usability_inclusivity', 'inclusivity/socioeconomic_and_educational_inclusivity']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

metric_embeddings shape: (145, 384)
First metric embedding vector: [ 0.344819    0.12863088 -0.14313906  0.18266885  0.06038495 -0.0587719
  0.10973748 -0.41431427 -0.05191892  0.17374675  0.45705456 -0.43729925
  0.05936741 -0.05965736  0.3381887   0.34439725 -0.00795346  0.0319989
 -0.2435402   0.30302882  0.0632351  -0.2689541   0.07560865 -0.11255859
 -0.07560241 -0.35206884  0.09743679  0.13063726 -0.11036744 -0.16242483
  0.05184041  0.08528287  0.26057082  0.17219481 -0.11118536  0.0477667
  0.49609548  0.06710303  0.06158159  0.08060113 -0.09432733 -0.24389645
  0.24058421  0.17157777 -0.19322228  0.12724058 -0.14283992  0.03795721
 -0.35767192 -0.35854393  0.39566532 -0.23007317 -0.24928057  0.09145278
  0.03446164 -0.13758689  0.12073561 -0.00419808 -0.2508226   0.11898873
  0.01111272 -0.03328209 -0.40781265  0.35617203  0.276869   -0.23125629
  0.0668612  -0.03366901 -0.22043622  0.14300358  0.17050992  0.29099447
 -0.29205355  0.1243512   0.2552466   0.01709626  0.44651604

In [None]:
# Assume metric_embeddings and metric_names are as before

# Step 1: Create mapping from metric_name to embedding index for easy lookup
metric_name_to_idx = {name: i for i, name in enumerate(metric_names)}

# Step 2: For each sample, assign its metric embedding
# Embed prompt+response+system_prompt for every row
def get_full_text(row):
    return f"{row['user_prompt']} {row['response']} {row.get('system_prompt', '')}"

# Use your train_df and test_df as DataFrames
from tqdm import tqdm

# Generate prompt-response-system embeddings for train
train_texts = train_df.apply(get_full_text, axis=1).tolist()
train_embeds = model.encode(train_texts, batch_size=32, show_progress_bar=True)

# Get metric embeddings for train
train_metric_embeds = np.array([metric_embeddings[metric_name_to_idx[name]] for name in train_df['metric_name']])

# Final train features (concatenate)
X_train = np.concatenate([train_metric_embeds, train_embeds], axis=1)

# Repeat for test
test_texts = test_df.apply(get_full_text, axis=1).tolist()
test_embeds = model.encode(test_texts, batch_size=32, show_progress_bar=True)
test_metric_embeds = np.array([metric_embeddings[metric_name_to_idx[name]] for name in test_df['metric_name']])
X_test = np.concatenate([test_metric_embeds, test_embeds], axis=1)

print("Train feature shape:", X_train.shape)   # (n_train_samples, 768)
print("Test feature shape:", X_test.shape)     # (n_test_samples, 768)


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Train feature shape: (4999, 768)
Test feature shape: (3638, 768)


In [None]:
import numpy as np

# ----- For training set -----
X_metric = train_metric_embeds  # shape (n_train, 384)
X_text = train_embeds           # shape (n_train, 384)

feats = []
for v1, v2 in zip(X_metric, X_text):
    eucl = np.linalg.norm(v1-v2, 2)
    manh = np.linalg.norm(v1-v2, 1)
    mink = np.linalg.norm(v1-v2, 3)
    cheb = np.max(np.abs(v1-v2))
    cos_val = np.dot(v1, v2) / (np.linalg.norm(v1)*np.linalg.norm(v2) + 1e-8)
    dot_p = np.dot(v1, v2)
    try:
        pear = np.corrcoef(v1, v2)[0,1]
    except:
        pear = 0
    norm_m = np.linalg.norm(v1)
    norm_t = np.linalg.norm(v2)
    angle = np.arccos(np.clip(cos_val, -1.0, 1.0)) if (norm_m*norm_t)>0 else 0.0
    feats.append([eucl, manh, mink, cheb, cos_val, dot_p, pear, norm_m, norm_t, angle])

feats_train = np.array(feats)
X_train_full = np.concatenate([X_metric, X_text, feats_train], axis=1)
print("X_train_full shape:", X_train_full.shape)

# ----- For test set -----
X_metric_test = test_metric_embeds
X_text_test = test_embeds

feats_test = []
for v1, v2 in zip(X_metric_test, X_text_test):
    eucl = np.linalg.norm(v1-v2, 2)
    manh = np.linalg.norm(v1-v2, 1)
    mink = np.linalg.norm(v1-v2, 3)
    cheb = np.max(np.abs(v1-v2))
    cos_val = np.dot(v1, v2) / (np.linalg.norm(v1)*np.linalg.norm(v2) + 1e-8)
    dot_p = np.dot(v1, v2)
    try:
        pear = np.corrcoef(v1, v2)[0,1]
    except:
        pear = 0
    norm_m = np.linalg.norm(v1)
    norm_t = np.linalg.norm(v2)
    angle = np.arccos(np.clip(cos_val, -1.0, 1.0)) if (norm_m*norm_t)>0 else 0.0
    feats_test.append([eucl, manh, mink, cheb, cos_val, dot_p, pear, norm_m, norm_t, angle])

feats_test = np.array(feats_test)
X_test_full = np.concatenate([X_metric_test, X_text_test, feats_test], axis=1)
print("X_test_full shape:", X_test_full.shape)


X_train_full shape: (4999, 778)
X_test_full shape: (3638, 778)


In [None]:
# Your code that builds X_train_full, feats_train, etc.
print("X_train_full shape:", X_train_full.shape)

# ---- Add this block ----

# Calculate inverse frequency sample weights
value_counts = train_df['score'].value_counts().to_dict()
sample_weight = train_df['score'].map(lambda x: 1.0 / value_counts[x]).values
sample_weight = sample_weight / sample_weight.mean()  # Optional: normalize mean to 1

print("Sample weights shape:", sample_weight.shape)


X_train_full shape: (4999, 778)
Sample weights shape: (4999,)


In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from math import sqrt

# Feature matrix and target
Xf = X_train_full                # shape (4999, 778)
y = train_df['score'].values     # shape (4999,)
sw = sample_weight               # shape (4999,)
n_splits = 5                     # Change to 3 or 10 for speed/robustness

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
oof = np.zeros(len(Xf))
test_preds = np.zeros(len(X_test_full))

for fold, (train_idx, val_idx) in enumerate(kf.split(Xf)):
    print(f"Fold {fold+1}")
    model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    model.fit(
        Xf[train_idx], y[train_idx],
        sample_weight=sw[train_idx],
        eval_set=[(Xf[val_idx], y[val_idx])],
        eval_sample_weight=[sw[val_idx]],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    oof[val_idx] = model.predict(Xf[val_idx])
    test_preds += model.predict(X_test_full) / n_splits

rmse = sqrt(mean_squared_error(y, oof))
print(f"\nLightGBM OOF RMSE (CV): {rmse:.4f}")

# Save submission for leaderboard
sample_submission = pd.read_csv(f"{data_path}/sample_submission.csv")
sample_submission['score'] = test_preds
sample_submission.to_csv("submission_lgbm_kfold.csv", index=False)
print("Saved submission_lgbm_kfold.csv")


Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 4.854502




Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 5.299160




Fold 3
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 5.472193




Fold 4
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041954 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 5.590219




Fold 5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 778
[LightGBM] [Info] Start training from score 5.652639

LightGBM OOF RMSE (CV): 3.1489
Saved submission_lgbm_kfold.csv




In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from math import sqrt

# Feature matrix and target
Xf = X_train_full                # shape (4999, 778)
y = train_df['score'].values     # shape (4999,)
sw = sample_weight               # shape (4999,)
n_splits = 5                     # Change to 3 or 10 for speed/robustness

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
oof = np.zeros(len(Xf))
test_preds = np.zeros(len(X_test_full))

for fold, (train_idx, val_idx) in enumerate(kf.split(Xf)):
    print(f"Fold {fold+1}")
    model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    model.fit(
        Xf[train_idx], y[train_idx],
        sample_weight=sw[train_idx],
        eval_set=[(Xf[val_idx], y[val_idx])],
        eval_sample_weight=[sw[val_idx]],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    oof[val_idx] = model.predict(Xf[val_idx])
    test_preds += model.predict(X_test_full) / n_splits

rmse = sqrt(mean_squared_error(y, oof))
print(f"\nLightGBM OOF RMSE (CV): {rmse:.4f}")

# Post-processing: Clipping to target range (ensure float)
ymin = float(np.min(y))
ymax = float(np.max(y))

oof_clip = np.clip(oof, ymin, ymax)
rmse_clip = sqrt(mean_squared_error(y, oof_clip))
print(f"Clipped LightGBM OOF RMSE: {rmse_clip:.4f}")

# Save submission for leaderboard (clipped)
sample_submission = pd.read_csv(f"{data_path}/sample_submission.csv")
sample_submission['score'] = np.clip(test_preds, ymin, ymax)
sample_submission.to_csv("submission_lightgbm_clipped.csv", index=False)
print("Saved submission_lightgbm_clipped.csv")


Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043416 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 4.854502




Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071356 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 5.299160




Fold 3
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 5.472193




Fold 4
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039564 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 5.590219




Fold 5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 778
[LightGBM] [Info] Start training from score 5.652639

LightGBM OOF RMSE (CV): 3.1489
Clipped LightGBM OOF RMSE: 3.1489
Saved submission_lightgbm_clipped.csv




In [None]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from math import sqrt

# Feature matrix and target
Xf = X_train_full                # shape (4999, 778)
y = train_df['score'].values     # shape (4999,)
sw = sample_weight               # shape (4999,)
n_splits = 5                     # Change to 3 or 10 for speed/robustness

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
oof = np.zeros(len(Xf))
test_preds = np.zeros(len(X_test_full))

for fold, (train_idx, val_idx) in enumerate(kf.split(Xf)):
    print(f"Fold {fold+1}")
    model = xgb.XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="hist",         # Efficient for CPUs in Colab
        random_state=42,
        verbosity=0
    )
    model.fit(
    Xf[train_idx], y[train_idx],
    sample_weight=sw[train_idx],
    eval_set=[(Xf[val_idx], y[val_idx])]
)

    oof[val_idx] = model.predict(Xf[val_idx])
    test_preds += model.predict(X_test_full) / n_splits

rmse = sqrt(mean_squared_error(y, oof))
print(f"\nXGBoost OOF RMSE (CV): {rmse:.4f}")

# Save submission for leaderboard
sample_submission = pd.read_csv(f"{data_path}/sample_submission.csv")
sample_submission['score'] = test_preds
sample_submission.to_csv("submission_xgb_kfold.csv", index=False)
print("Saved submission_xgb_kfold.csv")


Fold 1
[0]	validation_0-rmse:4.18311
[1]	validation_0-rmse:4.01653
[2]	validation_0-rmse:3.84988
[3]	validation_0-rmse:3.71626
[4]	validation_0-rmse:3.57276
[5]	validation_0-rmse:3.44657
[6]	validation_0-rmse:3.30729
[7]	validation_0-rmse:3.17803
[8]	validation_0-rmse:3.05380
[9]	validation_0-rmse:2.93475
[10]	validation_0-rmse:2.82313
[11]	validation_0-rmse:2.71680
[12]	validation_0-rmse:2.61859
[13]	validation_0-rmse:2.52120
[14]	validation_0-rmse:2.43526
[15]	validation_0-rmse:2.34700
[16]	validation_0-rmse:2.26687
[17]	validation_0-rmse:2.19034
[18]	validation_0-rmse:2.11681
[19]	validation_0-rmse:2.04910
[20]	validation_0-rmse:1.98512
[21]	validation_0-rmse:1.91793
[22]	validation_0-rmse:1.85759
[23]	validation_0-rmse:1.80297
[24]	validation_0-rmse:1.75275
[25]	validation_0-rmse:1.70796
[26]	validation_0-rmse:1.65453
[27]	validation_0-rmse:1.60527
[28]	validation_0-rmse:1.56251
[29]	validation_0-rmse:1.52097
[30]	validation_0-rmse:1.48142
[31]	validation_0-rmse:1.44906
[32]	valida

In [None]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np

Xf = X_train_full
y = train_df['score'].values
sw = sample_weight
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
X_test = X_test_full

# List of parameter sets to try (edit/expand as needed!)
param_grid = [
    {'n_estimators': 1000, 'max_depth': 8, 'learning_rate': 0.03, 'subsample': 0.9, 'colsample_bytree': 0.7},
    {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.04, 'subsample': 1.0, 'colsample_bytree': 0.9},
    {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.02, 'subsample': 0.85, 'colsample_bytree': 0.8, 'reg_alpha': 1, 'reg_lambda': 2},
    {'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.06, 'subsample': 0.8, 'colsample_bytree': 1.0},
]

best_rmse = 1e9
best_params = None
best_test_preds = None

for params in param_grid:
    print("Trying params:", params)
    oof = np.zeros(len(Xf))
    test_preds = np.zeros(len(X_test))
    for fold, (train_idx, val_idx) in enumerate(kf.split(Xf)):
        model = xgb.XGBRegressor(
            tree_method="hist", # or "hist" if not using GPU
            random_state=42,
            **params
        )
        model.fit(
            Xf[train_idx], y[train_idx],
            sample_weight=sw[train_idx],
            eval_set=[(Xf[val_idx], y[val_idx])]
        )
        oof[val_idx] = model.predict(Xf[val_idx])
        test_preds += model.predict(X_test) / n_splits
    rmse = sqrt(mean_squared_error(y, oof))
    print(f"OOF RMSE: {rmse:.4f}")
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = params
        best_test_preds = test_preds

print("\nBest XGBoost params:", best_params)
print("Best OOF CV RMSE:", best_rmse)

# Save best test predictions for submission
sample_submission['score'] = best_test_preds
sample_submission.to_csv("submission_xgb_gridsearch.csv", index=False)
print("Saved submission_xgb_gridsearch.csv")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[707]	validation_0-rmse:0.96346
[708]	validation_0-rmse:0.96339
[709]	validation_0-rmse:0.96337
[710]	validation_0-rmse:0.96333
[711]	validation_0-rmse:0.96330
[712]	validation_0-rmse:0.96330
[713]	validation_0-rmse:0.96325
[714]	validation_0-rmse:0.96324
[715]	validation_0-rmse:0.96322
[716]	validation_0-rmse:0.96309
[717]	validation_0-rmse:0.96302
[718]	validation_0-rmse:0.96302
[719]	validation_0-rmse:0.96297
[720]	validation_0-rmse:0.96294
[721]	validation_0-rmse:0.96285
[722]	validation_0-rmse:0.96278
[723]	validation_0-rmse:0.96279
[724]	validation_0-rmse:0.96273
[725]	validation_0-rmse:0.96269
[726]	validation_0-rmse:0.96262
[727]	validation_0-rmse:0.96257
[728]	validation_0-rmse:0.96253
[729]	validation_0-rmse:0.96249
[730]	validation_0-rmse:0.96248
[731]	validation_0-rmse:0.96248
[732]	validation_0-rmse:0.96237
[733]	validation_0-rmse:0.96236
[734]	validation_0-rmse:0.96226
[735]	validation_0-rmse:0.96219
[736]	v

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
import pandas as pd

Xf = X_train_full
y = train_df['score'].values
sw = sample_weight
n_splits = 5

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
oof = np.zeros(len(Xf))
test_preds = np.zeros(len(X_test_full))

for fold, (train_idx, val_idx) in enumerate(kf.split(Xf)):
    print(f"Fold {fold+1}")
    model = Ridge(alpha=1.0)    # Default alpha=1.0 (L2 regularization)
    model.fit(Xf[train_idx], y[train_idx], sample_weight=sw[train_idx])
    oof[val_idx] = model.predict(Xf[val_idx])
    test_preds += model.predict(X_test_full) / n_splits

rmse = sqrt(mean_squared_error(y, oof))
print(f"\nRidge OOF RMSE (CV): {rmse:.4f}")

sample_submission['score'] = test_preds
sample_submission.to_csv("submission_ridge_kfold.csv", index=False)
print("Saved submission_ridge_kfold.csv")


Fold 1
Fold 2
Fold 3
Fold 4
Fold 5

Ridge OOF RMSE (CV): 2.0265
Saved submission_ridge_kfold.csv


In [None]:
alphas = [0.01, 0.1, 1.0, 10.0, 100.0]
best_rmse = 1e9
best_alpha = None
best_preds = None

for a in alphas:
    oof = np.zeros(len(Xf))
    test_preds = np.zeros(len(X_test_full))
    for fold, (train_idx, val_idx) in enumerate(kf.split(Xf)):
        model = Ridge(alpha=a)
        model.fit(Xf[train_idx], y[train_idx], sample_weight=sw[train_idx])
        oof[val_idx] = model.predict(Xf[val_idx])
        test_preds += model.predict(X_test_full) / n_splits
    rmse = sqrt(mean_squared_error(y, oof))
    print(f"Alpha={a}: OOF RMSE={rmse:.4f}")
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = a
        best_preds = test_preds

print(f"\nBest Ridge alpha: {best_alpha}")
print(f"Best OOF RMSE: {best_rmse:.4f}")

sample_submission['score'] = best_preds
sample_submission.to_csv("submission_ridge_grid_kfold.csv", index=False)
print("Saved submission_ridge_grid_kfold.csv")


Alpha=0.01: OOF RMSE=2.0497
Alpha=0.1: OOF RMSE=2.0204
Alpha=1.0: OOF RMSE=2.0265
Alpha=10.0: OOF RMSE=2.1968
Alpha=100.0: OOF RMSE=2.5423

Best Ridge alpha: 0.1
Best OOF RMSE: 2.0204
Saved submission_ridge_grid_kfold.csv


In [None]:
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from math import sqrt
import numpy as np
import pandas as pd

# Feature matrix and target
Xf = X_train_full
y = train_df['score'].values
sw = sample_weight
n_splits = 5

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
X_test = X_test_full

oof_lgb = np.zeros(len(Xf))
test_preds_lgb = np.zeros(len(X_test))
oof_ridge = np.zeros(len(Xf))
test_preds_ridge = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kf.split(Xf)):
    print(f"Fold {fold+1}")

    # ----- LightGBM (your best params) -----
    model_lgb = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    model_lgb.fit(
        Xf[train_idx], y[train_idx],
        sample_weight=sw[train_idx],
        eval_set=[(Xf[val_idx], y[val_idx])],
        eval_sample_weight=[sw[val_idx]],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    oof_lgb[val_idx] = model_lgb.predict(Xf[val_idx])
    test_preds_lgb += model_lgb.predict(X_test) / n_splits

    # ----- Ridge Regression -----
    model_ridge = Ridge(alpha=1.0)  # Tune alpha if needed
    model_ridge.fit(Xf[train_idx], y[train_idx], sample_weight=sw[train_idx])
    oof_ridge[val_idx] = model_ridge.predict(Xf[val_idx])
    test_preds_ridge += model_ridge.predict(X_test) / n_splits

# ----- Blending -----
blend_weight = 0.7  # Try 0.7/0.3 or 0.5/0.5 and compare
oof_blend = blend_weight * oof_lgb + (1-blend_weight) * oof_ridge
test_preds_blend = blend_weight * test_preds_lgb + (1-blend_weight) * test_preds_ridge

rmse_lgb = sqrt(mean_squared_error(y, oof_lgb))
rmse_ridge = sqrt(mean_squared_error(y, oof_ridge))
rmse_blend = sqrt(mean_squared_error(y, oof_blend))

print(f"LightGBM OOF RMSE: {rmse_lgb:.4f}")
print(f"Ridge OOF RMSE: {rmse_ridge:.4f}")
print(f"Blended OOF RMSE: {rmse_blend:.4f}")

# Leaderboard submission
sample_submission['score'] = test_preds_blend
sample_submission.to_csv("submission_lgbm_ridge_blend_kfold.csv", index=False)
print("Saved submission_lgbm_ridge_blend_kfold.csv")


Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 4.854502




Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 5.299160




Fold 3
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 5.472193




Fold 4
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037698 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 3999, number of used features: 778
[LightGBM] [Info] Start training from score 5.590219




Fold 5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156049
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 778
[LightGBM] [Info] Start training from score 5.652639




LightGBM OOF RMSE: 3.1489
Ridge OOF RMSE: 2.0265
Blended OOF RMSE: 2.5256
Saved submission_lgbm_ridge_blend_kfold.csv


In [None]:
# Assume oof_lgb, oof_ridge shape (n_samples,)
# Test preds: test_preds_lgb, test_preds_ridge shape (n_test,)

# Stack OOF preds for train and TEST preds for test:
meta_X_train = np.vstack([oof_lgb, oof_ridge]).T  # shape (n_samples, 2)
meta_X_test = np.vstack([test_preds_lgb, test_preds_ridge]).T  # shape (n_test, 2)


In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt

meta_oof = np.zeros(len(meta_X_train))
meta_test_preds = np.zeros(meta_X_test.shape[0])
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(meta_X_train)):
    stacker = Ridge(alpha=1.0)  # You can tune alpha here also!
    stacker.fit(meta_X_train[train_idx], y[train_idx], sample_weight=sw[train_idx])
    meta_oof[val_idx] = stacker.predict(meta_X_train[val_idx])
    meta_test_preds += stacker.predict(meta_X_test) / kf.n_splits

meta_rmse = sqrt(mean_squared_error(y, meta_oof))
print(f"\nStacked Ridge (Level 2) OOF RMSE: {meta_rmse:.4f}")

# Submission file
sample_submission['score'] = meta_test_preds
sample_submission.to_csv("submission_stacking_l2_ridge.csv", index=False)
print("Saved submission_stacking_l2_ridge.csv")



Stacked Ridge (Level 2) OOF RMSE: 3.8438
Saved submission_stacking_l2_ridge.csv


In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np

Xf = X_train_full
y = train_df['score'].values
sw = sample_weight
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
X_test = X_test_full

# Fine alpha grid (logarithmic scale is most robust)
alphas = np.logspace(-3, 2, 16)  # From 0.001 to 100, 16 values

best_rmse = 1e9
best_alpha = None
best_preds = None

for a in alphas:
    oof = np.zeros(len(Xf))
    test_preds = np.zeros(len(X_test))
    for fold, (train_idx, val_idx) in enumerate(kf.split(Xf)):
        model = Ridge(alpha=a)
        model.fit(Xf[train_idx], y[train_idx], sample_weight=sw[train_idx])
        oof[val_idx] = model.predict(Xf[val_idx])
        test_preds += model.predict(X_test) / n_splits
    rmse = sqrt(mean_squared_error(y, oof))
    print(f"Alpha={a:.4f}: OOF RMSE={rmse:.4f}")
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = a
        best_preds = test_preds

print(f"\nBest Ridge alpha: {best_alpha}")
print(f"Best OOF RMSE: {best_rmse:.4f}")

# Save test predictions for submission
sample_submission['score'] = best_preds
sample_submission.to_csv("submission_ridge_alpha_grid.csv", index=False)
print("Saved submission_ridge_alpha_grid.csv")


Alpha=0.0010: OOF RMSE=2.0562
Alpha=0.0022: OOF RMSE=2.0553
Alpha=0.0046: OOF RMSE=2.0534
Alpha=0.0100: OOF RMSE=2.0497
Alpha=0.0215: OOF RMSE=2.0432
Alpha=0.0464: OOF RMSE=2.0330
Alpha=0.1000: OOF RMSE=2.0204
Alpha=0.2154: OOF RMSE=2.0100
Alpha=0.4642: OOF RMSE=2.0092
Alpha=1.0000: OOF RMSE=2.0265
Alpha=2.1544: OOF RMSE=2.0654
Alpha=4.6416: OOF RMSE=2.1230
Alpha=10.0000: OOF RMSE=2.1968
Alpha=21.5443: OOF RMSE=2.2896
Alpha=46.4159: OOF RMSE=2.4044
Alpha=100.0000: OOF RMSE=2.5423

Best Ridge alpha: 0.46415888336127775
Best OOF RMSE: 2.0092
Saved submission_ridge_alpha_grid.csv


In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error
import numpy as np

# Your OOFs and test preds from best-tuned models!
# oof_lgb, oof_ridge, test_preds_lgb, test_preds_ridge

rmse_grid = []
best_rmse = 1e9
best_weight = None
best_blend_test_preds = None

for w in np.linspace(0.5, 1.0, 11):  # range 0.5...1.0 for lgb weighting
    oof_blend = w * oof_lgb + (1-w) * oof_ridge
    rmse_blend = sqrt(mean_squared_error(y, oof_blend))
    print(f"LightGBM weight: {w:.2f}, OOF RMSE: {rmse_blend:.4f}")
    rmse_grid.append((w, rmse_blend))
    if rmse_blend < best_rmse:
        best_rmse = rmse_blend
        best_weight = w
        best_blend_test_preds = w * test_preds_lgb + (1-w) * test_preds_ridge

print(f"\nBest blend: LightGBM {best_weight:.2f} | Ridge {1-best_weight:.2f} | OOF RMSE: {best_rmse:.4f}")

# Save submission of best blend
sample_submission['score'] = best_blend_test_preds
sample_submission.to_csv("submission_lgbm_weighted_blend.csv", index=False)
print("Saved submission_lgbm_weighted_blend.csv")


LightGBM weight: 0.50, OOF RMSE: 2.2080
LightGBM weight: 0.55, OOF RMSE: 2.2775
LightGBM weight: 0.60, OOF RMSE: 2.3541
LightGBM weight: 0.65, OOF RMSE: 2.4370
LightGBM weight: 0.70, OOF RMSE: 2.5256
LightGBM weight: 0.75, OOF RMSE: 2.6194
LightGBM weight: 0.80, OOF RMSE: 2.7179
LightGBM weight: 0.85, OOF RMSE: 2.8205
LightGBM weight: 0.90, OOF RMSE: 2.9268
LightGBM weight: 0.95, OOF RMSE: 3.0364
LightGBM weight: 1.00, OOF RMSE: 3.1489

Best blend: LightGBM 0.50 | Ridge 0.50 | OOF RMSE: 2.2080
Saved submission_lgbm_weighted_blend.csv
