In [23]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/qwen-embedding/qwen_embeddings.csv
/kaggle/input/qwen-embedding/qwen_embeddings (1).npy
/kaggle/input/lola-llm-assisted-online-learning-algorithm/upworthy-archive-confirmatory-packages-03.12.2020.csv
/kaggle/input/lola-llm-assisted-online-learning-algorithm/LoRA_CTR_test.csv
/kaggle/input/lola-llm-assisted-online-learning-algorithm/upworthy-archive-exploratory-packages-03.12.2020.csv
/kaggle/input/lola-llm-assisted-online-learning-algorithm/winner-all.csv
/kaggle/input/lola-llm-assisted-online-learning-algorithm/selected_pairs_df_005_256.csv
/kaggle/input/lola-llm-assisted-online-learning-algorithm/LoRA_CTR_train.csv
/kaggle/input/lola-llm-assisted-online-learning-algorithm/selected_pairs_df_005_3072.csv
/kaggle/input/lola-llm-assisted-online-learning-algorithm/upworthy-archive-holdout-packages-03.12.2020.csv
/kaggle/input/lola-llm-assisted-online-learning-algorithm/all_test_headline_embed_3072.csv


# 1. Data Description:
   This is the Headline A/B testing dataset of Upworthy. The data is of the A/B test conducted in between year 2013-2015. The dataset has Headlines and their impressions and clicks they got. We would assess the headline through their click-through-ratio(CTR).

In [8]:
#Combine three subsets to create a complete dataset
csv_file_path_1 = '/kaggle/input/lola-llm-assisted-online-learning-algorithm/upworthy-archive-exploratory-packages-03.12.2020.csv'
csv_file_path_2 = '/kaggle/input/lola-llm-assisted-online-learning-algorithm/upworthy-archive-confirmatory-packages-03.12.2020.csv'
csv_file_path_3 = '/kaggle/input/lola-llm-assisted-online-learning-algorithm/upworthy-archive-holdout-packages-03.12.2020.csv'
csv_file_path_4 = '/kaggle/input/lola-llm-assisted-online-learning-algorithm/LoRA_CTR_train.csv'
csv_file_path_5 = '/kaggle/input/lola-llm-assisted-online-learning-algorithm/LoRA_CTR_test.csv'

# Read the files, converting column 15 to string type
df1 = pd.read_csv(csv_file_path_1, dtype={15: str})
df2 = pd.read_csv(csv_file_path_2, dtype={15: str})
df3 = pd.read_csv(csv_file_path_3, dtype={15: str})
ctr_train = pd.read_csv(csv_file_path_4, dtype={15: str})
ctr_test = pd.read_csv(csv_file_path_5, dtype={15: str})

df = pd.concat([df1, df2, df3])
df.to_csv('upworthy-archive-packages-all.csv', index=False) # saves the combined dataset

# 2. Data Statistics
- Packages: A individual headline 
- Tests: Each test had multiple packages compared with each other
- Clicks: Number of people who clicked the article
- Impression: Number of people who viewed the headline
- clickabality_id: id for each test conducted
- CTR: It is the ratio of Clicks/Total Impression


In [9]:
# print(f"---------- first few Rows ----------")
# print(df.head(10))
# print('\n \n \n \n \n')

print(f'---------- Datatype of each Column ----------')
print(df.info())
print('\n')

print(f'---------- Dataset info ----------')

print('\n')
# Dataset shape info
print(f'shape of our datset: {df.shape}')

print('\n')
# packages information
num_of_packages = len(df['clickability_test_id'])
print(f'Total number of packages: {num_of_packages}')

# tests information
num_of_tests = len(df['clickability_test_id'].unique())
print(f'number of tests: {num_of_tests}')

print('\n')
# user engagement info
total_impressions = df['impressions'].sum()
total_clicks = df['clicks'].sum()
print(f'Total number of impressions in complete dataset: {total_impressions}')
print(f'Total number of clicks in the complete datset: {total_clicks}')
# df[['impressions', 'clicks']].sum()


# general info
average_packages_per_test = num_of_packages / num_of_tests
average_clicks = total_clicks / num_of_packages
average_impressions = total_impressions / num_of_packages 
average_ctr = total_clicks / total_impressions

print('\n')
print(f'average packages(Headlines) per tests: {average_packages_per_test:.2f}')
print(f'average clicks per package(Headline): {average_clicks:.2f}')
print(f'average impressions per packages(Headlines): {average_impressions:.2f}')
print(f'average ctr per package(Headline): {average_ctr:.2f}')

---------- Datatype of each Column ----------
<class 'pandas.core.frame.DataFrame'>
Index: 150817 entries, 0 to 22599
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Unnamed: 0            150817 non-null  int64  
 1   created_at            150817 non-null  object 
 2   updated_at            150817 non-null  object 
 3   clickability_test_id  150817 non-null  object 
 4   excerpt               134790 non-null  object 
 5   headline              150816 non-null  object 
 6   lede                  150713 non-null  object 
 7   slug                  150817 non-null  object 
 8   eyecatcher_id         150636 non-null  object 
 9   impressions           150817 non-null  int64  
 10  clicks                150817 non-null  int64  
 11  significance          150817 non-null  float64
 12  first_place           150817 non-null  bool   
 13  winner                150817 non-null  bool   
 14  share_text  

# 3. Data Filtering

In [10]:
# convert the int64 values to FP64
df['clicks'] = df['clicks'].astype(float) 
df['impressions'] = df['impressions'].astype(float) 

# Create a CTR column
df['CTR'] = df['clicks']/df['impressions']

# convert tto datetime format dtype
df['created_at'] = pd.to_datetime(df['created_at'], format='mixed', errors='coerce')

# Only takes tests which has more than 1 package, this step takes time
filtered_groups = df.groupby(['clickability_test_id', 'eyecatcher_id']).filter(lambda x: x['headline'].nunique() > 1).reset_index(drop=True)

# Drop unnecessary columns
new_df = filtered_groups[['clickability_test_id', 'eyecatcher_id', 'created_at','headline', 'CTR','clicks', 'impressions']].drop_duplicates()

# saves this filtered dataset
new_df.to_csv('filtered-ctr-all.csv', index=False)

del df, new_df # removes the data from current memory 

In [11]:
df = pd.read_csv('/kaggle/working/filtered-ctr-all.csv')
df = df.sort_values(by='created_at').reset_index(drop=True)
df['new_id'] = df.groupby(['clickability_test_id', 'eyecatcher_id']).ngroup()

# chek few values to understand the structure
# df.head(30)

In [12]:
# print(f"---------- first few Rows ----------")
# print(df.head(10))
# print('\n \n \n \n \n')

print(f'---------- Datatype of each Column ----------')
print(df.info())
print('\n')

print(f'---------- Dataset info ----------')

print('\n')
# Dataset shape info
print(f'shape of our datset: {df.shape}')

print('\n')
# packages information
filtered_num_of_packages = len(df['clickability_test_id'])
print(f'Total number of packages: {filtered_num_of_packages}')

# tests information
filtered_num_of_tests = len(df['new_id'].unique())
print(f'number of tests: {filtered_num_of_tests}')

print('\n')
# user engagement info
filtered_total_impressions = df['impressions'].sum()
filtered_total_clicks = df['clicks'].sum()
print(f'Total number of impressions in complete dataset: {filtered_total_impressions}')
print(f'Total number of clicks in the complete datset: {filtered_total_clicks}')
# df[['impressions', 'clicks']].sum()


# general info
filtered_average_packages_per_test = filtered_num_of_packages / filtered_num_of_tests
filtered_average_clicks = filtered_total_clicks / filtered_num_of_packages
filtered_average_impressions = filtered_total_impressions / filtered_num_of_packages 
filtered_average_ctr = filtered_total_clicks / filtered_total_impressions

print('\n')
print(f'average packages(Headlines) per tests: {filtered_average_packages_per_test:.2f}')
print(f'average clicks per package(Headline): {filtered_average_clicks:.2f}')
print(f'average impressions per packages(Headlines): {filtered_average_impressions:.2f}')
print(f'average ctr per package(Headline): {filtered_average_ctr:.2f}')

print('\n')
# Get the frequency of each new_id value
id_counts = df['new_id'].value_counts().sort_index()
frequency_table = id_counts.value_counts().sort_index()

# Set the threshold for grouping
threshold = 7

print("Frequency Table - Number of Rows with Same new_id")
print("=" * 65)
print(f"{'Rows per new_id':<20} {'Number of Groups':<20} {'% of Dataset':<15}")
print("-" * 65)

# Display counts below threshold
for num_rows in range(1, threshold):
    if num_rows in frequency_table.index:
        num_groups = frequency_table[num_rows]
        percentage = (num_groups / filtered_num_of_tests) * 100
        print(f"{num_rows:<20} {num_groups:<20} {percentage:.2f}%")

# Display grouped count for threshold and above
grouped_groups = frequency_table[frequency_table.index >= threshold].sum()
if grouped_groups > 0:
    percentage_grouped = (grouped_groups / filtered_num_of_tests) * 100
    print(f"{f'{threshold} or more':<20} {grouped_groups:<20} {percentage_grouped:.2f}%")

---------- Datatype of each Column ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77245 entries, 0 to 77244
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   clickability_test_id  77245 non-null  object 
 1   eyecatcher_id         77245 non-null  object 
 2   created_at            77245 non-null  object 
 3   headline              77244 non-null  object 
 4   CTR                   77245 non-null  float64
 5   clicks                77245 non-null  float64
 6   impressions           77245 non-null  float64
 7   new_id                77245 non-null  int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 4.7+ MB
None


---------- Dataset info ----------


shape of our datset: (77245, 8)


Total number of packages: 77245
number of tests: 17681


Total number of impressions in complete dataset: 277338713.0
Total number of clicks in the complete datset: 3741517.0


average packages(He

# 4. Train-Test Split

In [13]:
# Determine the indices for train and test split
train_size = int(len(df) * 0.7)
test_size = int(len(df) * 0.2)

# Create the train and test DataFrames
train_df = df.iloc[:train_size]
test_df = df.iloc[-test_size:]

# Display the resulting DataFrames
print("Train DataFrame (first 70%):")
print(f'Shape of train data by time: {train_df.shape}')

print("\nTest DataFrame (last 20%):")
print(f'shape of test data by time: {test_df.shape}')

train_df.to_csv('train_order_by_time.csv', index=False)
test_df.to_csv('test_order_by_time.csv', index=False)

Train DataFrame (first 70%):
Shape of train data by time: (54071, 8)

Test DataFrame (last 20%):
shape of test data by time: (15449, 8)


In [14]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split

df = pd.read_csv('/kaggle/working/filtered-ctr-all.csv')

df = df[['clickability_test_id', 'eyecatcher_id', 'headline', 'impressions', 'clicks']]
df.loc[:, 'CTR'] = df['clicks'] / df['impressions']
df['test_id'] = df.groupby(['clickability_test_id', 'eyecatcher_id']).ngroup()

train_ratio = 0.7
calibrate_ratio = 0.1
test_ratio = 0.2

gss = GroupShuffleSplit(n_splits=1, test_size=test_ratio+calibrate_ratio, random_state=42)
train_idx, temp_idx = next(gss.split(df, groups=df['test_id']))
train_df = df.iloc[train_idx]
temp_df = df.iloc[temp_idx]

gss2 = GroupShuffleSplit(n_splits=1, test_size=test_ratio/(test_ratio+calibrate_ratio), random_state=42)
calibrate_idx, test_idx = next(gss2.split(temp_df, groups=temp_df['test_id']))
calibrate_df = temp_df.iloc[calibrate_idx]
test_df = temp_df.iloc[test_idx]

del temp_df

print(f'headlines in train date: {len(train_df)}')
print(f'headlines in test data: {len(test_df)}')
print(f'headlines in calibration data: {len(calibrate_df)}')

unique_headlines_train = set(train_df['headline'].unique())
test_df = test_df[~test_df['headline'].isin(unique_headlines_train)]
calibrate_df = calibrate_df[~calibrate_df['headline'].isin(unique_headlines_train)]

print('\nremoving the duplicate from the training data the test and calibrate data size reduces')
print(f'- removed data leak headlines: {len(test_df)}')
print(f'- removed data leak headlines: {len(calibrate_df)}') # this used as validation data

headlines in train date: 53965
headlines in test data: 15483
headlines in calibration data: 7797

removing the duplicate from the training data the test and calibrate data size reduces
- removed data leak headlines: 12039
- removed data leak headlines: 6072


In [15]:
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)
calibrate_df.to_csv('calibrate.csv', index=False)

# 5. Embedding Creation

In [17]:
import os
import json
import math
from pathlib import Path
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

DEVICE = "cuda"
print("Device:", DEVICE)

# Replace this path with your actual CSV path if different
DATA_PATH_TRAIN = "/kaggle/working/train.csv"  # <--- change if needed
DATA_PATH_CALIBRATE = "/kaggle/working/calibrate.csv"
DATA_PATH_TEST = "/kaggle/working/test.csv"

df_train = pd.read_csv(DATA_PATH_TRAIN)  # we have loaded the training data
df_calibrate = pd.read_csv(DATA_PATH_CALIBRATE)
df_test = pd.read_csv(DATA_PATH_TEST)

# quick checks
assert 'headline' in df_train.columns, "CSV must contain a 'headline' column"
assert 'CTR' in df_train.columns, "CSV must contain a 'CTR' column"
print("the columns 'headline' and 'CTR' exists")

assert 'headline' in df_calibrate.columns, "CSV must contain a 'headline' column"
assert 'CTR' in df_calibrate.columns, "CSV must contain a 'CTR' column"
print("the columns 'headline' and 'CTR' exists")

assert 'headline' in df_test.columns, "CSV must contain a 'headline' column"
assert 'CTR' in df_test.columns, "CSV must contain a 'CTR' column"
print("the columns 'headline' and 'CTR' exists")

# Basic preprocessing function (customize as you like)
def preprocess_text(s):
    if pd.isna(s):
        return ""
    # minimal preprocessing: strip, replace multiple spaces
    t = str(s).strip()
    t = " ".join(t.split())
    return t

# Apply preprocessing and deduplicate headings optionally
df_train['headline'] = df_train['headline'].astype(str).apply(preprocess_text)
df_train = df_train.reset_index(drop=True)

df_calibrate['headline'] = df_calibrate['headline'].astype(str).apply(preprocess_text)
df_calibrate = df_calibrate.reset_index(drop=True)

df_test['headline'] = df_test['headline'].astype(str).apply(preprocess_text)
df_test = df_test.reset_index(drop=True)

print(f"Loaded train {len(df_train)} headlines. Example:\n", df_train.head(3))
print(f"Loaded calibrate{len(df_calibrate)} headlines. Example:\n", df_calibrate.head(3))
print(f"Loaded test {len(df_test)} headlines. Example:\n", df_test.head(3))

Device: cuda
the columns 'headline' and 'CTR' exists
the columns 'headline' and 'CTR' exists
the columns 'headline' and 'CTR' exists
Loaded train 53965 headlines. Example:
        clickability_test_id             eyecatcher_id  \
0  546e009a9ad54ec65b00004b  546c7f2dbadeb5788700000a   
1  546e009a9ad54ec65b00004b  546c7f2dbadeb5788700000a   
2  546e009a9ad54ec65b00004b  546c7f2dbadeb5788700000a   

                                            headline  impressions  clicks  \
0  What They Learned From The Scientist Was Terri...       4594.0    51.0   
1  A Science Guy Helps 3 Dudes From America Under...       4571.0    58.0   
2  He Sat Them Down And Told Them About An Immine...       4601.0    27.0   

        CTR  test_id  
0  0.011101    14366  
1  0.012689    14366  
2  0.005868    14366  
Loaded calibrate6072 headlines. Example:
        clickability_test_id             eyecatcher_id  \
0  546f889587942aedcb000048  546ebd5b92f391daa3000014   
1  546f889587942aedcb000048  546ebd5b92f3

In [27]:
from sentence_transformers import SentenceTransformer
import torch

# qwen_model_name = "Qwen/Qwen3-Embedding-8B"
qwen_model_name = "Qwen/Qwen3-Embedding-0.6B"


print(f"Trying SentenceTransformer({qwen_model_name}) ...")
qwen_model = SentenceTransformer(qwen_model_name)
print(f"Succesfully loaded SentenceTransformer({qwen_model_name}) ...")

batch_size = 64


print('Creating embedding for training data')
## ----- Train Embedding -----
# creating all the embeddings
texts = df_train['headline'].tolist()
all_embeddings = []
done = 0
for i in range(0, len(texts), batch_size):  
    batch_texts = texts[i:i+batch_size]
    if done >= 10000:
        print(f'Embedings done: {done}')
        done = 0
    # print(f'Embedings done: {done}')
    emb = qwen_model.encode(batch_texts)
    # ensure numpy
    all_embeddings.append(np.array(emb))
    done += batch_size
qwen_embeddings_train = np.vstack(all_embeddings)

# saving the embeddings and csv with embedding
qwen_path_npy = "qwen_embeddings_train.npy"
np.save(qwen_path_npy, qwen_embeddings_train)
print(f"Saved embeddings numpy to: {qwen_path_npy}")

qwen_path_csv = "qwen_embeddings_train.csv"
df_out = pd.DataFrame({'headline': df_train['headline'], 
                       'CTR': df_train['CTR'], 
                       'embedding': [json.dumps(e.tolist()) for e in qwen_embeddings_train]})
df_out.to_csv(qwen_path_csv, index=False)
print(f"Saved CSV to: {qwen_path_csv}")


print('Creating embedding for calibration data')
## ----- Calibrate Embedding -----
# creating all the embeddings
texts = df_calibrate['headline'].tolist()
all_embeddings = []
done = 0
for i in range(0, len(texts), batch_size):  
    batch_texts = texts[i:i+batch_size]
    if done >= 1000:
        print(f'Embedings done: {done}')
        done = 0
    emb = qwen_model.encode(batch_texts)
    # ensure numpy
    all_embeddings.append(np.array(emb))
    done += batch_size
qwen_embeddings_calibrate = np.vstack(all_embeddings)

# saving the embeddings and csv with embedding
qwen_path_npy = "qwen_embeddings_calibrate.npy"
np.save(qwen_path_npy, qwen_embeddings_calibrate)
print(f"Saved embeddings numpy to: {qwen_path_npy}")

qwen_path_csv = "qwen_embeddings_calibrate.csv"
df_out = pd.DataFrame({'headline': df_calibrate['headline'], 
                       'CTR': df_calibrate['CTR'], 
                       'embedding': [json.dumps(e.tolist()) for e in qwen_embeddings_calibrate]})
df_out.to_csv(qwen_path_csv, index=False)
print(f"Saved CSV to: {qwen_path_csv}")


print('Creating embedding for test data')
## ----- Test Embedding -----
# creating all the embeddings
texts = df_test['headline'].tolist()
all_embeddings = []
done = 0
for i in range(0, len(texts), batch_size):  
    batch_texts = texts[i:i+batch_size]
    if done >= 1000:
        print(f'Embedings done: {done}')
        done = 0
    emb = qwen_model.encode(batch_texts)
    # ensure numpy
    all_embeddings.append(np.array(emb))
    done += batch_size
qwen_embeddings_test = np.vstack(all_embeddings)

# saving the embeddings and csv with embedding
qwen_path_npy = "qwen_embeddings_test.npy"
np.save(qwen_path_npy, qwen_embeddings_test)
print(f"Saved embeddings numpy to: {qwen_path_npy}")

qwen_path_csv = "qwen_embeddings_test.csv"
df_out = pd.DataFrame({'headline': df_test['headline'], 
                       'CTR': df_test['CTR'], 
                       'embedding': [json.dumps(e.tolist()) for e in qwen_embeddings_test]})
df_out.to_csv(qwen_path_csv, index=False)
print(f"Saved CSV to: {qwen_path_csv}")

Trying SentenceTransformer(Qwen/Qwen3-Embedding-0.6B) ...
Succesfully loaded SentenceTransformer(Qwen/Qwen3-Embedding-0.6B) ...
Creating embedding for training data
Embedings done: 10048
Embedings done: 10048
Embedings done: 10048
Embedings done: 10048
Embedings done: 10048
Saved embeddings numpy to: qwen_embeddings_train.npy
Saved CSV to: qwen_embeddings_train.csv
Creating embedding for calibration data
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Saved embeddings numpy to: qwen_embeddings_calibrate.npy
Saved CSV to: qwen_embeddings_calibrate.csv
Creating embedding for test data
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Embedings done: 1024
Saved embeddings numpy to: qwen_embeddings_test.npy
Saved CSV to: qwen_embeddings_test.csv


In [28]:
import os
from pathlib import Path
import json
import numpy as np
import pandas as pd
import random
import joblib
import math
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt

# Paths
# OUT_DIR = Path("outputs")
# EMB_DIR = "/kaggle/input/qwen-embedding"
MODEL_DIR = Path("models")
TRAIN_DIR = Path("training")
TRAIN_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

Device: cuda


In [30]:
import numpy as np
train_data = np.load('/kaggle/working/qwen_embeddings_train.npy')
print(train_data)
print(train_data.shape)

calibrate_data = np.load('/kaggle/working/qwen_embeddings_calibrate.npy')
print(calibrate_data)
print(calibrate_data.shape)

test_data = np.load('/kaggle/working/qwen_embeddings_test.npy')
print(test_data)
print(test_data.shape)

[[-0.01979227  0.02034266 -0.00955267 ... -0.0189545   0.05996537
  -0.02946962]
 [ 0.02401167 -0.03049346 -0.01257041 ... -0.01376737  0.01509776
  -0.00046995]
 [-0.02812944 -0.00499259 -0.01171883 ...  0.04234135 -0.03107574
  -0.01004842]
 ...
 [ 0.00937955 -0.01251978 -0.01221728 ...  0.0040628   0.02104761
  -0.00253818]
 [-0.00240453 -0.02713586 -0.01258758 ...  0.00677357  0.00387503
  -0.01317829]
 [-0.00472617 -0.02527185 -0.01252851 ...  0.01094262  0.01490286
  -0.01516217]]
(53965, 1024)
[[ 0.00559677 -0.05313345 -0.00863181 ... -0.03989751  0.0113431
  -0.01605429]
 [ 0.02003668 -0.02326474 -0.01197334 ...  0.02004261  0.01724499
  -0.01138762]
 [-0.0063905  -0.04787905 -0.0114645  ... -0.01090019 -0.00906689
  -0.01790828]
 ...
 [-0.08983362  0.05888341 -0.00955247 ... -0.01174458 -0.05532799
  -0.01934313]
 [-0.08517613  0.03511509 -0.01249848 ... -0.02933976  0.00455961
  -0.03781397]
 [-0.06771341  0.06507223 -0.01150156 ... -0.00324242 -0.02727953
  -0.02608895]]
(60

In [32]:
npy_path_train = "/kaggle/working/qwen_embeddings_train.npy"
csv_path_train = "/kaggle/working/qwen_embeddings_train.csv"

npy_path_calibrate = "/kaggle/working/qwen_embeddings_calibrate.npy"
csv_path_calibrate = "/kaggle/working/qwen_embeddings_calibrate.csv"

npy_path_test = "/kaggle/working/qwen_embeddings_test.npy"
csv_path_test = "/kaggle/working/qwen_embeddings_test.csv"

# load the data
X_train = np.load(npy_path_train) # npy file contain only embeddings, 1024 dimensional embedding vector
X_valid = np.load(npy_path_calibrate)
X_test = np.load(npy_path_test)

df_train = pd.read_csv(csv_path_train) # contains headline, CTR, embedding
df_calibrate = pd.read_csv(csv_path_calibrate)
df_test = pd.read_csv(csv_path_test)

# headlines = df['headline'].astype(str).tolist()
# y = df['CTR'].values.astype(float)

print(f'Shape of the input X: {X.shape}')
print(f'shape of the CTR output y: {y.shape}')

Shape of the input X: (53965, 1024)
shape of the CTR output y: (53965,)


In [34]:
# ---------------------------------------------------------
# 1. MLP Class
# ---------------------------------------------------------
class MLPRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dims=[512,256], dropout=0.2, activation=nn.ReLU):
        super().__init__()
        layers = []
        prev = input_dim
        for h in hidden_dims:
            layers.append(nn.Linear(prev, h))
            layers.append(activation())
            layers.append(nn.Dropout(dropout))
            prev = h
        layers.append(nn.Linear(prev, 1))
        self.net = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.net(x).squeeze(-1)

In [36]:
# ---------------------------------------------------------
# 2. Training Function (Modified to accept specific data)
# ---------------------------------------------------------
def train_mlp_specific_data(X_train, y_train, X_val, y_val, 
                            headlines_val, # for saving predictions
                            run_name="experiment",
                            hidden_dims=[512, 256],
                            dropout=0.2,
                            batch_size=64,
                            lr=1e-3,
                            weight_decay=1e-5,
                            epochs=50,
                            patience=6,
                            checkpoint_every=5):
    
    print(f"[{run_name}] Train shape: {X_train.shape}, Val shape: {X_val.shape}")
    
    # Standard Scaling
    # Fit ONLY on training data, transform both
    x_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train.reshape(-1, 1))
    
    X_train_s = x_scaler.transform(X_train)
    X_val_s = x_scaler.transform(X_val)
    y_train_s = y_scaler.transform(y_train.reshape(-1, 1)).ravel()
    y_val_s = y_scaler.transform(y_val.reshape(-1, 1)).ravel()
    
    # Create output directory for this run
    run_dir = MODEL_DIR / run_name
    run_dir.mkdir(parents=True, exist_ok=True)
    
    # Datasets & Loaders
    train_ds = TensorDataset(torch.from_numpy(X_train_s).float(), torch.from_numpy(y_train_s).float())
    val_ds = TensorDataset(torch.from_numpy(X_val_s).float(), torch.from_numpy(y_val_s).float())
    
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    
    # Model Setup
    model = MLPRegressor(input_dim=X_train_s.shape[1], hidden_dims=hidden_dims, dropout=dropout).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.MSELoss()
    
    best_val_loss = float('inf')
    epochs_no_improve = 0
    history = []
    
    # Training Loop
    
    
    for epoch in range(1, epochs+1):
        model.train()
        train_losses = []
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        train_loss = np.mean(train_losses)
        
        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                pred = model(xb)
                loss = criterion(pred, yb)
                val_losses.append(loss.item())
        val_loss = np.mean(val_losses)
        
        history.append({"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss})
        
        # Checkpointing
        if val_loss < best_val_loss - 1e-8:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), run_dir / "best_model.pt")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"[{run_name}] Early stopping at epoch {epoch}")
                break
                
    # Save History and Metrics
    hist_df = pd.DataFrame(history)
    hist_df.to_csv(run_dir / "history.csv", index=False)
    
    # Load best model for final evaluation on validation set
    model.load_state_dict(torch.load(run_dir / "best_model.pt", weights_only=True))
    model.eval()
    
    with torch.no_grad():
        val_X_tensor = torch.from_numpy(X_val_s).float().to(DEVICE)
        pred_s = model(val_X_tensor).cpu().numpy()
        
    # Inverse transform to get original CTR scale
    pred_orig = y_scaler.inverse_transform(pred_s.reshape(-1,1)).ravel()
    
    mse = mean_squared_error(y_val, pred_orig)
    mae = mean_absolute_error(y_val, pred_orig)
    r2 = r2_score(y_val, pred_orig)
    
    metrics = {"mse": mse, "mae": mae, "r2": r2, "best_val_loss_scaled": best_val_loss}
    
    return metrics, run_dir, x_scaler, y_scaler, model

In [37]:
# ---------------------------------------------------------
# 3. Main Execution
# ---------------------------------------------------------

# --- A. Load Data ---
print("Loading data...")
npy_path_train = "/kaggle/working/qwen_embeddings_train.npy"
csv_path_train = "/kaggle/working/qwen_embeddings_train.csv"

npy_path_calibrate = "/kaggle/working/qwen_embeddings_calibrate.npy"
csv_path_calibrate = "/kaggle/working/qwen_embeddings_calibrate.csv"

npy_path_test = "/kaggle/working/qwen_embeddings_test.npy"
csv_path_test = "/kaggle/working/qwen_embeddings_test.csv"

X_train_full = np.load(npy_path_train)
X_calibrate = np.load(npy_path_calibrate)
X_test = np.load(npy_path_test)

df_train = pd.read_csv(csv_path_train)
df_calibrate = pd.read_csv(csv_path_calibrate)
df_test = pd.read_csv(csv_path_test)

# Extract Targets
y_train_full = df_train['CTR'].values.astype(float)
y_calibrate = df_calibrate['CTR'].values.astype(float)
y_test = df_test['CTR'].values.astype(float)

Loading data...


In [38]:
# --- B. Hyperparameter Tuning (Small Subset) ---
print("\n--- Starting Hyperparameter Tuning on Small Subset ---")

# Define Subset (e.g., first 20% of data for speed)
subset_size = int(len(X_train_full) * 0.2)
X_train_sub = X_train_full[:subset_size]
y_train_sub = y_train_full[:subset_size]

# Hyperparameter Grid
param_grid = [
    {"hidden_dims": [512, 256], "lr": 1e-3, "dropout": 0.2},
    {"hidden_dims": [256, 128], "lr": 1e-3, "dropout": 0.1},
    {"hidden_dims": [512, 256], "lr": 5e-4, "dropout": 0.2},
]

best_params = None
best_score = float('inf') # Minimizing MSE

tuning_results = []

for i, params in enumerate(param_grid):
    run_name = f"tune_run_{i}"
    print(f"\nTesting params: {params}")
    
    # Validate on Calibrate set
    metrics, _, _, _, _ = train_mlp_specific_data(
        X_train_sub, y_train_sub, 
        X_calibrate, y_calibrate,
        headlines_val=df_calibrate['headline'],
        run_name=run_name,
        hidden_dims=params['hidden_dims'],
        lr=params['lr'],
        dropout=params['dropout'],
        epochs=15 # Reduced epochs for tuning
    )
    
    tuning_results.append({**params, **metrics})
    print(f"Result: MSE={metrics['mse']:.6f}")
    
    if metrics['mse'] < best_score:
        best_score = metrics['mse']
        best_params = params

print("\nBest Parameters found:", best_params)
pd.DataFrame(tuning_results).to_csv(TRAIN_DIR / "tuning_results.csv", index=False)


--- Starting Hyperparameter Tuning on Small Subset ---

Testing params: {'hidden_dims': [512, 256], 'lr': 0.001, 'dropout': 0.2}
[tune_run_0] Train shape: (10793, 1024), Val shape: (6072, 1024)
[tune_run_0] Early stopping at epoch 13
Result: MSE=0.000093

Testing params: {'hidden_dims': [256, 128], 'lr': 0.001, 'dropout': 0.1}
[tune_run_1] Train shape: (10793, 1024), Val shape: (6072, 1024)
[tune_run_1] Early stopping at epoch 7
Result: MSE=0.000094

Testing params: {'hidden_dims': [512, 256], 'lr': 0.0005, 'dropout': 0.2}
[tune_run_2] Train shape: (10793, 1024), Val shape: (6072, 1024)
[tune_run_2] Early stopping at epoch 13
Result: MSE=0.000096

Best Parameters found: {'hidden_dims': [512, 256], 'lr': 0.001, 'dropout': 0.2}


In [39]:
# --- C. Final Training (Full Data) ---
print("\n--- Retraining Best Model on Full Training Data ---")

final_metrics, final_model_dir, x_scaler_final, y_scaler_final, best_model = train_mlp_specific_data(
    X_train_full, y_train_full, 
    X_calibrate, y_calibrate,
    headlines_val=df_calibrate['headline'],
    run_name="final_best_model",
    hidden_dims=best_params['hidden_dims'],
    lr=best_params['lr'],
    dropout=best_params['dropout'],
    epochs=50 # Full epochs
)

print(f"Final Validation Metrics: {final_metrics}")


--- Retraining Best Model on Full Training Data ---
[final_best_model] Train shape: (53965, 1024), Val shape: (6072, 1024)
[final_best_model] Early stopping at epoch 13
Final Validation Metrics: {'mse': 7.794710857119753e-05, 'mae': 0.006212676950567204, 'r2': 0.31196912508675, 'best_val_loss_scaled': 0.5484905444477733}


In [40]:
# --- D. Final Test Set Evaluation ---
print("\n--- Evaluating on Test Set ---")

best_model.eval()
with torch.no_grad():
    # Transform test data using the scaler fitted on FULL training data
    X_test_s = x_scaler_final.transform(X_test)
    X_test_tensor = torch.from_numpy(X_test_s).float().to(DEVICE)
    
    # Predict
    test_preds_s = best_model(X_test_tensor).cpu().numpy()
    
    # Inverse transform
    test_preds_orig = y_scaler_final.inverse_transform(test_preds_s.reshape(-1, 1)).ravel()

# Calculate Test Metrics
mse_test = mean_squared_error(y_test, test_preds_orig)
r2_test = r2_score(y_test, test_preds_orig)

print(f"TEST SET RESULTS -> MSE: {mse_test:.6f}, R2: {r2_test:.4f}")

# Save Test Predictions
test_df_out = pd.DataFrame({
    "headline": df_test['headline'],
    "CTR_true": y_test,
    "CTR_pred": test_preds_orig
})
test_df_out.to_csv(TRAIN_DIR / "final_test_predictions.csv", index=False)
print("Saved final test predictions to", TRAIN_DIR / "final_test_predictions.csv")


--- Evaluating on Test Set ---
TEST SET RESULTS -> MSE: 0.000075, R2: 0.3273
Saved final test predictions to training/final_test_predictions.csv
