In [1]:
import pandas as pd
from pathlib import Path

# Training rows (with their precomputed embedding columns) are read from
# the "processed" folder under part_path.
part_path = Path("part-1")
train_csv_path = f"{part_path}/processed/train-embeddings.csv"
df = pd.read_csv(train_csv_path)

In [2]:
from ast import literal_eval

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
from dotenv import load_dotenv
# Called before the OpenAI client is constructed — presumably so that
# credentials kept in a local .env file are visible in the environment;
# TODO confirm.
load_dotenv()
from openai import OpenAI
import numpy as np

# NOTE(review): `client` is not referenced by any of the cells visible here.
client = OpenAI()

def normalize_l2(x):
    """Scale vectors to unit L2 length, leaving all-zero vectors unchanged.

    Parameters
    ----------
    x : array-like
        A single vector (1-D) or a batch of vectors. For inputs that are not
        1-D the norm is taken along axis 1.

    Returns
    -------
    numpy.ndarray
        ``x`` divided by its L2 norm. A vector whose norm is exactly zero is
        returned as-is rather than divided.
    """
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm

    norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
    # Replace zero norms with 1 before dividing so zero rows pass through
    # unchanged without ever evaluating 0/0. The previous
    # np.where(norm == 0, x, x / norm) form computed the division for every
    # row, which emitted a RuntimeWarning (and raises under
    # np.errstate(invalid="raise")) whenever a zero row was present.
    norm[norm == 0] = 1
    return x / norm



In [4]:
def process_embeddings(embedding_series, dim=512):
    """Parse, truncate and L2-normalize a collection of embeddings.

    Parameters
    ----------
    embedding_series : iterable
        One entry per embedding. Each entry is either a string holding a
        Python list literal (parsed with ``ast.literal_eval``) or an
        already-parsed list, tuple or ``numpy.ndarray``.
    dim : int, optional
        Number of leading components kept from each embedding before it is
        normalized. Defaults to 512, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        The normalized, truncated embeddings stacked with ``np.array``, in
        input order.
    """
    def _parse(emb):
        # literal_eval rejects objects that are already sequences, so pass
        # those through; everything else is parsed exactly as before.
        if isinstance(emb, (list, tuple, np.ndarray)):
            return emb
        return literal_eval(emb)

    return np.array([normalize_l2(_parse(emb)[:dim]) for emb in embedding_series])

In [5]:
# Run both embedding columns of the training frame through the same
# processing step, first "embedding" and then "embedding_b".
embeddings_a, embeddings_b = (
    process_embeddings(df[column]) for column in ("embedding", "embedding_b")
)

In [6]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Feature matrix: the two processed embedding blocks placed side by side,
# one row per training example.
features = np.hstack([
    embeddings_a,
    embeddings_b
])

# Initialize model
rfr = RandomForestRegressor(n_estimators=100, random_state=42)

# Perform k-fold cross validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Score both metrics from a single cross-validation pass. Two separate
# cross_val_score calls fitted the same forests on the same folds twice.
# NOTE: cross-validation fits clones of `rfr`, so `rfr` itself still has to
# be fitted before it can be used for prediction.
cv_results = cross_validate(
    rfr, features, df.weight_a,
    scoring=('neg_mean_squared_error', 'neg_mean_absolute_error'),
    cv=kf,
)
# The scorers are negated errors; flip the sign to report plain MSE / MAE.
mse_scores = -cv_results['test_neg_mean_squared_error']
mae_scores = -cv_results['test_neg_mean_absolute_error']

# Print results (the +/- figure is two standard deviations across folds)
print(f"Cross-validation results ({n_folds} folds):")
print(f"MSE: {mse_scores.mean():.6f} (+/- {mse_scores.std() * 2:.6f})")
print(f"MAE: {mae_scores.mean():.6f} (+/- {mae_scores.std() * 2:.6f})")

Cross-validation results (5 folds):
MSE: 0.023058 (+/- 0.002901)
MAE: 0.116612 (+/- 0.006274)


In [18]:
# Load the processed test file that predictions are generated for.
df_test = pd.read_csv(filepath_or_buffer="data/test-part-1-processed.csv")

In [None]:
# Fit on the full training set first: cross-validation only fits clones of
# the estimator, so `rfr` is still unfitted at this point and predict()
# would raise NotFittedError.
rfr.fit(features, df.weight_a)

# df_test comes from read_csv, so its embedding columns hold strings. Run
# them through the same parse / truncate / normalize step used for the
# training features so the model sees inputs of the same width and scale.
test_features = np.hstack([
    process_embeddings(df_test.embedding),
    process_embeddings(df_test.embedding_b)
])
preds = rfr.predict(test_features)
df_test['predicted_weight'] = preds
df_test.to_csv("data/test-part-1-processed-predicted.csv", index=False)
