# Model Training
Project: Legal Document Importance Prediction   
Objective: Train CatBoost regression model and generate predictions

NOTE: This notebook is for experimentation.   
Production code lives in the src/ directory.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

## 2. Load Feature Data

In [None]:
# Read the engineered train/test feature tables, in that order, from the
# processed-data directory (paths are relative to the notebook's location).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/train_features.csv",
        "../data/processed/test_features.csv",
    )
)

## 3. Target Preparation & Validation Training

In [None]:
import ast

def str_to_list(x):
    """Turn a stringified Python list (e.g. "['a', 'b']") into a real list.

    Parameters
    ----------
    x : object
        A CSV cell value: usually a string, but may be a list, NaN or None.

    Returns
    -------
    list
        The parsed list when ``x`` is a bracketed string that parses to a
        list, ``x`` itself when it already is a list, and ``[]`` for
        everything else (malformed strings, NaN, None, other types).
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        return []

    # Tolerate cells padded with whitespace around the brackets.
    candidate = x.strip()
    if not (candidate.startswith("[") and candidate.endswith("]")):
        return []

    try:
        parsed = ast.literal_eval(candidate)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval raises for malformed input are treated
        # as "not a list"; KeyboardInterrupt and friends still propagate.
        return []
    return parsed if isinstance(parsed, list) else []

def list_to_text(x):
    """Flatten a list into a single "; "-separated string.

    Parameters
    ----------
    x : object
        A list/tuple of items, a scalar, or a missing value.

    Returns
    -------
    str
        The items joined with "; " for a list or tuple (each item passed
        through ``str`` so numeric entries do not break the join), ``""`` for
        a missing scalar, and ``str(x)`` for any other value.
    """
    if isinstance(x, (list, tuple)):
        # str() each item: lists parsed from the CSV may hold ints/floats,
        # which str.join would reject with a TypeError.
        return "; ".join(str(item) for item in x)
    return "" if pd.isna(x) else str(x)

# Columns stored in the CSV as stringified Python lists (e.g. "['a', 'b']").
list_cols = ["Lead_Types", "Power_Mentions", "Agencies", "Tags"]
# Free-text columns handed to CatBoost as text features.
text_cols = ["Headline", "Reasoning", "Key_Insights"]

# Parse each stringified list and flatten it to "; "-joined text in one pass.
# NOTE(review): this rewrites train_df/test_df in place and is not idempotent.
# Re-running the cell sends the already-flattened text back through
# str_to_list, which returns [] for it and blanks the column. Reload the CSVs
# before re-running.
for frame in (train_df, test_df):
    for col in list_cols:
        frame[col] = frame[col].map(lambda cell: list_to_text(str_to_list(cell)))

# The model is trained on the score divided by 100.
train_df["target"] = train_df["Importance_Score"] / 100

X = train_df.drop(columns=["Importance_Score", "target", "id"])
y = train_df["target"]

# Clean the text columns BEFORE splitting. Assigning into the frames returned
# by train_test_split triggers pandas' SettingWithCopyWarning; X is a fresh
# frame from drop(), so writing to it is safe.
for col in text_cols:
    X[col] = X[col].fillna("").astype(str)
    test_df[col] = test_df[col].fillna("").astype(str)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE(review): after flattening, list_cols hold free text but are not listed
# in text_features or cat_features. CatBoost appears to treat undeclared
# columns as numeric, so confirm this is intended.
model = CatBoostRegressor(
    loss_function="RMSE",
    depth=8,
    learning_rate=0.03,
    n_estimators=3000,
    random_seed=42,
    verbose=200,
)

model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    text_features=text_cols,
    use_best_model=True
)

# Report the hold-out error explicitly, rescaled to Importance_Score units.
valid_rmse = np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))
print(f"Validation RMSE (Importance_Score units): {valid_rmse * 100:.3f}")

## 4. Freeze & Save the Model

In [None]:
# Persist `model` to catboost_final.cbm in the current working directory.
final_model_path = "catboost_final.cbm"
model.save_model(final_model_path)

In [None]:
import json

# Record the training column names, in order, next to the saved model.
feature_cols = X_train.columns.to_list()

with open("features.json", "w") as features_file:
    features_file.write(json.dumps(feature_cols))

## 5. Train on FULL TRAIN DATA

In [None]:
# Refit on every labelled row. No eval_set is passed to fit() here.
text_cols = ["Headline", "Reasoning", "Key_Insights"]

# Make sure the text features are plain strings with no missing values.
for frame in (train_df, test_df):
    for col in text_cols:
        frame[col] = frame[col].fillna("").astype(str)

# Everything except the raw score, the scaled target and the row id is a feature.
non_feature_cols = ["Importance_Score", "target", "id"]
X_full = train_df.drop(columns=non_feature_cols)
y_full = train_df["target"]

# Same hyper-parameters as the validation model.
full_model_params = {
    "loss_function": "RMSE",
    "depth": 8,
    "learning_rate": 0.03,
    "n_estimators": 3000,
    "random_seed": 42,
    "verbose": 200,
}
model_final = CatBoostRegressor(**full_model_params)
model_final.fit(X_full, y_full, text_features=text_cols)

model_final.save_model("catboost_full.cbm")

## 6. Load Test Features & Predict

In [None]:
# Load test features
test_features = pd.read_csv("../data/processed/test_features.csv")

text_cols = ["Headline", "Reasoning", "Key_Insights"]
list_cols = ["Lead_Types", "Power_Mentions", "Agencies", "Tags"]

# Ensure text columns are clean
for col in text_cols:
    test_features[col] = test_features[col].fillna("").astype(str)

# Drop id for prediction; copy so the edits below never touch test_features.
X_test = test_features.drop(columns=["id"]).copy()

# Apply the SAME list-column transform the training cells use
# (str_to_list -> list_to_text). Coercing these columns to numbers instead
# would turn every non-empty list into a value the model never saw in training.
for col in list_cols:
    X_test[col] = X_test[col].map(lambda cell: list_to_text(str_to_list(cell)))

# Remaining columns are numeric features. Unparseable cells become NaN and are
# left as NaN, because the training cells never fill numeric gaps either.
for col in X_test.columns.difference(text_cols + list_cols):
    X_test[col] = pd.to_numeric(X_test[col], errors="coerce")

# Align to the exact feature set and order the model was fitted on. This
# raises a KeyError if a training feature is missing from the test file.
X_test = X_test[model_final.feature_names_]

# Predict (scaled target)
test_preds_scaled = model_final.predict(X_test)

# Undo the /100 target scaling and keep predictions inside [0, 100].
test_preds = np.clip(test_preds_scaled * 100, 0, 100)

## 7. Create submission

In [None]:
# Pair every test id with its predicted score and write the submission file
# without the index column.
submission = test_features[["id"]].assign(Importance_Score=test_preds)
submission.to_csv("submission.csv", index=False)

## Summary
- This notebook focuses on training a regression model to predict document importance using engineered textual and structured features.
- It includes data preparation, train–validation splitting, CatBoost model training with text features, performance monitoring using RMSE, and final model training on the complete dataset for generating predictions.