# Feature Engineering
Project: Legal Document Importance Prediction     
Objective: Prepare model-ready features from the cleaned legal-document data

NOTE: This notebook is for experimentation.   
Production code lives in the src/ directory.

In [None]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

## 1. Load Clean Data

In [None]:
# Read the cleaned train/test splits that every feature cell below extends.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/train_clean.csv",
        "../data/processed/test_clean.csv",
    )
)

## 2. Base Helper Metrics

In [None]:
# Character lengths of the three free-text fields. Later cells use them as
# denominators for the ratio features and as an input to the intent score.
#
# Missing text is measured as an empty string (length 0) instead of NaN.
# A NaN length would propagate into every ratio built on it and into
# intent_score, where map_intent_category's comparisons are all False for NaN
# and the row would fall through to the final "allegational" branch.
_length_features = {
    "headline_len": "Headline",
    "insight_len": "Key_Insights",
    "reason_len": "Reasoning",
}

for frame in (train_df, test_df):
    for length_col, text_col in _length_features.items():
        # The text columns themselves are left untouched; only the derived
        # length is computed on the filled copy.
        frame[length_col] = frame[text_col].fillna("").astype(str).str.len()

## 3. Features

### 3.1 Power Density

In [None]:
# len(Power_Mentions) divided by the insight length; the +1 keeps the
# denominator non-zero for empty insights.
# NOTE(review): Power_Mentions is loaded by read_csv without converters, so
# len() presumably measures the stored text rather than the number of
# mentions - confirm this is intended (a num_power_mentions column is used
# elsewhere in this notebook).
# NOTE: section 3.18 assigns "power_density" again and replaces these values.
for frame in (train_df, test_df):
    mention_size = frame["Power_Mentions"].apply(len)
    insight_size = frame["insight_len"] + 1
    frame["power_density"] = mention_size / insight_size

### 3.2 Agency Density

In [None]:
# len(Agencies) divided by the insight length; the +1 keeps the denominator
# non-zero for empty insights.
# NOTE(review): Agencies is loaded by read_csv without converters, so len()
# presumably measures the stored text, not a count of agencies - confirm.
for frame in (train_df, test_df):
    agency_size = frame["Agencies"].apply(len)
    insight_size = frame["insight_len"] + 1
    frame["agency_density"] = agency_size / insight_size

### 3.3 Institutional Index

In [None]:
# Sum of len(Agencies) and len(Lead_Types) for each row, on both splits.
for frame in (train_df, test_df):
    agency_size = frame["Agencies"].apply(len)
    lead_size = frame["Lead_Types"].apply(len)
    frame["institutional_index"] = agency_size + lead_size

### 3.4 Lead Complexity Ratio

In [None]:
# len(Lead_Types) relative to len(Power_Mentions); the +1 keeps the
# denominator non-zero when Power_Mentions is empty.
for frame in (train_df, test_df):
    lead_size = frame["Lead_Types"].apply(len)
    mention_size = frame["Power_Mentions"].apply(len)
    frame["lead_complexity_ratio"] = lead_size / (mention_size + 1)

### 3.5 Insight Concentration

In [None]:
# Insight length relative to reasoning length; the +1 keeps the denominator
# non-zero for empty reasoning text.
for frame in (train_df, test_df):
    reasoning_size = frame["reason_len"] + 1
    frame["insight_concentration"] = frame["insight_len"] / reasoning_size

### 3.6 Entity Insight Ratio

In [None]:
# len(Power_Mentions) divided by (insight length + 1).
# NOTE: this is the same formula section 3.1 uses for "power_density"; that
# column is reassigned in section 3.18, so this one keeps the ratio.
for frame in (train_df, test_df):
    mention_size = frame["Power_Mentions"].apply(len)
    insight_size = frame["insight_len"] + 1
    frame["entity_insight_ratio"] = mention_size / insight_size

### 3.7 Tag Lead Alignment

In [None]:
# num_tags relative to len(Lead_Types); the +1 keeps the denominator non-zero
# when Lead_Types is empty.
for frame in (train_df, test_df):
    lead_size = frame["Lead_Types"].apply(len) + 1
    frame["tag_lead_alignment"] = frame["num_tags"] / lead_size

### 3.8 Actionability Proxy

In [None]:
# Unweighted sum of len() over the three entity columns, on both splits.
for frame in (train_df, test_df):
    mention_size = frame["Power_Mentions"].apply(len)
    agency_size = frame["Agencies"].apply(len)
    lead_size = frame["Lead_Types"].apply(len)
    frame["actionability_proxy"] = mention_size + agency_size + lead_size

### 3.9 Information Density

In [None]:
# Combined insight + reasoning length per headline character; the +1 keeps
# the denominator non-zero for empty headlines.
for frame in (train_df, test_df):
    body_size = frame["insight_len"] + frame["reason_len"]
    headline_size = frame["headline_len"] + 1
    frame["information_density"] = body_size / headline_size

### 3.10 Importance Prior

In [None]:
# Target-mean encoding of num_power_mentions.
#
# Train rows receive the mean Importance_Score of their num_power_mentions
# bucket. Test rows must receive the *same* bucket statistic (learned on train
# only); giving every test row one global constant makes the feature mean
# something different at inference time than it did during training.
train_df["importance_prior"] = train_df.groupby("num_power_mentions")["Importance_Score"].transform("mean")

# Per-bucket means learned from the training split, indexed by bucket value.
prior_by_mentions = train_df.groupby("num_power_mentions")["Importance_Score"].mean()

# Fallback for test buckets that never occur in train (map() yields NaN there).
global_prior = train_df["Importance_Score"].mean()
test_df["importance_prior"] = (
    test_df["num_power_mentions"].map(prior_by_mentions).fillna(global_prior)
)

# NOTE(review): the train-side value is an in-sample mean, so each training
# row's own target contributes to its feature. Consider out-of-fold encoding
# if this feature looks suspiciously strong in validation.

### 3.11 Intent Score 

In [None]:
def compute_intent_score(row):
    """Return the raw intent score for one record.

    `row` is anything indexable by column name (e.g. a DataFrame row).
    The score is a weighted sum: lead types count double, agencies count
    triple, power mentions count once, and the insight length contributes
    one point per 150 characters.
    """
    lead_part = row['num_lead_types'] * 2
    agency_part = row['num_agencies'] * 3
    mention_part = row['num_power_mentions']
    length_part = row['insight_len'] / 150
    # sum() starts from 0 and adds left to right, same order as the weights above.
    return sum((lead_part, agency_part, mention_part, length_part))

### 3.12 Intent Category

In [None]:
def map_intent_category(score):
    """Bucket a raw intent score into a named category.

    Upper bounds are inclusive: <=1 contextual, <=3 informational,
    <=6 disclosure, <=9 analytical, anything else allegational.
    A score that fails every comparison (including NaN) lands in the
    last bucket.
    """
    inclusive_upper_bounds = (
        (1, "contextual"),
        (3, "informational"),
        (6, "disclosure"),
        (9, "analytical"),
    )
    for upper_bound, category in inclusive_upper_bounds:
        if score <= upper_bound:
            return category
    return "allegational"

### 3.13 Ordinal Encode the Category

In [None]:
# Intent categories listed from lowest to highest; a category's position in
# this tuple is its ordinal code.
_intent_order = (
    "contextual",
    "informational",
    "disclosure",
    "analytical",
    "allegational",
)
intent_encoding = {category: rank for rank, category in enumerate(_intent_order)}

### 3.14 Investigative Intent Extraction

In [None]:
# Run the three-step intent pipeline on each split:
#   raw score -> named category -> ordinal level used for modeling.
for frame in (train_df, test_df):
    frame['intent_score'] = frame.apply(compute_intent_score, axis=1)
    frame['intent_category'] = frame['intent_score'].apply(map_intent_category)
    frame['intent_level'] = frame['intent_category'].map(intent_encoding)

### 3.15 SBERT Semantic Alignment

In [None]:
def _rowwise_cosine(left, right):
    """Return the cosine similarity between matching rows of two 2-D arrays.

    Row i of the result compares left[i] with right[i]. Rows where either
    vector has zero norm get a similarity of 0.0 instead of dividing by zero.
    """
    left = np.asarray(left, dtype=float)
    right = np.asarray(right, dtype=float)
    dots = np.einsum("ij,ij->i", left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)


def _add_semantic_alignment(frame, encoder):
    """Add a "semantic_alignment" column to `frame` in place.

    The value is the cosine similarity between the sentence embeddings of
    each row's Headline and Key_Insights. Missing text in those two columns
    is replaced with "" (and the columns cast to str) before encoding.
    """
    for text_col in ("Headline", "Key_Insights"):
        frame[text_col] = frame[text_col].fillna("").astype(str)

    if len(frame) == 0:
        # Nothing to encode; keep the column present so both splits share a schema.
        frame["semantic_alignment"] = np.zeros(0)
        return

    headline_emb = encoder.encode(frame["Headline"].tolist())
    insight_emb = encoder.encode(frame["Key_Insights"].tolist())
    # One vectorised pass instead of one pairwise-similarity call per row.
    frame["semantic_alignment"] = _rowwise_cosine(headline_emb, insight_emb)


# The same encoder instance is used for both splits.
model = SentenceTransformer("paraphrase-MiniLM-L3-v2")

for frame in (train_df, test_df):
    _add_semantic_alignment(frame, model)

### 3.16 Temporal Evidence Score

In [None]:
# Matches standalone four-digit tokens that start with 19 or 20 (1900-2099).
date_pattern = r"\b(19|20)\d{2}\b"

# Year-like tokens in the reasoning text per character of reasoning; the +1
# keeps the denominator non-zero for empty text.
for frame in (train_df, test_df):
    year_hits = frame["Reasoning"].str.count(date_pattern)
    frame["temporal_density"] = year_hits / (frame["reason_len"] + 1)

### 3.17 Legal Trigger Count

In [None]:
# Lower-case stems; each is matched as a substring of the reasoning text.
legal_terms = ["foia", "indict", "prosecut", "sanction", "oversight", "plea", "probe"]


def _count_legal_triggers(text):
    """Return how many entries of `legal_terms` occur in `text`.

    Each term counts at most once, however often it appears. Matching is
    case-insensitive because the terms are stored in lower case, so "FOIA"
    and "Indicted" are counted. Non-string values (e.g. NaN for missing
    text) count as 0 instead of raising TypeError on the `in` test.
    """
    if not isinstance(text, str):
        return 0
    lowered = text.lower()
    # NOTE(review): plain substring matching means "plea" also hits words such
    # as "please"; switch to word-boundary matching if that matters.
    return sum(term in lowered for term in legal_terms)


for frame in (train_df, test_df):
    frame["legal_trigger_count"] = frame["Reasoning"].apply(_count_legal_triggers)

### 3.18 Power Density Score (replaces the `power_density` column from 3.1)

In [None]:
# Power mentions per unit of log-scaled text length (headline + insight).
# NOTE: this reassigns "power_density", replacing the values from section 3.1.
for frame in (train_df, test_df):
    log_text_len = np.log1p(frame["headline_len"] + frame["insight_len"])
    density = frame["num_power_mentions"] / log_text_len
    # log1p(0) == 0, so a row with no headline and no insight text would divide
    # by zero and produce inf/NaN. Define the density of such rows as 0.0.
    # Rows whose inputs are genuinely missing (NaN) are left as NaN.
    frame["power_density"] = density.mask(log_text_len == 0, 0.0)

### 3.19 Quick Validation

In [None]:
# Spot-check the intent-score inputs and outputs on the first training rows.
intent_preview_cols = [
    'num_lead_types',
    'num_power_mentions',
    'num_agencies',
    'insight_len',
    'intent_score',
    'intent_category',
    'intent_level',
]
train_df.loc[:, intent_preview_cols].head()

In [None]:
# Spot-check the intent-score inputs and outputs on the first test rows.
intent_preview_cols = [
    'num_lead_types',
    'num_power_mentions',
    'num_agencies',
    'insight_len',
    'intent_score',
    'intent_category',
    'intent_level',
]
test_df.loc[:, intent_preview_cols].head()

## 4. Remove Non-Modeling Columns

In [None]:
# intent_category is the string label; its ordinal code (intent_level) stays,
# so the label itself is removed from both splits in place.
for frame in (train_df, test_df):
    frame.drop(columns=['intent_category'], inplace=True)

## 5. Export Processed Features

In [None]:
# Write each split next to its cleaned input, without the index column.
for frame, out_path in (
    (train_df, "../data/processed/train_features.csv"),
    (test_df, "../data/processed/test_features.csv"),
):
    frame.to_csv(out_path, index=False)