# Data Preprocessing  
Project: Legal Document Importance Prediction     
Objective: Prepare clean, model-ready features from raw legal documents

NOTE: This notebook is for experimentation.   
Production code lives in the src/ directory.

## Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import re

## 1. Load the Raw Dataset

In [None]:
# Read the raw train and test splits from disk
train_df = pd.read_csv("../data/raw/train.csv")
test_df = pd.read_csv("../data/raw/test.csv")

# Normalise the headers of both frames the same way: trim surrounding
# whitespace first, then swap spaces for underscores
for raw_frame in (train_df, test_df):
    trimmed_names = raw_frame.columns.str.strip()
    raw_frame.columns = trimmed_names.str.replace(" ", "_")

## 2. Initial Dataset Shape & Sanity Check

In [None]:
# Show the (rows, columns) dimensions of the raw training frame
train_df.shape

In [None]:
# Show the (rows, columns) dimensions of the raw test frame
test_df.shape

In [None]:
# Print a per-column summary (dtype and non-null count) of the raw training frame
train_df.info()

In [None]:
# Print a per-column summary (dtype and non-null count) of the raw test frame
test_df.info()

## 3. Create a Working Copy

In [None]:
# Clean on copies so the frames loaded from disk stay untouched
train_clean, test_clean = train_df.copy(), test_df.copy()

## 4. Handle Missing Values (Based on EDA)

### 4.1 Text Columns - Fill with Empty Strings

In [None]:
# Free-text columns: a missing value is replaced by an empty string
text_cols = ["Headline", "Reasoning", "Key_Insights", "Tags"]

for frame in (train_clean, test_clean):
    frame[text_cols] = frame[text_cols].fillna("")

### 4.2 List-like Metadata Columns - Fill Missing Values with Empty Strings

In [None]:
# Delimited metadata columns: a missing value is replaced by an empty string
list_cols = ["Lead_Types", "Power_Mentions", "Agencies"]

for frame in (train_clean, test_clean):
    frame[list_cols] = frame[list_cols].fillna("")

## 5. Fix Text Encoding Issues

In [None]:
def fix_encoding(text):
    """Replace mis-encoded dash sequences in ``text`` with a plain hyphen.

    Every occurrence of one of the three garbled sequences listed in the
    pattern below (each starts with "â€") is substituted by "-".
    Presumably these are dashes that were decoded with the wrong codec;
    confirm against the raw data.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with each matched sequence replaced by "-".
    """
    garbled_dashes = r"â€‘|â€“|â€”"
    cleaned = re.sub(garbled_dashes, "-", text)
    return cleaned

# Repair dash encoding in the three main text fields, train frame first
for frame in (train_clean, test_clean):
    for col in ("Headline", "Key_Insights", "Reasoning"):
        frame[col] = frame[col].apply(fix_encoding)

## 6. Normalize Text Fields

In [None]:
# Lowercase, then trim surrounding whitespace, in the three main text fields
for frame in (train_clean, test_clean):
    for col in ("Headline", "Key_Insights", "Reasoning"):
        lowered = frame[col].str.lower()
        frame[col] = lowered.str.strip()

## 7. Parse List-like Columns

In [None]:
def split_to_list(text):
    """Split a ';'-separated string into a list of trimmed, non-empty items.

    Empty tokens are discarded, so a trailing or doubled separator
    ("a;;b;") and whitespace-only input ("   ") do not produce "" entries
    that would inflate any item count derived from the result.

    Parameters
    ----------
    text : str
        The delimited string; "" yields an empty list.

    Returns
    -------
    list of str
        The items in their original order, each stripped of surrounding
        whitespace.
    """
    trimmed_items = (item.strip() for item in text.split(";"))
    # An empty string splits to [""], which this filter also removes,
    # so no separate empty-input guard is needed.
    return [item for item in trimmed_items if item]

# Turn each delimited metadata column into a column of lists, train frame first
for frame in (train_clean, test_clean):
    for col in list_cols:
        frame[col] = frame[col].apply(split_to_list)

## 8. Feature Creation (From EDA Insights)

### 8.1 Text Length Features

In [None]:
# New feature name -> text column whose character length it records
length_features = {"headline_len": "Headline", "insight_len": "Key_Insights"}

for frame in (train_clean, test_clean):
    for feature_name, source_col in length_features.items():
        frame[feature_name] = frame[source_col].str.len()

### 8.2 Metadata Count Features

In [None]:
def _count_tags(tags):
    """Return how many non-empty entries a ';'-separated tag string holds.

    Entries that are empty after trimming whitespace are not counted, so a
    trailing or doubled separator ("a;;b;") or a whitespace-only value does
    not inflate the count. Falsy input (e.g. "") counts as 0.
    """
    if not tags:
        return 0
    return sum(1 for tag in tags.split(";") if tag.strip())


for frame in (train_clean, test_clean):
    # These three columns are expected to hold lists already (they are
    # parsed in Section 7), so len() gives the number of items.
    frame["num_lead_types"] = frame["Lead_Types"].apply(len)
    frame["num_power_mentions"] = frame["Power_Mentions"].apply(len)
    frame["num_agencies"] = frame["Agencies"].apply(len)
    # Tags is still a raw delimited string here, so count its entries directly.
    frame["num_tags"] = frame["Tags"].apply(_count_tags)

## 9. Drop Non-Predictive Columns

In [None]:
# Remove the source-file column from both frames, modifying them in place
for frame in (train_clean, test_clean):
    frame.drop(columns=["Source_File"], inplace=True)

## 10. Final Preprocessing Check

In [None]:
# Final check: per-column dtype and non-null count of the processed training frame
train_clean.info()

In [None]:
# Final check: per-column dtype and non-null count of the processed test frame
test_clean.info()

## 11. Save Preprocessed Dataset

In [None]:
# Each processed frame paired with the CSV path it is written to
outputs = (
    (train_clean, "../data/processed/train_clean.csv"),
    (test_clean, "../data/processed/test_clean.csv"),
)

# NOTE(review): columns holding Python lists are presumably written as their
# string representation, so they come back as plain strings when the CSV is
# reloaded -- confirm downstream code re-parses them.
for frame, destination in outputs:
    frame.to_csv(destination, index=False)  # index=False: omit the row index

## Summary
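Missing values in the text and list-like metadata columns were filled with empty strings, mis-encoded dash sequences were repaired, and the main text fields were lowercased and trimmed. List-like metadata was split into Python lists, and key length/count features were engineered based on EDA insights. The non-informative `Source_File` column was removed, leaving a cleaned dataset ready for feature extraction and importance score prediction.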