In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from pathlib import Path
import json
from utils.data_loading import (
    load_enron_emails,
    load_invoices,
    #load_cuad_full_contracts,
    load_arxiv_from_jsonl,
    load_invoices_kaggle
)
from utils.balancing import cap_per_class
from utils.paths import *

In [None]:
# ---------- CONFIG ----------
# Character-length bounds for a usable document.
# NOTE(review): neither bound is referenced by any cell shown in this notebook;
# confirm whether the loaders in utils.data_loading apply their own length filter.
MIN_CHARS = 200    # anything shorter is treated as junk
MAX_CHARS = 5_000  # anything longer is treated as too long

# Upper bound on the number of rows kept per doc_type when balancing classes.
MAX_DOCS_PER_CLASS = 20_000

In [3]:
# Input location of each raw corpus.
# The RAW_* names are not defined in this notebook; presumably they are
# supplied by the star import from utils.paths (verify there).
ARXIV_JSONL = RAW_ARXIV
ENRON_CSV = RAW_ENRON
INVOICE_CSV = RAW_INVOICES
INVOICE_2_CSV = RAW_INVOICES_2
# The CUAD contract corpus is switched off for this run:
#FULL_CONTRACT = RAW / "CUAD" / "FULL_CONTRACT_TXT"


In [4]:
# Read every raw corpus through its loader from utils.data_loading.
# The call order is kept stable because each loader prints progress messages.
emails_df = load_enron_emails(ENRON_CSV)
invoices_df = load_invoices(INVOICE_CSV)
invoices_2_df = load_invoices_kaggle(INVOICE_2_CSV)
papers_df = load_arxiv_from_jsonl(
    ARXIV_JSONL,
    max_rows=20000,  # only the arXiv loader is given a row limit
)
# CUAD contract loading is switched off for this run:
#legal_df    = load_cuad_full_contracts(FULL_CONTRACT)

Loading Enron emails...
Enron after cleaning: (169269, 3)
Loading invoices...
Invoices after cleaning: (10000, 3)
Loading Kaggle invoices from C:\Users\viach\Documents\doc_class\datasets\raw\invoices_2.csv …
Kaggle invoices after cleaning: (50000, 3)
Loading arXiv JSONL...
arXiv after cleaning: (19687, 3)


In [6]:
# ---------------- BUILD UNIFIED DATASET ----------------
# Stack the per-source frames into a single frame with a fresh 0..n-1 index.
# The CUAD variant of this list was:
#dfs = [emails_df, invoices_df, legal_df, papers_df]
dfs = [
    emails_df,
    invoices_df,
    invoices_2_df,
    papers_df,
]
df_all = pd.concat(dfs, ignore_index=True)

In [7]:
# Final cleaning pass over the combined frame:
#   1. drop rows where the text or the label is missing,
#   2. keep only the first occurrence of each distinct text.
df_all = (
    df_all
    .dropna(subset=["text", "doc_type"])
    .drop_duplicates(subset=["text"])
)

# Per-class row counts going into the balancing step.
print("Combined before balancing:", df_all["doc_type"].value_counts())

Combined before balancing: doc_type
EMAIL               169269
INVOICE              60000
SCIENTIFIC_PAPER     19687
Name: count, dtype: int64


In [9]:
# Balance the classes by limiting each doc_type to MAX_DOCS_PER_CLASS rows.
# cap_per_class comes from utils.balancing; the fixed random_state is passed
# so that the selection is presumably repeatable between runs (verify in the helper).
df_all_bal = cap_per_class(df_all, label_col="doc_type", max_per_class=MAX_DOCS_PER_CLASS, random_state=42)

In [10]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
# This cell reads and overwrites df_all_bal, so running it a second time
# permutes the already-shuffled frame again and yields a different row order.
df_all_bal = (
    df_all_bal
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [11]:
# Per-class row counts after balancing; the header and the counts are printed
# on separate lines.
print("Combined AFTER balancing:", df_all_bal["doc_type"].value_counts(), sep="\n")

Combined AFTER balancing:
doc_type
INVOICE             20000
EMAIL               20000
SCIENTIFIC_PAPER    19687
Name: count, dtype: int64


In [12]:
# ------------------ Train / validation / test split ------------------

In [13]:
# Stratified 70 / 15 / 15 split of the balanced data into train, validation
# and test sets. train_test_split is already imported in the notebook's first
# cell, so it is not imported again here.
X = df_all_bal["text"].values
y = df_all_bal["doc_type"].values

# Step 1: hold out 30% of the rows as a temporary pool; the remaining 70% is
# the training set. Stratifying on y keeps the class proportions in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.30,
    random_state=42,
    stratify=y
)

# Step 2: cut the temporary pool in half, so validation and test each receive
# 0.5 * 30% = 15% of the total, again stratified by label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp
)

# Sanity check: the three splits together must account for every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X), "splits do not cover the dataset"

print("Train size:", len(X_train))
print("Val size:", len(X_val))
print("Test size:", len(X_test))



Train size: 41780
Val size: 8953
Test size: 8954


In [14]:
# Re-assemble each split into a two-column frame (text, doc_type).
train_df = pd.DataFrame({"text": X_train, "doc_type": y_train})
val_df = pd.DataFrame({"text": X_val, "doc_type": y_val})
test_df = pd.DataFrame({"text": X_test, "doc_type": y_test})

# Report the label distribution of every split, in train / val / test order.
for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"{split_name} label counts:\n", split_df["doc_type"].value_counts())


Train label counts:
 doc_type
INVOICE             14000
EMAIL               14000
SCIENTIFIC_PAPER    13780
Name: count, dtype: int64
Val label counts:
 doc_type
INVOICE             3000
EMAIL               3000
SCIENTIFIC_PAPER    2953
Name: count, dtype: int64
Test label counts:
 doc_type
EMAIL               3000
INVOICE             3000
SCIENTIFIC_PAPER    2954
Name: count, dtype: int64


In [16]:
#-----------SAVE--------------
# Destination CSV for each split, all under the PROCESSED directory.
train_path = PROCESSED / "doc_type_train.csv"
test_path  = PROCESSED / "doc_type_test.csv"
val_path = PROCESSED / "doc_type_val.csv"

# Write every split through the same call so all three files share one schema.
# index=False keeps the row index out of the file; previously the validation
# split was written without it and so gained an extra unnamed index column
# that the train and test files do not have.
for split_df, split_path in (
    (train_df, train_path),
    (test_df, test_path),
    (val_df, val_path),
):
    split_df.to_csv(split_path, index=False)