In [None]:
import pandas as pd
from utility.pdf_parser import get_adjusted_page_count, get_page_count


In [2]:
# Load the document-level table; each row carries at least a `doc_id` and an
# `arbitration_id`, which the following cells rely on.
data = pd.read_csv(filepath_or_buffer='data/unctad_document_level_data.csv')

In [3]:
# Derive both page-count columns by passing every `doc_id` to the project's
# PDF helpers. The helpers are opaque from here -- presumably they open the PDF
# belonging to that id, so this cell may be slow (TODO confirm).
data = data.assign(
    adjusted_page_count=data["doc_id"].apply(get_adjusted_page_count),
    page_count=data["doc_id"].apply(get_page_count),
)

In [4]:
# Persist the enriched table back to the same path it was loaded from
# (this overwrites the input file), without writing the row index.
data.to_csv(path_or_buf='data/unctad_document_level_data.csv', index=False)

In [None]:
# Collapse the document-level table into per-arbitration summaries.
# Every table built in this cell has exactly one row per `arbitration_id`
# group, so they can be joined on that key without losing arbitrations.
grouped = data.groupby("arbitration_id")

# Summary statistics of the adjusted page count per arbitration.
# `var`/`std` use pandas' default ddof=1, so an arbitration with a single
# counted document gets NaN for both.
adj_page_stats = grouped["adjusted_page_count"].agg(
    adj_page_sum="sum",
    adj_page_mean="mean",
    adj_page_var="var",
    adj_page_sd="std",
    adj_page_min="min",
    adj_page_max="max"
).reset_index()

# Same statistics for the raw page count.
page_stats = grouped["page_count"].agg(
    page_sum="sum",
    page_mean="mean",
    page_var="var",
    page_sd="std",
    page_min="min",
    page_max="max"
).reset_index()

# Carry over every `italaw_*` column; `GroupBy.first` takes the first
# non-null value of each column within the arbitration.
italaw_vars = [col for col in data.columns if col.startswith("italaw_")]
italaw_info = grouped[italaw_vars].first().reset_index()

# Document counts per arbitration. `GroupBy.count` counts non-null entries
# only, and -- unlike dropping the null rows before grouping -- it still emits
# a row with 0 for an arbitration that has no named / no linked document.
# Without that row the arbitration would be silently discarded by any inner
# join on these tables, and a 0 % availability could never be observed.
num_known_docs = grouped["doc_name"].count().reset_index(name="num_known_docs")
num_avail_docs = grouped["doc_link"].count().reset_index(name="num_avail_docs")

In [None]:
# Assemble the arbitration-level table by joining the per-arbitration pieces
# on `arbitration_id`. `merge` defaults to an inner join, so only ids present
# in every one of the five inputs end up in the result.
arbitration_level_data = (
    adj_page_stats
    .merge(italaw_info, on="arbitration_id")
    .merge(page_stats, on="arbitration_id")
    .merge(num_known_docs, on="arbitration_id")
    .merge(num_avail_docs, on="arbitration_id")
)

In [6]:
# Share of an arbitration's known documents that have a link. Despite the
# "pct" name this is a plain ratio (not multiplied by 100); a zero
# denominator yields NaN/inf rather than raising.
arbitration_level_data['pct_avail_docs'] = arbitration_level_data['num_avail_docs'].div(
    arbitration_level_data['num_known_docs']
)

In [7]:
# Write the arbitration-level summary to disk, omitting the row index.
arbitration_level_data.to_csv(
    path_or_buf='data/arbitration_level_document_metadata.csv',
    index=False,
)