In [None]:
import pandas as pd
from janitor import clean_names
from numpy import nan

# Input selector for this notebook. Supported values:
#   "excel"   -> read the UNCTAD Excel workbook (traditional method)
#   "scraped" -> read the CSV scraped from the UNCTAD website
#                (run 00_scrape_unctad.ipynb first to produce it)
DATA_SOURCE = 'excel'

In [None]:
# Build `data` from whichever input DATA_SOURCE selects; reject anything else up front.
if DATA_SOURCE not in ("excel", "scraped"):
    raise ValueError(f"Unknown DATA_SOURCE: {DATA_SOURCE}. Use 'excel' or 'scraped'.")

if DATA_SOURCE == "excel":
    # UNCTAD Excel workbook; column names are normalised by pyjanitor's clean_names.
    data = clean_names(
        pd.read_excel('data/UNCTAD-ISDS-Navigator-data-set-31December2023.xlsx')
    )
else:
    # Scraped website data: keep only rows whose fetch succeeded, drop the status
    # column, and rename the link column to the name the Excel path produces.
    data = (
        pd.read_csv('data/unctad_cases.csv')
        .loc[lambda cases: cases['fetch_status'] == 'success']
        .drop(columns=['fetch_status'])
        .rename(columns={'italaw_link': 'link_to_italaws_case_page'})
    )

print(f"Loaded {len(data)} cases from {DATA_SOURCE} source")

In [None]:
# Clean up the ITA Law link column.
# The summary print used to sit outside the column-existence guard, so a frame
# without the column raised KeyError even though the guard was meant to
# tolerate it. Both outcomes are now reported explicitly.
link_col = 'link_to_italaws_case_page'
if link_col in data.columns:
    # Replace the literal "Not available" placeholder with NaN so that
    # notna() counts only real links.
    data[link_col] = data[link_col].replace('Not available', nan)
    print(f"Cases with ITA Law link: {data[link_col].notna().sum()}")
else:
    # Nothing to clean; say so instead of failing on the missing column.
    print(f"Column '{link_col}' not found; skipping ITA Law link clean-up")

In [4]:
# Write the cleaned case table to CSV, leaving out the DataFrame's row index.
data.to_csv(path_or_buf='data/unctad_clean.csv', index=False)