<a href="https://colab.research.google.com/github/JovannyReb/etch-a-sketch/blob/main/xlm_csv_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Handle the xml_data
!unzip -q xml_data.zip

In [None]:
!unzip -q respondant_xml.zip

In [None]:
import os
import glob
import xml.etree.ElementTree as ET
import pandas as pd

# Define source and target folders
xml_folder = 'xml_data'
csv_folder = 'truth_data'

# Lookup table from the raw typeId found in the XML to the label written to the
# 'Truth' column. Built once here rather than on every loop iteration.
# Several typeIds deliberately share one label (e.g. the RPT-*/LTR-EUO entries
# all map to '11 (PEER-REV)').
# NOTE(review): 'ABEY1' and 'DOC-WRKCMP' both carry the numeric prefix 29 with
# different names - confirm that this is intended.
doc_type_map = {
    'ABEY1': '29 (DOC-ABEYAG)',
    'BILL-MED': '03 (BILL-MED)',
    'COURT-DOC': '47 (COURT-DOC)',
    'COVERLETTER': '999 (COVERSHEET)',
    'DOC-AFF': '14 (DOC-AFF)',
    'DOC-ASN': '05 (DOC-ASN)',
    'DOC-BRIEFS': '01 (DOC-BRIEFS)',
    'DOC-CTORD': '31 (DOC-CTORD)',
    'DOC-EVD': '18 (DOC-EVD)',
    'DOC-INSPOL': '15 (DOC-INSPOL)',
    'DOC-LIEN': '24 (DOC-LIEN)',
    'DOC-LOSS': '26 (DOC-LOSS)',
    'DOC-POLRPT': '12 (DOC-POLRPT)',
    'DOC-POM': '07 (DOC-POM)',
    'DOC-PRV': '17 (DOC-PRV)',
    'DOC-SUBP': '46 (DOC-SUBP)',
    'DOC-WRKCMP': '29 (DOC-WRKCMP)',
    'EXP-WIT': '37 (EXP-WIT)',
    'FRM-AR1': '06 (FRM-AR1)',
    'FRM-NF10': '02 (FRM-NF10)',
    'FRM-NF2': '10 (FRM-NF2)',
    'LTR-DLY': '16 (REQ-VERIFY)',
    'LTR-EUO': '11 (PEER-REV)',
    'LTR-EXTRSP': '16 (REQ-VERIFY)',
    'LTR-REP': '42 (LTR-REP)',
    'LTR-SCHREQ': '43 (LTR-SCHREQ)',
    'LTR-STLAGR': '34 (LTR-STLAGR)',
    'LTR-STLOFF': '44 (SETTLE-OFFER)',
    'LTR-WTH': '36 (LTR-WTH)',
    'Medical Fee Schedule': 'Medical Fee Schedule',
    'MED-IHC': '38 (MED-IHC)',
    'MED-NEC': '23 (MED-NEC)',
    'MED-SUP': '04 (MED-RPT-EXC)',
    'MED-TST': '09 (MED-TST)',
    'NF-MISC': '25 (NF-MISC)',
    'REQ-AMEND': '06 (FRM-AR1)',
    'RPT-IME': '11 (PEER-REV)',
    'RPT-IMERE': '04 (MED-RPT-EXC)',
    'RPT-INIT': '04 (MED-RPT-EXC)',
    'RPT-PEER': '11 (PEER-REV)',
    'RPT-PEERRE': '11 (PEER-REV)'
}


def parse_page_records(root):
    """Flatten one parsed XML document into a list of per-page dicts.

    Args:
        root: The document's root ``xml.etree.ElementTree.Element``.

    Returns:
        One dict per ``docTypeList/docType/pages/page`` element with the keys
        ``docSource`` (text of the root's ``docSource`` child, or None),
        ``typeId`` (text of the enclosing docType's ``typeId`` child, or None)
        and ``pageNo`` (int, or None when the text is missing, empty or not an
        integer). The list is empty when the document has no page elements.
    """
    doc_source = root.findtext('docSource')
    records = []
    for doc_type in root.findall('docTypeList/docType'):
        type_id = doc_type.findtext('typeId')
        for page in doc_type.findall('pages/page'):
            page_no_text = page.findtext('pageNo')
            try:
                page_no = int(page_no_text) if page_no_text else None
            except ValueError:
                # Non-numeric page numbers are kept as "missing", not dropped.
                page_no = None
            records.append({
                'docSource': doc_source,
                'typeId': type_id,
                'pageNo': page_no
            })
    return records


def build_truth_frame(records, type_map=None):
    """Turn page records into the sorted table that is written to CSV.

    Args:
        records: List of dicts as produced by ``parse_page_records``.
        type_map: Mapping from typeId to its 'Truth' label; defaults to the
            notebook-level ``doc_type_map``.

    Returns:
        A DataFrame with the columns docSource, typeId, Truth, pageNo, sorted
        by pageNo with missing page numbers last. typeIds absent from
        ``type_map`` get a missing 'Truth' value.
    """
    if type_map is None:
        type_map = doc_type_map

    # Naming the columns up front keeps them present when `records` is empty;
    # otherwise sorting by 'pageNo' raises KeyError for a page-less document.
    frame = pd.DataFrame(records, columns=['docSource', 'typeId', 'pageNo'])

    # Nullable integer dtype: one missing page number would otherwise turn the
    # whole column into floats and the CSV would show 1.0, 2.0, ...
    frame['pageNo'] = frame['pageNo'].astype('Int64')

    # mergesort is stable, so pages sharing a number keep document order.
    frame = frame.sort_values(by='pageNo', kind='mergesort')

    # Place 'Truth' immediately to the right of 'typeId'.
    frame.insert(
        loc=frame.columns.get_loc('typeId') + 1,
        column='Truth',
        value=frame['typeId'].map(type_map),
    )
    return frame


def convert_xml_folder(xml_folder, csv_folder, type_map=None):
    """Convert every ``*.xml`` file in ``xml_folder`` to a CSV in ``csv_folder``.

    Each CSV is named after its XML file (extension swapped for ``.csv``) and
    written without the index column. ``csv_folder`` is created if needed.

    Args:
        xml_folder: Directory that is searched (non-recursively) for XML files.
        csv_folder: Directory that receives the CSV files.
        type_map: Optional typeId -> 'Truth' mapping passed to
            ``build_truth_frame``.

    Returns:
        The list of CSV paths written, in processing order.

    Raises:
        xml.etree.ElementTree.ParseError: If an XML file is not well-formed.
    """
    # Ensure the output folder exists
    os.makedirs(csv_folder, exist_ok=True)

    written = []
    # sorted() only makes the processing order reproducible between runs.
    for xml_path in sorted(glob.glob(os.path.join(xml_folder, '*.xml'))):
        root = ET.parse(xml_path).getroot()
        frame = build_truth_frame(parse_page_records(root), type_map)

        # Get original file name and create CSV path
        base_name = os.path.splitext(os.path.basename(xml_path))[0]
        csv_path = os.path.join(csv_folder, base_name + '.csv')

        frame.to_csv(csv_path, index=False)
        written.append(csv_path)
    return written


convert_xml_folder(xml_folder, csv_folder)

print("All XML files have been processed and saved to:", csv_folder)

All XML files have been processed and saved to: truth_data


In [None]:
from google.colab import files
import os
import shutil

folder_to_download = "truth_data" # Replace with the name of your folder

# Only a directory can be archived; isdir also rejects a plain file that
# happens to have the same name.
if os.path.isdir(folder_to_download):
    # Zip the folder first to download it as a single file.
    # shutil.make_archive writes a fresh archive each time, whereas `zip -r`
    # updates an existing one and would keep CSVs that were since deleted
    # from the folder. base_dir keeps the "<folder>/..." prefix inside the zip.
    zip_filename = f"{folder_to_download}.zip"
    shutil.make_archive(
        folder_to_download, "zip", root_dir=".", base_dir=folder_to_download
    )

    # Download the zip file
    files.download(zip_filename)

    # Optional: Remove the zip file after downloading
    # os.remove(zip_filename)
else:
    print(f"Folder '{folder_to_download}' not found.")

  adding: truth_data/ (stored 0%)
  adding: truth_data/31591465.csv (deflated 90%)
  adding: truth_data/31493590.csv (deflated 90%)
  adding: truth_data/32219527.csv (deflated 92%)
  adding: truth_data/31298541.csv (deflated 90%)
  adding: truth_data/32291864.csv (deflated 91%)
  adding: truth_data/31904349.csv (deflated 91%)
  adding: truth_data/31704962.csv (deflated 91%)
  adding: truth_data/32342552.csv (deflated 90%)
  adding: truth_data/31640157.csv (deflated 91%)
  adding: truth_data/31717013.csv (deflated 90%)
  adding: truth_data/31355950.csv (deflated 90%)
  adding: truth_data/32043331.csv (deflated 90%)
  adding: truth_data/31393918.csv (deflated 91%)
  adding: truth_data/31401508.csv (deflated 91%)
  adding: truth_data/31355392.csv (deflated 91%)
  adding: truth_data/32219660.csv (deflated 88%)
  adding: truth_data/32523431.csv (deflated 91%)
  adding: truth_data/31592685.csv (deflated 90%)
  adding: truth_data/31288607.csv (deflated 91%)
  adding: truth_data/31444573.csv (

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>