In [31]:
import pandas as pd
import os
import re

FINAL_COLUMNS = [
    'hospital_name', 'street_address', 'city', 'state', 'zip_code',
    'description', 'billing_code', 'billing_code_type', 'standard_charge',
    'discounted_cash_charge', 'payer_name', 'plan_name',
    'negotiated_dollar', 'negotiated_percentage', 'estimated_amount',
    'min_charge', 'max_charge'
]

def clean_single_tall_csv(filepath):
    # Read first two rows to extract metadata
    try:
        raw_meta = pd.read_csv(filepath, nrows=2, header=None, encoding='utf-8')
    except UnicodeDecodeError:
        raw_meta = pd.read_csv(filepath, nrows=2, header=None, encoding='latin1')

    hospital_name = str(raw_meta.iloc[1, 0]).strip().title()
    full_address = str(raw_meta.iloc[1, 4]).strip()

    if ',' in full_address:
        parts = [p.strip() for p in full_address.split(',')]
        street_address = parts[0]
        city = parts[1] if len(parts) > 1 else ''
        state_zip = parts[2].split() if len(parts) > 2 else []
        state = state_zip[0] if len(state_zip) > 0 else ''
        zip_code = state_zip[1] if len(state_zip) > 1 else ''
    else:
        street_address = full_address
        city = state = zip_code = ''

    try:
        df = pd.read_csv(filepath, header=2, encoding='utf-8', low_memory=False)
    except UnicodeDecodeError:
        df = pd.read_csv(filepath, header=2, encoding='latin1', low_memory=False)

    # Filter CPT-coded rows
    cpt_cols = [col for col in df.columns if 'code' in col.lower() and 'type' in col.lower()]
    cpt_rows = pd.Series([False] * len(df))
    for col in cpt_cols:
        cpt_rows |= df[col].astype(str).str.upper().str.contains('CPT', na=False)

    df = df[cpt_rows]

    for n in range(1, 5):
        code_col = f'code|{n}'
        type_col = f'code|{n}|type'
        if code_col in df.columns and type_col in df.columns:
            mask = df[type_col].astype(str).str.upper() == 'CPT'
            df.loc[mask, 'billing_code'] = df.loc[mask, code_col]
            df.loc[mask, 'billing_code_type'] = df.loc[mask, type_col]

    column_map = {
        'description': 'description',
        'standard_charge|gross': 'standard_charge',
        'standard_charge|discounted_cash': 'discounted_cash_charge',
        'payer_name': 'payer_name',
        'plan_name': 'plan_name',
        'standard_charge|negotiated_dollar': 'negotiated_dollar',
        'standard_charge|negotiated_percentage': 'negotiated_percentage',
        'estimated_amount': 'estimated_amount',
        'standard_charge|min': 'min_charge',
        'standard_charge|max': 'max_charge'
    }

    df = df[[col for col in df.columns if col in column_map or col in ['billing_code', 'billing_code_type', 'description']]]
    df.rename(columns=column_map, inplace=True)

    for col in FINAL_COLUMNS:
        if col not in df.columns:
            df[col] = ""

    # Assign metadata columns
    df['hospital_name'] = hospital_name
    df['street_address'] = street_address
    df['city'] = city
    df['state'] = state
    df['zip_code'] = zip_code

    df = df[FINAL_COLUMNS].drop_duplicates()
    return df, hospital_name


def batch_clean_tall_csvs(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    for i, filename in enumerate(os.listdir(input_folder)):
        if filename.endswith('.csv'):
            filepath = os.path.join(input_folder, filename)
            print(f"📄 Processing: {filename}")
            try:
                df_cleaned, hosp_name = clean_single_tall_csv(filepath)
                if df_cleaned.empty:
                    print(f"⚠️ No CPT-coded entries in {filename}")
                    continue
                safe_name = hosp_name.replace('/', '_').replace(' ', '_')
                output_path = os.path.join(output_folder, f"Cleaned_{safe_name}_{i}.csv")
                df_cleaned.to_csv(output_path, index=False)
                print(f"✅ Saved: {output_path}")
            except Exception as e:
                print(f"❌ Error in {filename}: {e}")

# USAGE
batch_clean_tall_csvs(
    input_folder='/Users/unmonadas/Desktop/TALL CSV RAW/OneDrive_1_5-5-2025',
    output_folder='/Users/unmonadas/Desktop/Cleaned_Tall_CSVs_Fixed'
)


📄 Processing: 930430029_Hillsboro-Medical-Center_standardcharges.csv
✅ Saved: /Users/unmonadas/Desktop/Cleaned_Tall_CSVs_Fixed/Cleaned_Hillsboro_Medical_Center_0.csv
📄 Processing: 590634433-1245520386_Nemours-Childrens-Hospital_standardcharges(1).csv
✅ Saved: /Users/unmonadas/Desktop/Cleaned_Tall_CSVs_Fixed/Cleaned_Nemours_Childrens_Health_Winter_Garden_1.csv
📄 Processing: 590634433-1245520386_Nemours-Childrens-Hospital_standardcharges(2).csv
✅ Saved: /Users/unmonadas/Desktop/Cleaned_Tall_CSVs_Fixed/Cleaned_Nemours_Childrens_Health_Lake_Mary_2.csv
📄 Processing: 590634433-1245520386_Nemours-Childrens-Hospital_standardcharges.csv
✅ Saved: /Users/unmonadas/Desktop/Cleaned_Tall_CSVs_Fixed/Cleaned_Nemours_Childrens_Hospital_Orlando_3.csv
📄 Processing: 931176109_Oregon-Health-and-Science-University_standardcharges.csv
✅ Saved: /Users/unmonadas/Desktop/Cleaned_Tall_CSVs_Fixed/Cleaned_Oregon_Health_&_Science_University_4.csv
📄 Processing: 930429015_Adventist-Health-Portland_standardcharges.csv