In [1]:
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path

root_folder = '../Data_CSV' 
output_folder = '../Cleaned_Data'

# 1. Setup Lists for master consolidation
data_matrix = []
sample_names = []
wavenumbers = None

file_list = []

# name = "BSA"
# name = "MP"
# name = "mAb"
name = None  # Set to None to process ALL files

# 2. Crawl the Directory 
for subdir, dirs, files in os.walk(root_folder):
    for file in files:
        if file.endswith('.csv') and 'Peak' not in file and 'Plots' not in subdir:
            if name is None or name in file:
                path = os.path.join(subdir, file)
                if path not in file_list:
                    file_list.append(path)

# 3. Process each file
for file_path in file_list:
    try:
        file = file_path.split('/')[-1]
        file_stem = file.replace('.csv', '')
        
        # Extract metadata (first 54 lines)
        metadata = {}
        with open(file_path, 'r') as f:
            for i in range(54):
                line = f.readline().strip()
                if ',' in line:
                    key, value = line.split(',', 1)
                    metadata[key] = value
        
        # Read CSV data (skip metadata)
        df = pd.read_csv(file_path, skiprows=54)
        
        # Basic Validation
        if 'Wavenumber' in df.columns and 'Processed' in df.columns:
            
            # Create output folder structure
            rel_path = Path(file_path).relative_to(Path(root_folder))
            output_dir = Path(output_folder) / rel_path.parent
            output_dir.mkdir(parents=True, exist_ok=True)
            
            # Save metadata as JSON
            json_path = output_dir / f"{file_stem}_metadata.json"
            with open(json_path, 'w') as f:
                json.dump(metadata, f, indent=2)
            
            # Save cleaned CSV (only Wavenumber and Processed)
            df_clean = df[['Wavenumber', 'Processed']].rename(columns={'Processed': 'Intensities'}).dropna()
            csv_path = output_dir / f"{file_stem}_data.csv"
            df_clean.to_csv(csv_path, index=False)
            
            print(f"Processed: {file}")
            
            # Also add to master consolidation
            if wavenumbers is None:
                wavenumbers = df['Wavenumber'].values
            
            if len(df['Wavenumber']) == len(wavenumbers):
                current_y = np.interp(wavenumbers, df['Wavenumber'].values, df['Processed'].values)
                current_y = np.nan_to_num(current_y, nan=0.0)
                data_matrix.append(current_y)
                sample_names.append(file_stem)
            else:
                print(f"  Skipping from master: Wavenumber length mismatch")

    except Exception as e:
        print(f"Error reading {file}: {e}")

# 4. Optional: Create master consolidated file
if len(data_matrix) > 0:
    X = np.array(data_matrix)
    master_df = pd.DataFrame(data=X, columns=wavenumbers, index=sample_names)
    master_df = master_df.loc[:, (master_df != 0).all(axis=0)]
    
    master_df.reset_index(inplace=True)
    master_df.rename(columns={'index': 'Sample'}, inplace=True)
    
    output_filename = f'{name}_cleaned_main.xlsx' if name else 'all_cleaned_main.xlsx'
    print(f"\nConsolidating {len(master_df)} spectra into {output_filename}...")
    master_df.to_excel(output_filename, index=False, header=True)
    print("Done!")


Processed: 100%_graded_450mW_8000ms_5rep.csv
Processed: 100%_graded_450mW_1000ms_5rep.csv
Processed: 100%_graded_450mW_8000ms_5rep_Final.csv
Processed: 100%_graded_450mW_8000ms_10rep.csv
Processed: 100%_graded_450mW_2000ms_5rep.csv
Processed: 100%_graded_450mW_4000ms_5rep.csv
Processed: 45%_mAb_55%_buffer_450mW_8000ms_5rep.csv
Processed: 35%_mAb_65%_buffer_450mW_8000ms_5rep.csv
Processed: 100%_mAb_0%_buffer_450mW_8000ms_5rep.csv
Processed: 5%_mAb_95%_buffer_450mW_8000ms_5rep.csv
Processed: 60%_mAb_40%_buffer_450mW_8000ms_5rep.csv
Processed: 40%_mAb_60%_buffer_450mW_8000ms_5rep.csv
Processed: 75%_mAb_25%_buffer_450mW_8000ms_5rep.csv
Processed: 15%_mAb_85%_buffer_450mW_8000ms_5rep.csv
Processed: 50%_mAb_50%_buffer_450mW_8000ms_5rep.csv
Processed: 90%_mAb_10%_buffer_450mW_8000ms_5rep.csv
Processed: 70%_mAb_30%_buffer_450mW_8000ms_5rep.csv
Processed: 80%_mAb_20%_buffer_450mW_8000ms_5rep.csv
Processed: 25%_mAb_75%_buffer_450mW_8000ms_5rep.csv
Processed: 10%_mAb_90%_buffer_450mW_8000ms_5rep.