# Oropharynx patient data USZ preparation

## Requirements

In [None]:
import re
import os
import csv
import pandas as pd
import shutil
import numpy as np
#from patient_id_coding.ipynb import process_patient_id

## Structure patterns

In [None]:
# Regular expressions used to recognise structure names, keyed by structure label.
# NOTE(review): `patterns` is assigned again further down in this file with
# different keys ("GTVp", "Body", ...) - confirm which definition each cell expects.
patterns = {

    # Any name containing the substring "image"
    "image": r"image",

    # "gtv" followed by "p" and an optional "t", with optional separators
    # (whitespace, ".", "_", "-") in between. An optional prefix
    # (klin, vorschlag, pr[ae]?eop, v1), optional digits and optional suffixes
    # (new, rimary, ..., ptv1, then gy, 1a, 1b) are allowed.
    # The whole match is delimited by word boundaries (\b).
    "gtvp": r"\b(?:klin|vorschlag|pr[ae]?eop|v1)?[\s._-]*gtv[\s._-]*p[\s._-]*t?[\s._-]*\d*(?:new|rimary|pr[ae]?eop|v1|xxgy|74\.4|70|ptv1)?[\s._-]*(gy)?[\s._-]*(1a)?[\s._-]*(1b)?\b",

    # The whole name is "body" or "skin", optionally followed by separators and at most one digit
    "body": r"(?:^body[\s._-]*\d?$|^skin[\s._-]*\d?$)",

    # Name ending in "spinal cord", exactly "myelon", "myelon" followed by
    # whitespace and/or "+" characters and "5mm", or containing "spinal canal"
    "spinal cord": r"(?:spinal[\s._-]*cord$|^myelon$|myelon[\s+]*5mm|spinal[\s._-]*canal)",

    # Matches strings starting with "mandib"
    "mandibula": r"^mandib",

}

## ID encoding and decoding

In [None]:
def encode_decode_patient_id(patient_id: str, coding='encode') -> str:
    """
    Encode or decode an 8-character patient ID with a fixed digit <-> letter substitution.

    Parameters:
    - patient_id (str): The patient ID as an 8-character string.
        For encoding it must consist only of the ASCII digits 0-9.
        For decoding it must consist only of the letters used by the mapping
        (Q, W, E, R, T, Y, U, I, O, P).
    - coding (str): Operation mode: 'encode' maps digits to letters,
                    'decode' converts letters back to digits.

    Returns:
    - str: The processed patient ID (encoded or decoded).

    Raises:
    - ValueError: If the patient_id does not meet the required format or if an
                  invalid coding mode is provided.
    """
    # Bijective mapping from digits to letters: 0->Q, 1->W, ..., 9->P.
    mapping = dict(zip("0123456789", "QWERTYUIOP"))

    # Inverse mapping: letters back to digits.
    reverse_mapping = {letter: digit for digit, letter in mapping.items()}

    if coding == 'encode':
        table = mapping
        error_message = "Patient ID for encoding must be an 8-digit string."
    elif coding == 'decode':
        table = reverse_mapping
        error_message = "Encoded ID must be an 8-character string with valid mapping letters."
    else:
        raise ValueError("Coding must be either 'encode' or 'decode'.")

    # Validate against the table itself rather than str.isdigit(): isdigit() also
    # accepts non-ASCII digits, which have no entry in the mapping and would
    # surface as a KeyError instead of the documented ValueError.
    if len(patient_id) != 8 or not all(char in table for char in patient_id):
        raise ValueError(error_message)

    return ''.join(table[char] for char in patient_id)

def process_patient_folders(root_dir: str, coding='encode'):
    """
    Renames patient folders directly inside root_dir by encoding or decoding their names.

    In 'encode' mode, folders whose name is exactly 8 ASCII digits are renamed to
    the letter-coded ID. In 'decode' mode, folders whose name is exactly
    8 uppercase letters are renamed back to the digit ID. The conversion is done
    by encode_decode_patient_id.

    Folders are skipped (with a printed message) when the target name already
    exists or when the name cannot be converted (e.g. letters outside the
    coding alphabet in 'decode' mode).

    Parameters:
    - root_dir (str): The root directory containing patient folders.
    - coding (str): 'encode' (digits -> letters) or 'decode' (letters -> digits).

    Raises:
    - ValueError: If coding is neither 'encode' nor 'decode'.
    """
    # Expected folder-name shape per mode. [0-9] keeps the match ASCII-only.
    name_patterns = {
        'encode': re.compile(r'[0-9]{8}'),
        'decode': re.compile(r'[A-Z]{8}'),
    }
    # Fail early and explicitly: otherwise the pattern would be undefined below.
    if coding not in name_patterns:
        raise ValueError("Coding must be either 'encode' or 'decode'.")
    pattern = name_patterns[coding]

    for folder in os.listdir(root_dir):
        folder_path = os.path.join(root_dir, folder)
        if not os.path.isdir(folder_path):
            continue
        if not pattern.fullmatch(folder):
            continue

        try:
            coded_id = encode_decode_patient_id(folder, coding=coding)
        except ValueError as err:
            # One unconvertible folder must not abort the remaining renames.
            print(f"Skipping folder '{folder}': {err}")
            continue

        new_folder_name = f"{coded_id}"
        new_folder_path = os.path.join(root_dir, new_folder_name)

        # Never overwrite an existing folder.
        if os.path.exists(new_folder_path):
            print(f"Folder '{new_folder_name}' already exists. Skipping folder '{folder}'.")
        else:
            os.rename(folder_path, new_folder_path)
            print(f"Renamed folder '{folder}' to '{new_folder_name}'")


## Find patients with missing gtvp structure

In [None]:
def find_missing_patterns(root_dir):
    """
    Reports patient folders that contain no file matching the "gtvp" pattern.

    Folder selection:
        - Only folders directly inside root_dir whose names are exactly
          8 uppercase letters are considered (processed in sorted order).
        - All subdirectories of these folders are inspected recursively.

    Matching:
        - Only files ending in ".gz" are looked at (case-insensitive).
        - For "mask_<name>.nii.gz" files the text between "mask_" and ".nii.gz"
          is matched; for any other file the full lowercase file name is matched.
        - Each text is checked against the predefined regex patterns; a pattern
          counts as found on its first match.

    CSV Output:
        - Only folders missing a match for the "gtvp" pattern are written.
        - The "folder" column holds the decoded patient ID (via encode_decode_patient_id).
        - Further columns match_1, match_2, ... hold every matched text that
          contains "gtv" (one per column).
        - Folders whose name cannot be decoded are skipped with a printed message.

    Parameters:
        root_dir (str): The root directory in which to search for folders.

    Output:
        A CSV file named "missing_patterns.csv" is created in the root directory.
    """
    # Regex patterns for the structure matching
    patterns = {
        "image": r"image",
        "gtvp": r"\b(?:klin|vorschlag|pr[ae]?eop|v1)?[\s._-]*gtv[\s._-]*p[\s._-]*t?[\s._-]*\d*(?:new|rimary|pr[ae]?eop|v1|xxgy|74\.4|70|ptv1)?[\s._-]*(gy)?[\s._-]*(1a)?[\s._-]*(1b)?\b",
        "body": r"(?:^body[\s._-]*\d?$|^skin[\s._-]*\d?$)",
        "spinal cord": r"(?:spinal[\s._-]*cord$|^myelon$|myelon[\s+]*5mm|spinal[\s._-]*canal)",
        "mandibula": r"^mandib"
    }
    compiled_patterns = {key: re.compile(pattern, re.IGNORECASE) for key, pattern in patterns.items()}

    mask_prefix = "mask_"
    mask_suffix = ".nii.gz"

    # One row per folder without a gtvp match
    missing_data = []
    max_matches = 0

    # Sorted so that the CSV row order is reproducible between runs.
    for folder in sorted(os.listdir(root_dir)):
        folder_path = os.path.join(root_dir, folder)
        if not (os.path.isdir(folder_path) and re.fullmatch(r"[A-Z]{8}", folder)):
            continue

        found_patterns = dict.fromkeys(patterns, False)
        gtv_matches = []

        for _subdir, dirs, files in os.walk(folder_path):
            dirs.sort()  # deterministic traversal order
            for file in sorted(files):
                file_lower = file.lower()
                if not file_lower.endswith(".gz"):
                    continue

                if file_lower.startswith(mask_prefix) and file_lower.endswith(mask_suffix):
                    search_part = file_lower[len(mask_prefix):-len(mask_suffix)]
                else:
                    # Non-mask files (e.g. image files) are matched by full name
                    search_part = file_lower

                # search_part is already lowercase, so a plain substring test suffices
                if "gtv" in search_part:
                    gtv_matches.append(search_part)

                for key, regex in compiled_patterns.items():
                    if not found_patterns[key] and regex.search(search_part):
                        found_patterns[key] = True

        if found_patterns["gtvp"]:
            continue

        try:
            decoded_id = encode_decode_patient_id(folder, coding='decode')
        except ValueError as err:
            # 8 uppercase letters but not from the coding alphabet: not a patient
            # folder. Skip it instead of aborting before the CSV is written.
            print(f"Skipping folder '{folder}': {err}")
            continue

        row = {"folder": decoded_id}
        for i, match in enumerate(gtv_matches, start=1):
            row[f"match_{i}"] = match
        missing_data.append(row)
        max_matches = max(max_matches, len(gtv_matches))

    # Header: folder column followed by as many match columns as the widest row needs
    header = ["folder"] + [f"match_{i+1}" for i in range(max_matches)]

    csv_file = os.path.join(root_dir, "missing_patterns.csv")
    with open(csv_file, "w", newline="") as f:
        # restval fills the match columns a shorter row does not have
        writer = csv.DictWriter(f, fieldnames=header, restval="")
        writer.writeheader()
        writer.writerows(missing_data)

    print(f"CSV file created at: {csv_file}")

# Example usage: root directory that is scanned for patient folders
# (find_missing_patterns only considers folders named with 8 uppercase letters).
root_path = r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/06_midline_extraction"
# Uncomment to run the scan; it writes "missing_patterns.csv" into root_path.
#find_missing_patterns(root_path)


## Find all patient IDs from the second csv which occur in the first one

In [None]:
# Input CSV files for the comparison functions below; the first column of each
# file is treated as the patient ID.
csv_path1 = r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/patients_no_adjuvant.csv"
# NOTE(review): find_missing_patterns writes "missing_patterns.csv" into the
# root_dir it is given - confirm the file was placed at this location.
csv_path2 = r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/missing_patterns.csv"


def clean_patient_id(pid):
    """
    Normalises a patient ID to a zero-padded 8-character string.

    The value is converted to a string, stripped of leading/trailing whitespace
    and of surrounding single or double quotes. A purely integral float
    representation such as "1234567.0" is reduced to its integer part.
    The result is left-padded with zeros to 8 characters.

    Parameters:
        pid: The raw patient ID (string, number or missing value).

    Returns:
        str: The cleaned, padded ID, or "" if pid is missing (NaN/None).
    """
    if pd.isna(pid):
        return ""
    cleaned = str(pid).strip().strip("'").strip('"')
    # pandas reads an ID column that contains blanks as float, so 1234567 arrives
    # as 1234567.0. Without this step the ID would become "1234567.0" and never
    # match its integer counterpart.
    integral = re.fullmatch(r"([0-9]+)\.0+", cleaned)
    if integral:
        cleaned = integral.group(1)
    return cleaned.zfill(8)

def filter_csv2_with_common_ids(csv_file1, csv_file2, output_csv="filtered_csv2.csv"):
    """
    Keeps only the rows of the second CSV whose patient ID also occurs in the first CSV.

    Both files are read with headers; the patient ID is taken from the first
    column of each. IDs are normalised with clean_patient_id before comparison,
    and the ID column of the result holds the normalised IDs.

    The result is written with every field quoted so that leading zeros stay
    visible as text.

    Parameters:
        csv_file1 (str): Path to the first CSV file (reference IDs).
        csv_file2 (str): Path to the second CSV file (the one being filtered).
        output_csv (str): Path to save the filtered version of csv_file2.

    Returns:
        pandas.DataFrame: The filtered version of csv_file2 with normalised patient IDs.
    """
    df1 = pd.read_csv(csv_file1)
    df2 = pd.read_csv(csv_file2)

    # Normalised reference IDs from the first CSV.
    ids1 = {clean_patient_id(pid) for pid in df1.iloc[:, 0]}

    # Keep the normalised IDs in a separate Series. A temporary DataFrame column
    # could collide with a real column of the same name.
    id_column = df2.columns[0]
    cleaned_ids = df2[id_column].apply(clean_patient_id)
    keep = cleaned_ids.isin(ids1)

    filtered_df2 = df2.loc[keep].copy()
    # Replace the whole column by label. Writing strings into a numeric column
    # through .iloc is an incompatible-dtype setitem, which newer pandas rejects.
    filtered_df2[id_column] = cleaned_ids.loc[keep]

    # QUOTE_ALL keeps the IDs as text with their leading zeros.
    filtered_df2.to_csv(output_csv, index=False, quoting=csv.QUOTE_ALL)
    print(f"Filtered CSV file created at: {output_csv}")

    return filtered_df2

def find_missing_ids(csv_file1, csv_file2, output_csv="missing_in_csv2.csv"):
    """
    Keeps only the rows of the first CSV whose patient ID does not occur in the second CSV.

    Both files are read with headers; the patient ID is taken from the first
    column of each. IDs are normalised with clean_patient_id before comparison,
    and the ID column of the result holds the normalised IDs.

    The result is written with every field quoted so that leading zeros stay
    visible as text.

    Parameters:
        csv_file1 (str): Path to the first CSV file (the one being filtered).
        csv_file2 (str): Path to the second CSV file (IDs to exclude).
        output_csv (str): Path to save the rows of CSV1 missing from CSV2.

    Returns:
        pandas.DataFrame: The rows from CSV1 whose patient IDs are missing in CSV2.
    """
    df1 = pd.read_csv(csv_file1)
    df2 = pd.read_csv(csv_file2)

    # Normalised IDs, kept outside the frames so no temporary column is added.
    id_column = df1.columns[0]
    cleaned_ids1 = df1[id_column].apply(clean_patient_id)
    ids2 = set(df2.iloc[:, 0].apply(clean_patient_id))

    # Rows of CSV1 whose ID is not present in CSV2.
    is_missing = ~cleaned_ids1.isin(ids2)
    missing_df = df1.loc[is_missing].copy()

    # Replace the whole column by label. Writing strings into a numeric column
    # through .iloc is an incompatible-dtype setitem, which newer pandas rejects.
    missing_df[id_column] = cleaned_ids1.loc[is_missing]

    # QUOTE_ALL keeps the IDs as text with their leading zeros.
    missing_df.to_csv(output_csv, index=False, quoting=csv.QUOTE_ALL)
    print(f"Missing patient IDs CSV created at: {output_csv}")

    return missing_df

# filtered_df = filter_csv2_with_common_ids(csv_path1, csv_path2, output_csv= r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/missing_gtvp_no_adjuvant.csv")

# no_gtvp_adjuvant = find_missing_ids(csv_path2, csv_path1, output_csv= r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/missing_gtvp_adjuvant.csv")



## Copy all relevant structures into a new directory with one folder per patient

In [None]:

def copy_patient_files_from_csv(csv_file, source_dir, dest_dir):
    """
    Copies the image and the listed structure files of each patient into dest_dir/<patient ID>.

    The CSV must have the headers: Patient ID, Extention, Position, GTVp, Body,
    Mandible, Spinal Cord. For each row the patient folder source_dir/<patient ID>
    (ID padded to 8 digits) is searched recursively for:
      - the file named "image.nii.gz", and
      - for each non-empty structure column, the file whose name is given in that cell.
    The first occurrence of each file (in os.walk order) is copied.

    Parameters:
      csv_file (str): Path to the CSV file.
      source_dir (str): Directory containing patient folders (each named as an 8-digit patient ID).
      dest_dir (str): Destination directory where new patient folders will be created.

    Behavior:
      - Rows with a missing, non-numeric or non-integer Patient ID are skipped.
        A fractional ID is never truncated, because that could select another
        patient's folder.
      - Rows with an empty GTVp entry are skipped.
      - Rows whose source folder does not exist, or whose destination folder
        already exists, are skipped.
      - If a structure entry is empty or the file is not found, that file is skipped.
    """
    df = pd.read_csv(csv_file)

    structure_columns = ["GTVp", "Body", "Mandible", "Spinal Cord"]

    def cell_text(value):
        """Return the cell as stripped text; empty cells come back as 'nan' or ''."""
        return str(value).strip()

    def is_blank(text):
        """True for an empty cell (pandas renders missing values as 'nan')."""
        return text.lower() in ('nan', '')

    for idx, row in df.iterrows():
        raw_id = row["Patient ID"]
        if pd.isna(raw_id):
            print(f"Patient ID missing for row {idx}, skipping.")
            continue
        try:
            patient_id_num = float(raw_id)
        except (TypeError, ValueError):
            print(f"Patient ID not valid for row {idx}, skipping.")
            continue
        # is_integer() is False for fractional values and for inf. Both would
        # otherwise be truncated to a wrong ID or crash in int().
        if not patient_id_num.is_integer():
            print(f"Patient ID not valid for row {idx}, skipping.")
            continue

        patient_id = str(int(patient_id_num)).zfill(8)

        if is_blank(cell_text(row["GTVp"])):
            print(f"Patient {patient_id} has an empty GTVp entry. Skipping.")
            continue

        print(f"Processing patient: {patient_id}")

        patient_folder = os.path.join(source_dir, patient_id)
        if not os.path.isdir(patient_folder):
            print(f"Patient folder {patient_folder} not found, skipping patient {patient_id}.")
            continue

        dest_patient_folder = os.path.join(dest_dir, patient_id)
        if os.path.exists(dest_patient_folder):
            print(f"Destination folder {dest_patient_folder} already exists, skipping.")
            continue
        os.makedirs(dest_patient_folder, exist_ok=True)

        # Always the image, plus every structure file named in the row.
        files_to_copy = ["image.nii.gz"]
        for col in structure_columns:
            file_name = cell_text(row[col])
            if not is_blank(file_name):
                files_to_copy.append(file_name)

        # Walk the patient tree once and remember the first directory in which
        # each file name appears, instead of re-walking for every target.
        first_dir_of = {}
        for root, _dirs, files in os.walk(patient_folder):
            for name in files:
                first_dir_of.setdefault(name, root)

        for target_file in files_to_copy:
            found_in = first_dir_of.get(target_file)
            if found_in is None:
                print(f"File {target_file} not found for patient {patient_id}.")
                continue
            source_file_path = os.path.join(found_in, target_file)
            dest_file_path = os.path.join(dest_patient_folder, target_file)
            shutil.copy2(source_file_path, dest_file_path)
            print(f"Copied {source_file_path} to {dest_file_path}")


# csv_file = r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/Patients_with_not_seperated_gtvp_and_ln/patients_gtvgesamt.csv"
# source_dir = r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/06_midline_extraction"
# dest_dir = r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/Patients_with_not_seperated_gtvp_and_ln"
# copy_patient_files_from_csv(csv_file, source_dir, dest_dir)

                

## Create a csv with structure names and extension info

In [None]:
def process_patient_structures(input_csv, patient_dir, output_csv, patterns):
    """
    Records, for each patient in the input CSV, the file that matches each structure pattern.

    The input CSV must have the headers: Patient ID, Extention, Position.
    The output CSV has the headers: Patient ID, Extention, Position, followed by
    one column per key of `patterns` (in dictionary order).

    For each patient (row in the CSV):
      - The patient ID is read as text and padded with zeros to 8 characters
        (an integral float form such as "123.0" is reduced to "123" first).
      - Rows without a patient ID are skipped with a printed message.
      - The folder patient_dir/<padded ID> is searched recursively, in sorted
        order, for .nii.gz files. If the folder does not exist, the row is
        written with empty structure columns.
      - For each file:
           * "mask_<name>.nii.gz": the search text is <name> (lowercase).
           * "image.nii.gz": the search text is "image".
           * any other file is ignored.
      - For each structure pattern not yet found, if the search text matches
        (case-insensitive), the original file name is recorded.
      - The search stops once all patterns have been found.

    Parameters:
      input_csv (str): Path to the input CSV with patient data.
      patient_dir (str): Directory containing patient folders (named by 8-digit Patient ID).
      output_csv (str): Path to save the output CSV.
      patterns (dict): Regex patterns; the keys become the structure columns of the output CSV.

    Returns:
      None. The output CSV is written to disk.
    """
    # Read the ID column as text. If the column contains blanks, pandas would
    # otherwise parse it as float and 123 would become "123.0", which never
    # matches a folder name.
    df_in = pd.read_csv(input_csv, dtype={"Patient ID": str})

    structure_keys = list(patterns.keys())
    output_columns = ["Patient ID", "Extention", "Position"] + structure_keys

    compiled_patterns = {key: re.compile(pattern, re.IGNORECASE) for key, pattern in patterns.items()}

    mask_prefix = "mask_"
    nifti_suffix = ".nii.gz"

    output_rows = []

    for idx, row in df_in.iterrows():
        raw_id = row["Patient ID"]
        if pd.isna(raw_id):
            # Same handling as copy_patient_files_from_csv: no ID, no folder to look up.
            print(f"Patient ID missing for row {idx}, skipping.")
            continue

        patient_id = str(raw_id).strip()
        # Tolerate IDs that were stored as floats in the CSV text itself.
        if re.fullmatch(r"[0-9]+\.0+", patient_id):
            patient_id = patient_id.split(".", 1)[0]
        patient_id = patient_id.zfill(8)

        output_row = {
            "Patient ID": patient_id,
            "Extention": row["Extention"],
            "Position": row["Position"],
        }
        output_row.update(dict.fromkeys(structure_keys, ""))

        patient_folder = os.path.join(patient_dir, patient_id)
        if not os.path.isdir(patient_folder):
            print(f"Patient folder '{patient_folder}' not found.")
            output_rows.append(output_row)
            continue

        # Structure keys that still need a matching file.
        pending = list(structure_keys)

        for _folderpath, subfolders, files in os.walk(patient_folder):
            subfolders.sort()  # deterministic traversal, so results are reproducible
            for file in sorted(files):
                file_lower = file.lower()
                if file_lower == "image.nii.gz":
                    search_text = "image"
                elif file_lower.startswith(mask_prefix) and file_lower.endswith(nifti_suffix):
                    search_text = file_lower[len(mask_prefix):-len(nifti_suffix)]
                else:
                    continue

                # Iterate over a copy because matched keys are removed from pending.
                for key in list(pending):
                    if compiled_patterns[key].search(search_text):
                        output_row[key] = file
                        pending.remove(key)

                if not pending:
                    break
            if not pending:
                break

        output_rows.append(output_row)

    df_out = pd.DataFrame(output_rows, columns=output_columns)
    df_out.to_csv(output_csv, index=False)
    print(f"Output CSV saved to {output_csv}")


# Example patterns for process_patient_structures: the dictionary keys become
# the structure column names of the output CSV, in this order.
# NOTE: this rebinds the module-level name `patterns` that was assigned earlier
# in this file with different keys.
patterns = {
    "GTVp": r"\b(?:klin|vorschlag|pr[ae]?eop|v1)?[\s._-]*gtv[\s._-]*p[\s._-]*t?[\s._-]*\d*(?:new|rimary|pr[ae]?eop|v1|xxgy|74\.4|70|ptv1)?[\s._-]*(gy)?[\s._-]*(1a)?[\s._-]*(1b)?\b",
    "Body": r"(?:^body[\s._-]*\d?$|^skin[\s._-]*\d?$)",
    "Mandible": r"^mandib",
    "Spinal Cord": r"(?:spinal[\s._-]*cord$|^myelon$|myelon[\s+]*5mm|spinal[\s._-]*canal)"
}

# Input CSV with the patient list, directory with the patient folders, and
# output location for the structure table.
patient_no_adjuvant = r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/patients_no_adjuvant.csv"
patient_dir = r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/06_midline_extraction"
output_csv = r"/home/loriskeller/Documents/Master Project/Patient data/patient_data_complete/structures_and_extention_no_adjuvant.csv"

# Uncomment to build the structure table.
# process_patient_structures(patient_no_adjuvant, patient_dir, output_csv, patterns)
