In [1]:
import os
import pandas as pd
#results_dir = "/logo2/irfan/BioInfoSoftware/ichorCNA/scripts/snakemake/results/ichorCNA"

results_dir = "/logo2/irfan/BioInfoSoftware/ichorCNA/scripts/snakemake/results_Noah/ichorCNA"
outfile = "Default_ichorCNA.txt"


In [2]:


def extract_ichorcna_data(results_dir):
    """
    Extracts relevant data from ichorCNA results' params.txt files.

    Parameters:
    - results_dir (str): Path to the directory containing ichorCNA results folders.

    Returns:
    - DataFrame: A Pandas DataFrame containing extracted data.
    """

    # Columns for the resulting DataFrame
    columns = ["sample_name", "tumor_fraction", "ploidy", "gender", "ChrY_coverage_fraction", "ChrX_median_log_ratio"]
    
    # Initialize a list to store extracted data dictionaries
    data_list = []
    
    # Iterating through each folder in the results directory
    for sample_folder in os.listdir(results_dir):
        sample_path = os.path.join(results_dir, sample_folder)
        
        # Confirming that it's a folder
        if os.path.isdir(sample_path):
            # Searching for the params.txt file with .params.txt extension
            for file_name in os.listdir(sample_path):
                if file_name.endswith(".params.txt"):
                    params_file_path = os.path.join(sample_path, file_name)
                    break  # Exit the loop once we find the first .params.txt file
            else:  # No params.txt file found
                print(f"No params.txt file found for sample: {sample_folder}")
                continue
            
            # Extracting data from params.txt
            with open(params_file_path, "r") as file:
                # Initialize a dictionary to store extracted data
                data = {"sample_name": sample_folder}
                
                # Iterating through each line in params.txt to extract data
                for line in file:
                    line = line.strip()  # Remove leading/trailing whitespace
                    if "Gender:" in line:
                        data["gender"] = line.split(":")[1].strip()
                    elif "Tumor Fraction:" in line:
                        data["tumor_fraction"] = float(line.split(":")[1].strip())
                    elif "Ploidy:" in line:
                        data["ploidy"] = float(line.split(":")[1].strip())
                    elif "ChrY coverage fraction:" in line:
                        data["ChrY_coverage_fraction"] = float(line.split(":")[1].strip())
                    elif "ChrX median log ratio:" in line:
                        data["ChrX_median_log_ratio"] = float(line.split(":")[1].strip())
                
                # Adding extracted data to the data list
                data_list.append(data)
                    
    # Convert the list of data dictionaries to a DataFrame
    result_df = pd.DataFrame(data_list, columns=columns)
    
    return result_df


In [3]:
ichorcna_data = extract_ichorcna_data(results_dir)


In [4]:
ichorcna_data['sample_name'] = ichorcna_data['sample_name'].str.replace('-auto.final.dedup.bam', '',regex=False)
ichorcna_data.head()

Unnamed: 0,sample_name,tumor_fraction,ploidy,gender,ChrY_coverage_fraction,ChrX_median_log_ratio
0,Sample_LC025_CTF.dedup.bam,0.05148,2.092,female,0.0,0.00259
1,Sample_LC042_CTF.dedup.bam,0.03608,2.163,female,0.0,0.01454
2,Sample_LC029_CTF.dedup.bam,0.07199,2.784,female,0.0,0.008787
3,Sample_LC056_CTF.dedup.bam,0.04923,2.008,female,0.0,0.01545
4,Sample_LC007_CTF.dedup.bam,0.04766,1.999,female,0.0,0.008938


In [5]:
display(ichorcna_data.head())

Unnamed: 0,sample_name,tumor_fraction,ploidy,gender,ChrY_coverage_fraction,ChrX_median_log_ratio
0,Sample_LC025_CTF.dedup.bam,0.05148,2.092,female,0.0,0.00259
1,Sample_LC042_CTF.dedup.bam,0.03608,2.163,female,0.0,0.01454
2,Sample_LC029_CTF.dedup.bam,0.07199,2.784,female,0.0,0.008787
3,Sample_LC056_CTF.dedup.bam,0.04923,2.008,female,0.0,0.01545
4,Sample_LC007_CTF.dedup.bam,0.04766,1.999,female,0.0,0.008938


In [6]:
ichorcna_data.to_csv(outfile, sep='\t')