In [4]:
import pandas as pd
import os
import glob
from tqdm import tqdm

# Combine the per-sample filtered TSV files into a single table.
#
# Every "*.tsv" file found in `folder_path` is read with only the columns
# listed in `sel_cols`, tagged with a `Sample_Name` column derived from its
# file name, and the resulting frames are concatenated and written to
# `output_file` (tab-separated, no index column).

# Define the folder containing the TSV files
folder_path = r"H:\My Drive\Pathogenic_Landscape\data\absolute\filtered_mams_research_tsv"  # Replace with your folder path
output_file = r"research_filtered_combined.tsv"

# Columns to select from each file
sel_cols = [
    'CHROM_x', 'POS_x', 'End_x', 'REF_x', 'ALT_x', 'Ref.Gene',
    'Func.ensGene', 'ExonicFunc.ensGene', 'AAChange.ensGene',
    'Interpro_domain', 'avsnp150', 'CLNDN', 'CLNDISDB', 'clinvar: Clinvar '
]

# Create a list to hold dataframes
combined_data = []

# Files that could not be read; summarised once after the loop so the
# individual messages are not lost between progress-bar updates.
skipped_files = []

# `output_file` is a relative "*.tsv" path. If the working directory happens
# to be `folder_path`, a re-run would pick up the previous output as an input
# and duplicate every row, so the output path is excluded from the inputs.
output_abspath = os.path.normcase(os.path.abspath(output_file))

# glob returns paths in arbitrary order; sorting makes the row order of the
# combined file reproducible from run to run.
file_paths = sorted(
    path
    for path in glob.glob(os.path.join(folder_path, "*.tsv"))
    if os.path.normcase(os.path.abspath(path)) != output_abspath
)

for file_path in tqdm(file_paths, desc="Processing files"):
    try:
        # Extract sample name from file name and remove suffix "_filtered.tsv"
        sample_name = os.path.basename(file_path).replace("_filtered.tsv", "")

        # Read the TSV file with only the selected columns.
        # pandas raises ValueError when any name in `usecols` is absent.
        data = pd.read_csv(file_path, sep='\t', usecols=sel_cols, low_memory=False)

        # Add a column for sample name
        data.insert(0, 'Sample_Name', sample_name)

        combined_data.append(data)
    except ValueError as e:
        # Best-effort: a file that cannot be read with the requested columns
        # is reported and left out rather than aborting the whole run.
        skipped_files.append(file_path)
        print(f"Skipping {file_path} due to missing columns: {e}")

if skipped_files:
    print(f"Skipped {len(skipped_files)} of {len(file_paths)} files.")

# Combine all dataframes
if combined_data:
    final_combined = pd.concat(combined_data, ignore_index=True)

    # Save to TSV
    final_combined.to_csv(output_file, sep='\t', index=False)
    print(f"Combined file saved as {output_file}")
else:
    print("No valid files to combine.")


Processing files: 100%|██████████| 631/631 [11:26<00:00,  1.09s/it]


Combined file saved as research_filtered_combined.tsv


In [5]:
import glob
import os  # used by os.path below; previously only available if an earlier cell had imported it

import pandas as pd
from tqdm import tqdm

# Merge the already-combined TSV files found in `folder_path` into one final
# table written to `output_file` (tab-separated, no index column).

# Define the folder containing the output TSV files
folder_path = r"E:\clinical_research_filtered_combined"  # Replace with your folder path
output_file = "final_filtered_combined.tsv"

# Columns to select (optional, you can omit this if all columns should be included)
sel_cols = None  # Set to None to include all columns, or specify columns like ['Column1', 'Column2']

# Create a list to hold dataframes
combined_data = []

# Files that could not be read; summarised once after the loop.
skipped_files = []

# `output_file` is a relative "*.tsv" path. If the working directory happens
# to be `folder_path`, a re-run would read the previous result back in and
# duplicate every row, so the output path is excluded from the inputs.
output_abspath = os.path.normcase(os.path.abspath(output_file))

# glob returns paths in arbitrary order; sorting makes the row order of the
# final file reproducible from run to run.
file_paths = sorted(
    path
    for path in glob.glob(os.path.join(folder_path, "*.tsv"))
    if os.path.normcase(os.path.abspath(path)) != output_abspath
)

for file_path in tqdm(file_paths, desc="Processing output files"):
    try:
        # Read the TSV file
        data = pd.read_csv(file_path, sep='\t', usecols=sel_cols, low_memory=False)

        combined_data.append(data)
    except ValueError as e:
        # Best-effort: report the file and carry on with the rest.
        # NOTE: with sel_cols = None a ValueError here means the file was
        # empty or could not be parsed (pandas' EmptyDataError / ParserError
        # are ValueError subclasses), not that columns were missing.
        skipped_files.append(file_path)
        print(f"Skipping {file_path} due to missing columns: {e}")

if skipped_files:
    print(f"Skipped {len(skipped_files)} of {len(file_paths)} files.")

# Combine all dataframes
if combined_data:
    final_combined = pd.concat(combined_data, ignore_index=True)

    # Save to a new TSV file
    final_combined.to_csv(output_file, sep='\t', index=False)
    print(f"Final combined file saved as {output_file}")
else:
    print("No valid files to combine.")


Processing output files: 100%|██████████| 2/2 [00:09<00:00,  4.84s/it]


Final combined file saved as final_filtered_combined.tsv
