In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import warnings
warnings.simplefilter(action='ignore')
import pandas as pd
import numpy as np
import os
import glob2
import re
import tqdm
import time
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Define paths
RAW_PATH = r"H:\Shared drives\16th_Oct_24_Data_Sharing\FENG"
FILTERED_PATH = r"E:\indigene"

# Option to process all files or selected files based on a sample list
process_all = True  # Set to False if processing a specific sample list

# If not processing all, load sample list
sample_list_path = r"H:\My Drive\Pathogenic_Landscape\data\absolute\Absolute_clinical.txt"  # Path to the sample list
if not process_all:
    with open(sample_list_path, 'r') as f:
        sample_list = [line.strip() for line in f.readlines()]  # Read and strip newline characters

# Fetch raw files
raw_files = glob2.glob(os.path.join(RAW_PATH, "*.xlsx"))

# Filter raw files based on sample list if required
if not process_all:
    raw_files = [file for file in raw_files if os.path.basename(file) in sample_list]

print(f"Processing {len(raw_files)} files.")

Processing 2562 files.


In [5]:
# Initialize a log file to record errors
error_log_file = "error_log.txt"

# Open the log file in write mode (will overwrite existing content)
with open(error_log_file, "w") as log_file:
    log_file.write("Error Log for File Processing\n")
    log_file.write("="*50 + "\n")

'''
Filtration of Certain Values:
    FILTER = Only PASS
    Func.ensGene = only Exonic, Splicing and Exonic;Splicing
    ExonicFunc.ensGene = remove synonymous SNV
    clinvar: Clinvar = Remove Unknown, Uncertain Significance, Benign
    Intervar_auto = Only Pathogenic
    Population Frequency Data = Only lesser than or equal to 0.01
    Ref_Depth = Only greather than 2
'''


for i in tqdm.tqdm(range(len(raw_files))):
    try:
        df = pd.read_excel(raw_files[i]).drop_duplicates('IGV_link')
        sp1 = re.split(r'/|\\', raw_files[i])[-1]
        col_n = sp1.replace('_FENG.xlsx', '')
        df[f'{col_n}:DP'] = df[f'{col_n}:DP'].replace('.', '0').astype('float')
        df = df[df['FILTER']=='PASS']

        ## Pop frequency
        columns_to_process = ['esp6500siv2_all', 'ExAC_ALL', 'ExAC_SAS', 'AF', 'AF_sas', '1000g2015aug_all', '1000g2015aug_all', '1000g2015aug_SAS', 'Ref_Depth']

        for column in columns_to_process:
            df[column] = df[column].replace('.', '0').astype('float')

        df = df[df['ExonicFunc.ensGene']!='synonymous SNV']
        df = df[df['esp6500siv2_all']<= 0.01]
        df = df[df['ExAC_ALL']<= 0.01]
        df = df[df['ExAC_SAS']<= 0.01]
        df = df[df['AF']<=0.01]
        df = df[df['AF_sas']<= 0.01]
        df = df[df['1000g2015aug_all']<= 0.01]
        df = df[df['1000g2015aug_all']<= 0.01]
        df = df[df['1000g2015aug_SAS']<= 0.01]
        df = df[df['Ref_Depth']>=2]
        df = df[~df['clinvar: Clinvar '].str.contains('benign', case=False)]
        df.to_csv(fr'{FILTERED_PATH}\{col_n}_filtered.csv', index=False)

    except Exception as e:
        # Log the error to the log file
        with open(error_log_file, "a") as log_file:
            log_file.write(f"File: {raw_files[i]} caused an error\n")
            log_file.write(f"Error: {str(e)}\n")
            log_file.write("-" * 50 + "\n")

        # Optionally print the error to console for immediate feedback
        print(f"Error processing file {raw_files[i]}: {e}")

  0%|          | 0/2562 [00:00<?, ?it/s]

  0%|          | 6/2562 [00:09<1:09:42,  1.64s/it]

Error processing file H:\Shared drives\16th_Oct_24_Data_Sharing\FENG\WBCF-F-D-CE-S89_FENG.xlsx: bad operand type for unary ~: 'float'


  4%|▍         | 107/2562 [02:47<1:26:52,  2.12s/it]

Error processing file H:\Shared drives\16th_Oct_24_Data_Sharing\FENG\VESA-F-ce-S131-L003-modified_FENG.xlsx: 'VESA-F-ce-S131-L003-modified:DP'


  7%|▋         | 184/2562 [04:39<48:33,  1.23s/it]  

Error processing file H:\Shared drives\16th_Oct_24_Data_Sharing\FENG\WDDB-F1-D-CE-S47_FENG.xlsx: 'WDDB-F1-D-CE-S47:DP'


 17%|█▋        | 429/2562 [10:13<52:18,  1.47s/it]  

Error processing file H:\Shared drives\16th_Oct_24_Data_Sharing\FENG\WBJI-F-D-CE-S3_FENG.xlsx: Index(['IGV_link'], dtype='object')


 21%|██        | 544/2562 [13:15<49:23,  1.47s/it]  

Error processing file H:\Shared drives\16th_Oct_24_Data_Sharing\FENG\WBJB-F-D-L1L2-CE-S2_FENG.xlsx: Index(['IGV_link'], dtype='object')


 22%|██▏       | 560/2562 [13:38<48:40,  1.46s/it]

Error processing file H:\Shared drives\16th_Oct_24_Data_Sharing\FENG\WCTE-F-D-CE-STD-S66_FENG.xlsx: 'WCTE-F-D-CE-STD-S66:DP'


 50%|█████     | 1284/2562 [30:06<26:13,  1.23s/it] 

Error processing file H:\Shared drives\16th_Oct_24_Data_Sharing\FENG\WCBD-F-D-CE-S65_FENG.xlsx: 'WCBD-F-D-CE-S65:DP'


 63%|██████▎   | 1614/2562 [38:10<18:06,  1.15s/it]  

Error processing file H:\Shared drives\16th_Oct_24_Data_Sharing\FENG\WBJA-F-D-CE-S1_FENG.xlsx: Index(['IGV_link'], dtype='object')


 82%|████████▏ | 2096/2562 [49:21<11:02,  1.42s/it]

Error processing file H:\Shared drives\16th_Oct_24_Data_Sharing\FENG\WBIB-F-D-CE-S4_FENG.xlsx: Index(['IGV_link'], dtype='object')


100%|██████████| 2562/2562 [1:00:28<00:00,  1.42s/it]
