In [1]:
import os
import gzip
import csv
import glob

input_dir = '/nfs/research/goldman/zihao/Datas/p1/File_5_annot/Downloads/'
output_dir = '/nfs/research/goldman/zihao/Datas/p1/File_5_annot/Decompress/'

file_paths = glob.glob(os.path.join(input_dir, '*.annot.vcf.gz'))

def parse_info_field(info_str):
    """Parse a VCF INFO column string into a dictionary.

    The string is split on ';'. Each ``key=value`` entry is stored as
    ``{key: value}`` with the value kept as a string; an entry without '='
    (a flag) is stored as ``{flag: True}``.

    Args:
        info_str: Raw INFO text, e.g. ``"DP=10;AF=0.5;INDEL"``.

    Returns:
        dict mapping each key to its string value, or to ``True`` for flags.
    """
    parsed_info = {}
    for field in info_str.split(';'):
        if not field:
            # A leading, trailing or doubled ';' yields an empty token; skip it
            # instead of recording a meaningless '' flag.
            continue
        # Split on the first '=' only, so values that themselves contain '='
        # are kept whole instead of raising ValueError on unpacking.
        key, sep, value = field.partition('=')
        if sep:
            parsed_info[key] = value
        else:
            parsed_info[field] = True
    return parsed_info

def calculate_ratios(data, column_name):
    """Return the sum, the mean, and each value divided by the mean.

    Args:
        data: Sequence of numbers.
        column_name: Label of the column being summarised; accepted for
            call-site readability but not used in the computation.

    Returns:
        Tuple ``(total, mean, ratios)``. ``mean`` is 0 for empty input, and
        every ratio is 0 when the mean is 0.
    """
    total = sum(data)

    if len(data) > 0:
        mean = total / len(data)
    else:
        mean = 0

    if mean != 0:
        ratios = [value / mean for value in data]
    else:
        # Dividing by a zero mean is undefined; report 0 for every entry.
        ratios = [0] * len(data)

    return total, mean, ratios

# Convert every annotated VCF into a tab-separated table with one row per
# record: POS plus the DP, AF and SB values divided by that file's mean.
# A file that cannot be read or parsed is reported and skipped, so a single
# bad download does not stop the whole batch. The output file is written only
# after the input has been read completely.
for file_path in file_paths:
    file_name = os.path.basename(file_path)
    output_file = os.path.join(output_dir, file_name.replace('.annot.vcf.gz', '_annot.txt'))
    try:
        positions = []
        dp_data, af_data, sb_data = [], [], []

        with gzip.open(file_path, 'rt') as f:
            for row in csv.reader(f, delimiter='\t'):
                # Blank lines and '#' header/meta lines carry no variant record.
                if not row or row[0].startswith('#'):
                    continue

                info = parse_info_field(row[7])
                # A missing or empty INFO value counts as 0.0.
                dp, af, sb = [float(info.get(key) or 0.0) for key in ('DP', 'AF', 'SB')]

                positions.append(row[1])
                dp_data.append(dp)
                af_data.append(af)
                sb_data.append(sb)

        # Only the per-record ratios (third element) are written out.
        dp_ratios = calculate_ratios(dp_data, 'DP')[2]
        af_ratios = calculate_ratios(af_data, 'AF')[2]
        sb_ratios = calculate_ratios(sb_data, 'SB')[2]

        with open(output_file, 'w', newline='') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow(['POS', 'DP_RATIO', 'AF_RATIO', 'SB_RATIO'])
            writer.writerows(zip(positions, dp_ratios, af_ratios, sb_ratios))

    except gzip.BadGzipFile:
        print(f"Skipping {file_path}: gzip decompression failed.")
    except (IndexError, EOFError, ValueError) as e:
        # IndexError: record with too few columns; EOFError: truncated gzip
        # stream; ValueError: non-numeric DP/AF/SB value (e.g. a
        # comma-separated list) or an INFO entry that cannot be parsed.
        file_id = file_name[:10]
        print(f"Error processing {file_id}: {e}")

Error processing SRR2030151: Compressed file ended before the end-of-stream marker was reached
Error processing SRR1994967: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2034282: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2091023: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2182246: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2087318: Compressed file ended before the end-of-stream marker was reached
Error processing SRR1994637: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2104599: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2179370: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2090843: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2087285: Compressed file ended

Error processing SRR2030135: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2034258: Compressed file ended before the end-of-stream marker was reached
Error processing SRR1994871: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2091070: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2091040: Compressed file ended before the end-of-stream marker was reached
Error processing SRR1970564: Compressed file ended before the end-of-stream marker was reached
Error processing SRR2104469: Compressed file ended before the end-of-stream marker was reached
Error processing ERR6125368: Compressed file ended before the end-of-stream marker was reached


##### Code block:
```bash
# Submit the decompression script as an LSF job; -M sets the job's memory
# limit and -e names the file that receives the job's stderr.
# The trailing backslashes keep this a single command when pasted into a shell.
bsub -M 2000 \
    -e /nfs/research/goldman/zihao/errorsProject_1/Annot/Annot_deompress_errorChecking_error.txt \
    'python3 /nfs/research/goldman/zihao/errorsProject_1/Annot/0_Apr.21_Decompress_and_save.py'
```

## Test run on a subset of the input files

In [16]:
import os
import gzip
import csv
import glob

input_dir = '/nfs/research/goldman/zihao/Datas/p1/File_5_annot/Downloads/'
output_dir = '/homes/zihao/DATAS/TEST_for_annot'

file_paths = glob.glob(os.path.join(input_dir, '*.annot.vcf.gz'))

def parse_info_field(info_str):
    """Parse a VCF INFO column string into a dictionary.

    The string is split on ';'. Each ``key=value`` entry is stored as
    ``{key: value}`` with the value kept as a string; an entry without '='
    (a flag) is stored as ``{flag: True}``.

    Args:
        info_str: Raw INFO text, e.g. ``"DP=10;AF=0.5;INDEL"``.

    Returns:
        dict mapping each key to its string value, or to ``True`` for flags.
    """
    parsed_info = {}
    for field in info_str.split(';'):
        if not field:
            # A leading, trailing or doubled ';' yields an empty token; skip it
            # instead of recording a meaningless '' flag.
            continue
        # Split on the first '=' only, so values that themselves contain '='
        # are kept whole instead of raising ValueError on unpacking.
        key, sep, value = field.partition('=')
        if sep:
            parsed_info[key] = value
        else:
            parsed_info[field] = True
    return parsed_info

def calculate_ratios(data, column_name):
    """Return the sum, the mean, and each value divided by the mean.

    Args:
        data: Sequence of numbers.
        column_name: Label of the column being summarised; accepted for
            call-site readability but not used in the computation.

    Returns:
        Tuple ``(total, mean, ratios)``. ``mean`` is 0 for empty input, and
        every ratio is 0 when the mean is 0.
    """
    total = sum(data)

    if len(data) > 0:
        mean = total / len(data)
    else:
        mean = 0

    if mean != 0:
        ratios = [value / mean for value in data]
    else:
        # Dividing by a zero mean is undefined; report 0 for every entry.
        ratios = [0] * len(data)

    return total, mean, ratios

# Test run: same conversion as the full batch, limited to the first 1000
# matched files. For each file, write POS plus the DP, AF and SB values
# divided by that file's mean. A file that cannot be read or parsed is
# reported and skipped. The output file is written only after the input has
# been read completely.
for file_path in file_paths[:1000]:
    file_name = os.path.basename(file_path)
    output_file = os.path.join(output_dir, file_name.replace('.annot.vcf.gz', '_annot.txt'))
    try:
        positions = []
        dp_data, af_data, sb_data = [], [], []

        with gzip.open(file_path, 'rt') as f:
            for row in csv.reader(f, delimiter='\t'):
                # Blank lines and '#' header/meta lines carry no variant record.
                if not row or row[0].startswith('#'):
                    continue

                info = parse_info_field(row[7])
                # A missing or empty INFO value counts as 0.0 (an empty value
                # such as "DP=" previously raised an uncaught ValueError).
                dp, af, sb = [float(info.get(key) or 0.0) for key in ('DP', 'AF', 'SB')]

                positions.append(row[1])
                dp_data.append(dp)
                af_data.append(af)
                sb_data.append(sb)

        # Only the per-record ratios (third element) are written out.
        dp_ratios = calculate_ratios(dp_data, 'DP')[2]
        af_ratios = calculate_ratios(af_data, 'AF')[2]
        sb_ratios = calculate_ratios(sb_data, 'SB')[2]

        with open(output_file, 'w', newline='') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow(['POS', 'DP_RATIO', 'AF_RATIO', 'SB_RATIO'])
            # zip() pairs each position with its ratios without a loop index,
            # so no counter variable is reused between the two loops.
            writer.writerows(zip(positions, dp_ratios, af_ratios, sb_ratios))

    except gzip.BadGzipFile:
        print(f"Skipping {file_path}: gzip decompression failed.")
    except (IndexError, EOFError, ValueError) as e:
        # IndexError: record with too few columns; EOFError: truncated gzip
        # stream; ValueError: non-numeric DP/AF/SB value (e.g. a
        # comma-separated list) or an INFO entry that cannot be parsed.
        file_id = file_name[:10]
        print(f"Error processing {file_id}: {e}")