## sample (middle part)

In [42]:
import pandas as pd

# Define function: split each key-value pair in INFO into two columns
def parse_info_field(info_str):
    """Parse one VCF INFO string into a Series indexed by INFO key.

    Parameters
    ----------
    info_str : str
        Semicolon-separated INFO entries, e.g. ``"DP=10;AF=0.25;INDEL"``.

    Returns
    -------
    pandas.Series
        One element per entry, in input order. ``key=value`` entries map the
        key to the value (kept as a string); entries without ``=`` are flags
        and map to ``True``.
    """
    keys = []
    values = []
    for field in info_str.split(';'):
        # A trailing or doubled ';' yields an empty entry; it is not a flag.
        if not field:
            continue
        # Split on the first '=' only, so values that contain '=' stay intact
        # instead of breaking a two-way unpack.
        key, sep, value = field.partition('=')
        keys.append(key)
        values.append(value if sep else True)
    return pd.Series(values, index=keys)

# Read the VCF file; lines starting with '#' are skipped via comment='#'
df = pd.read_csv('/nfs/research/goldman/zihao/Datas/p1/File_5_annot/SRR20358470.annot.vcf', delimiter='\t',comment='#', header=None,
                 dtype={0: str, 1: int, 2: str, 3: str, 4: str, 5: str, 6: str, 7: str})

# Set column names
df.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']

# Split the INFO column into one column per INFO key
df_info = df['INFO'].apply(parse_info_field)

# Add the processed result to the original data frame
df = pd.concat([df, df_info], axis=1)

# Drop rows flagged as INDEL, if that column is present
if 'INDEL' in df.columns:
    df = df.drop(df[df['INDEL'] == True].index)

# Create AF and SB columns and set them to 0 if they don't exist
if 'AF' not in df.columns:
    df['AF'] = 0
if 'SB' not in df.columns:
    df['SB'] = 0

# Rows whose INFO lacks AF/SB come out of the concat as NaN; give them the
# same default of 0 that is used when the whole column is missing.
df['AF'] = df['AF'].astype(float).fillna(0)
df['SB'] = df['SB'].where(df['SB'].notna(), 0)

# Transform AF > 0.5 into 1 - AF
af_above_half = df['AF'] > 0.5
df.loc[af_above_half, 'AF'] = 1 - df.loc[af_above_half, 'AF']

# Extract desired columns. Keep only the first record per position:
# reindex() below raises on duplicate index labels.
df = df[['POS', 'REF', 'ALT', 'AF', 'SB']].drop_duplicates(subset='POS', keep='first')

# Create a new index covering positions 1..29903
# (presumably the reference genome length -- TODO confirm)
new_index = pd.RangeIndex(start=1, stop=29904, step=1)

# Reindex the data frame so every position has a row; positions with no
# record are filled with 0 in every column
df = df.set_index('POS').reindex(new_index, fill_value=0).reset_index(drop=False).rename(columns={'index': 'POS'})

df

Unnamed: 0,POS,REF,ALT,AF,SB
0,1,0,0,0.0,0
1,2,0,0,0.0,0
2,3,0,0,0.0,0
3,4,0,0,0.0,0
4,5,0,0,0.0,0
...,...,...,...,...,...
29898,29899,0,0,0.0,0
29899,29900,0,0,0.0,0
29900,29901,0,0,0.0,0
29901,29902,0,0,0.0,0


## For test / also for final version

##### Code block:
```bash
# Submit the script as one bsub job. The trailing backslashes keep the
# options and the quoted command on a single logical command line.
bsub -M 2000 \
  -e /nfs/research/goldman/zihao/errorsProject_1/Annot/Annot_decompress_errorChecking_error.txt \
  'python3 /nfs/research/goldman/zihao/errorsProject_1/Annot/Annot_Decompress_and_save.py'
```

##### CSV version (test)

In [68]:
import csv
import gzip
import glob
import os

def parse_info_field(info_str):
    """Parse one VCF INFO string into a dict keyed by INFO key.

    Parameters
    ----------
    info_str : str
        Semicolon-separated INFO entries, e.g. ``"DP=10;AF=0.25;INDEL"``.

    Returns
    -------
    dict
        ``key=value`` entries map the key to the value (kept as a string).
        Entries without ``=`` are flags and map to ``True``, so a check such
        as ``info_dict['INDEL'] == True`` can actually match.
    """
    info_dict = {}
    for field in info_str.split(';'):
        # A trailing or doubled ';' yields an empty entry; skip it.
        if not field:
            continue
        # Split on the first '=' only, so values that contain '=' stay intact.
        key, sep, value = field.partition('=')
        info_dict[key] = value if sep else True
    return info_dict

def process_vcf_file(file_path, genome_length=29903):
    """Turn a gzipped VCF file into one row per position.

    Parameters
    ----------
    file_path : str
        Path to a gzip-compressed, tab-separated VCF file.
    genome_length : int, optional
        Number of positions to emit; rows are produced for positions
        ``1..genome_length``. Defaults to 29903.

    Returns
    -------
    list of list, or None
        One ``[POS, REF, ALT, AF, SB]`` row per position. Positions with a
        record carry its REF/ALT, the AF as a float (replaced by ``1 - AF``
        when above 0.5) and the SB string; the first record seen for a
        position wins. Positions without a record are
        ``[pos, 'NA', 'NA', '0', '0']``. Returns ``None`` after printing a
        message if anything goes wrong while reading or parsing the file.
    """
    try:
        data = {}

        with gzip.open(file_path, 'rt') as file:
            reader = csv.reader(file, delimiter='\t')

            for row in reader:
                # Skip blank lines and header lines.
                if not row or row[0].startswith('#'):
                    continue

                # Read only the fixed columns needed, by index, so rows that
                # carry extra FORMAT/sample columns do not break the parsing.
                pos, ref, alt, info = int(row[1]), row[3], row[4], row[7]

                # INDEL is a bare flag in INFO (no '=value'); test for it on
                # the raw entries so the filter works regardless of how the
                # key/value parser treats flags.
                if 'INDEL' in info.split(';'):
                    continue

                info_dict = parse_info_field(info)

                ref = ref if ref else '0'
                alt = alt if alt else '0'
                af = info_dict.get('AF', '0')
                sb = info_dict.get('SB', '0')

                # Transform AF > 0.5
                af = float(af)
                if af > 0.5:
                    af = 1 - af

                # Keep the first record seen for each position.
                if pos not in data:
                    data[pos] = [pos, ref, alt, af, sb]

        # Emit one row per position, with a placeholder where there is no record.
        return [
            data.get(idx, [idx, 'NA', 'NA', '0', '0'])
            for idx in range(1, genome_length + 1)
        ]

    except Exception as e:
        # Best effort: report the failing file and let the caller skip it.
        print(f"Unknown error processing {file_path}: {e}")
        return None
        
        

In [69]:
# Get a list of all vcf files in the input directory.
# glob returns paths in arbitrary order, so sort them to make the capped
# subset below the same on every run.
input_dir = '/nfs/research/goldman/zihao/Datas/p1/File_5_annot/Downloads/'
output_dir = '/homes/zihao/DATAS/TEST_for_annot/'
file_paths = sorted(glob.glob(os.path.join(input_dir, '*.annot.vcf.gz')))

# Make sure the output directory exists before writing into it
os.makedirs(output_dir, exist_ok=True)

# Test-only cap on the number of files processed.
# !!! Remove (or set to None) in the final version.
MAX_FILES = 1000

for file_path in file_paths[:MAX_FILES]:
    # Define the output file path
    output_file = os.path.join(output_dir, os.path.basename(file_path).replace('.annot.vcf.gz', '_annot.txt'))

    # Process the vcf file; None means the file could not be processed
    result = process_vcf_file(file_path)
    if result is None:
        continue

    # Write the results to a tab-separated text file
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['POS', 'REF', 'ALT', 'AF', 'SB'])
        writer.writerows(result)
