## Decompress and save coverage

In [10]:
import os
import gzip


class CoverageProcessor:
    """Convert gzipped coverage files into tab-separated coverage-ratio tables.

    Every file in ``input_dir`` whose name ends with '.coverage.gz' is read,
    each record's coverage is divided by the mean coverage of that file, and
    the result is written to ``output_dir`` as '<name>_coverage.txt'.
    """

    def __init__(self, input_dir, output_dir):
        """Store the directory to read from and the directory to write to."""
        self.input_dir = input_dir
        self.output_dir = output_dir

    def get_file_paths(self):
        """Return an iterator over all paths in ``input_dir`` ending with '.coverage.gz'.

        The directory is scanned eagerly inside a ``with`` block so the scandir
        handle is always closed, even when the caller stops iterating early.
        Paths are sorted so that repeated runs visit files in the same order
        (``os.scandir`` itself yields entries in arbitrary order).

        Raises:
            OSError: if ``input_dir`` cannot be scanned.
        """
        with os.scandir(self.input_dir) as entries:
            paths = sorted(entry.path for entry in entries if entry.name.endswith('.coverage.gz'))
        return iter(paths)

    def process_file(self, file_path, output_file):
        """
        Process the given file and calculate the coverage ratio for each position.
        Then, write the results to a txt file.

        Lines starting with '##' and blank lines are ignored. Every other line
        must have exactly three comma-separated fields: position, N, coverage.
        The ratio is ``coverage / mean coverage of the file``.

        Errors are reported on stdout instead of being raised so that one bad
        file does not abort a batch run. No output file is produced when the
        input has no records, when its mean coverage is zero, or when any
        error occurs.

        Args:
            file_path: Path of the gzip-compressed input file.
            output_file: Path of the tab-separated text file to create.
        """
        try:
            rows = []
            total_coverage = 0
            with gzip.open(file_path, 'rt') as f:
                for raw_line in f:
                    if raw_line.startswith('##'):
                        continue
                    line = raw_line.strip()
                    if not line:
                        # A stray blank line must not invalidate the whole file.
                        continue
                    pos, n, cov = line.split(',')
                    coverage = int(cov)
                    rows.append((int(pos), n, coverage))
                    total_coverage += coverage

            if not rows:
                print(f"No coverage records in {file_path}. Skipping.")
                return

            mean_coverage = total_coverage / len(rows)
            if mean_coverage == 0:
                print(f"Mean coverage is zero in {file_path}. Skipping.")
                return

            # Write to a temporary name and rename at the end: callers skip
            # inputs whose output already exists, so a half-written output
            # must never be left under the final name.
            tmp_file = output_file + '.tmp'
            try:
                with open(tmp_file, 'w') as out_f:
                    out_f.write('Position\tN\tRATIO\n')
                    out_f.writelines(f"{pos}\t{n}\t{cov / mean_coverage}\n" for pos, n, cov in rows)
                os.replace(tmp_file, output_file)
            finally:
                # Only still present when the write or the rename failed.
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)

        except Exception as e:
            # Deliberate best-effort boundary for batch processing.
            print(f"Unknown error processing {file_path}: {e}")

    def process_file_2(self, max_files=20):
        """
        Iterate over each file in the input directory and process it using the `process_file` method.
        Skip files if their output file already exists.

        Args:
            max_files: Maximum number of input files to visit; files that are
                skipped because their output exists count towards the limit.
                Pass ``None`` to visit every file. The default of 20 is the
                debugging cap this method has always applied.
                TODO: switch the default to ``None`` for the final version.
        """
        file_paths = self.get_file_paths()

        # Without the output directory every single file would fail to write.
        os.makedirs(self.output_dir, exist_ok=True)

        for i, file_path in enumerate(file_paths):
            if max_files is not None and i >= max_files:
                break

            output_file = os.path.join(self.output_dir, os.path.basename(file_path).replace('.coverage.gz', '_coverage.txt'))

            if os.path.exists(output_file):
                print(f"{output_file} already exists. Skipping file {file_path}.")
                continue

            self.process_file(file_path, output_file)

In [13]:
if __name__ == '__main__':
    # Directory that is scanned for '*.coverage.gz' input files.
    input_dir = '/nfs/research/goldman/zihao/Datas/p1/File_5_coverage/Downloads/'
    # Directory that receives the generated '*_coverage.txt' files.
    output_dir = '/homes/zihao/DATAS/TEST_for_cov_new_may/'

    processor = CoverageProcessor(input_dir=input_dir, output_dir=output_dir)
    processor.process_file_2()

### Old version (Apr. 21)

```python
import os
import gzip

# Archived flat-script version: for each '*.coverage.gz' file in input_dir,
# divide every record's coverage by the file's mean coverage and write the
# result to output_dir as a tab-separated '<name>_coverage.txt' file.
input_dir = '/nfs/research/goldman/zihao/Datas/p1/File_5_coverage/Downloads/'
output_dir = '/homes/zihao/TEST/'

# Get all file paths that end with '.coverage.gz'
file_paths = (entry.path for entry in os.scandir(input_dir) if entry.name.endswith('.coverage.gz'))

# Process each file in turn and write its results to a text file
for i, file_path in enumerate(file_paths):
    
    # Only visit the first 20 files (debugging cap)
    
    ### Remove for the final version!
    if i >= 20:
        break
    ### Remove for the final version!
        
    # Construct the output file name
    output_file = os.path.join(output_dir, os.path.basename(file_path).replace('.coverage.gz', '_coverage.txt'))

    # Check if the output file already exists and skip the file if it does
    if os.path.exists(output_file):
        print(f"{output_file} already exists. Skipping file {file_path}.")
        continue

    try:
        with gzip.open(file_path, 'rt') as f:
            # Read each line in the file and strip surrounding whitespace
            # Ignore lines that start with '##'
            lines = (line.strip() for line in f if not line.startswith('##'))
            # Split each line on commas into exactly three fields
            # (position, N, coverage) and convert position and coverage to int
            data = [[int(pos), n, int(cov)] for line in lines for pos, n, cov in [line.split(',')]]
            # Calculate the mean coverage and the coverage ratio for each position
            total_coverage = sum(row[2] for row in data)
            # Raises ZeroDivisionError when the file has no records; the
            # handler below reports it and the loop moves on to the next file
            mean_coverage = total_coverage / len(data)
            ratio_data = [[str(row[0]), row[1], str(row[2] / mean_coverage)] for row in data]
            # Write the results to a text file
            with open(output_file, 'w') as out_f:
                out_f.write('Position\tN\tRATIO\n')
                out_f.writelines('\t'.join(row) + '\n' for row in ratio_data)
                    
    # Best-effort batch run: report any failure and continue with the next file
    except Exception as e:
        print(f"Unknown error processing {file_path}: {e}")
```

### pandas version

```python
import os
import gzip
import pandas as pd
import glob

# pandas flat-script version: for each '*.coverage.gz' file in input_dir,
# compute every record's coverage relative to the file's mean coverage and
# save the table to output_dir as a tab-separated '<name>_coverage.txt' file.
input_dir = '/nfs/research/goldman/zihao/Datas/p1/File_5_coverage/Downloads/'
output_dir = '/nfs/research/goldman/zihao/Datas/p1/File_5_coverage/Decompress/'

# Get all file paths that end with .coverage.gz
file_paths = glob.glob(os.path.join(input_dir, '*.coverage.gz'))

# Process each file independently; a failure in one file must not stop the batch
for file_path in file_paths:
    # Construct the output file name
    output_file = os.path.join(output_dir, os.path.basename(file_path).replace('.coverage.gz', '_coverage.txt'))
    try:
        with gzip.open(file_path, 'rt') as f:
            # Drop '##' header lines and split the remaining lines on tabs
            data = [line.strip().split('\t') for line in f if not line.startswith('##')]

        # An input without records yields a frame with no columns, so the
        # iloc lookup below raises IndexError and the file is reported/skipped
        df = pd.DataFrame(data)
        df[['Position', 'N', 'Coverage']] = df.iloc[:, 0].str.split(',', expand=True)
        # NOTE(review): the drop is positional. When the input lines contain
        # no tabs the columns here are [0, 'Position', 'N', 'Coverage'], so
        # this removes the raw column AND 'N' - confirm that is intended.
        df = df.drop(df.columns[[0, 2]], axis=1)

        coverage = df['Coverage'].astype(int)
        # Keep the mean as a float: truncating it to int (as this script used
        # to do) skews every ratio and turns any mean below 1 into a division
        # by zero.
        mean_coverage = coverage.sum() / len(df)
        if mean_coverage == 0:
            file_id = os.path.basename(file_path)[:10]
            print(f"Error processing {file_id}: mean coverage is zero")
            continue
        df['RATIO'] = coverage / mean_coverage

        # Save the results to a text file
        df.to_csv(output_file, sep='\t', header=True, index=False)
        # Delete the DataFrame to free up memory
        del df
    except gzip.BadGzipFile:
        print(f"Skipping {file_path}: gzip decompression failed.")
    except (IndexError, EOFError, ValueError, TypeError) as e:
        # IndexError: no records; EOFError: truncated gzip stream;
        # ValueError/TypeError: rows that are not 'position,N,coverage'
        file_id = os.path.basename(file_path)[:10]
        print(f"Error processing {file_id}: {e}")
```