In [1]:
import re

In [2]:
# Where to read the sequence records from and where to put the filtered lines
input_file_path = 'All_Sequences.gb'
output_file_path = 'parsed_output.txt'

# A line is a candidate for the output when at least one of these is found in it
patterns = [
    re.compile(r'^LOCUS\s+DBTL'),
    re.compile(r'/label="PP_'),
    re.compile(r'/label="J23119"')
]

# A line is dropped when any of these is found in it, even if it also
# matches one of the keep patterns above
ignore_patterns = [
    re.compile(r'/label="PP_RS'),
    re.compile(r'/label="PP_dCas9_gRNA_Array_F')
]


def _matches_any(regexes, text):
    """Return True when at least one compiled regex in `regexes` is found in `text`."""
    return any(regex.search(text) for regex in regexes)


# Stream the input line by line and copy the wanted lines verbatim
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    for line in infile:
        # Exclusion is tested first, so it always wins over inclusion
        wanted = not _matches_any(ignore_patterns, line) and _matches_any(patterns, line)
        if wanted:
            outfile.write(line)

print(f'Parsing complete. Matching lines have been written to {output_file_path}')

Parsing complete. Matching lines have been written to parsed_output.txt


In [4]:
import re
import csv

# Define the input and output file paths
input_file_path = 'parsed_output.txt'
output_file_path = 'formatted_output.csv'

# Separator used to pack every PP_ label of one record into the single
# PP_Numbers column, so each CSV row has exactly as many fields as the header.
# NOTE(review): assumes PP_ labels never contain ';' themselves -- confirm.
pp_separator = ';'

# Patterns for the three kinds of line found in the filtered file:
#   - a LOCUS line whose name starts with DBTL (captures the name)
#   - a /label="PP_..." qualifier (captures the full label text)
#   - a /label="J23119..." qualifier (only counted)
dbtl_pattern = re.compile(r'^LOCUS\s+(DBTL\S*)')
pp_pattern = re.compile(r'/label="(PP_[^"]*)"')
j23119_pattern = re.compile(r'/label="(J23119[^"]*)"')

# Completed records, in file order
entries = []
# Placeholder record with no DBTL name: label lines seen before the first
# LOCUS line accumulate here and are discarded, because a record is only
# saved when its 'dbtl' value is set.
current_entry = {'dbtl': None, 'pp_numbers': [], 'j23119_count': 0}

# Group the label lines under the LOCUS line that precedes them
with open(input_file_path, 'r') as infile:
    for line in infile:
        # A LOCUS line closes the record in progress and opens a new one
        dbtl_match = dbtl_pattern.search(line)
        if dbtl_match:
            if current_entry['dbtl']:
                entries.append(current_entry)
            current_entry = {'dbtl': dbtl_match.group(1), 'pp_numbers': [], 'j23119_count': 0}
            continue

        # Collect PP_ labels in the order they appear
        pp_match = pp_pattern.search(line)
        if pp_match:
            current_entry['pp_numbers'].append(pp_match.group(1))
            continue

        # Count J23119 labels; the label text itself is not stored
        if j23119_pattern.search(line):
            current_entry['j23119_count'] += 1

# The final record has no following LOCUS line to flush it
if current_entry['dbtl']:
    entries.append(current_entry)

# Write the formatted output as a CSV file.
# Each row has exactly three fields matching the header. Previously the PP_
# labels were spread over a variable number of columns, which pushed the
# J23119 count into a different column for every record.
with open(output_file_path, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['DBTL', 'PP_Numbers', 'J23119_Count'])
    for entry in entries:
        csvwriter.writerow([
            entry['dbtl'],
            pp_separator.join(entry['pp_numbers']),
            entry['j23119_count'],
        ])

print(f'Formatting complete. Output has been written to {output_file_path}')

Formatting complete. Output has been written to formatted_output.csv
