V2 - removes sequences which are seen entirely in other sequences in the FASTA


In [3]:
def parse_fasta(all_patents):
    sequences = {}
    with open(input_file, 'r') as f:
        current_id = None
        current_seq = ''
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_id is not None:
                    sequences[current_id] = current_seq
                current_id = line[1:]
                current_seq = ''
            else:
                current_seq += line
        if current_id is not None:
            sequences[current_id] = current_seq
    return sequences

def filter_sequences(sequences, min_length=20):
    filtered_sequences = {}
    for seq_id, seq in sequences.items():
        if len(seq) >= min_length:
            filtered_sequences[seq_id] = seq
    return filtered_sequences

def write_fasta(output_file, sequences):
    with open(output_file, 'w') as f:
        for seq_id, seq in sequences.items():
            f.write(f'>{seq_id}\n')
            f.write(f'{seq}\n')

def filter_and_save(input_file, output_file):
    sequences = parse_fasta(input_file)
    filtered_sequences = filter_sequences(sequences)
    write_fasta(output_file, filtered_sequences)

if __name__ == "__main__":
    input_file = input("Enter input file path: ")
    output_file = input("Enter output file path: ")
    filter_and_save(input_file, output_file)


Enter input file path: /content/all_patents.FASTA
Enter output file path: /content/all_patents_out.FASTA


In [4]:
def parse_fasta(input_file):
    sequences = {}
    with open(input_file, 'r') as f:
        current_id = None
        current_seq = ''
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_id is not None:
                    sequences[current_id] = current_seq
                current_id = line[1:]
                current_seq = ''
            else:
                current_seq += line
        if current_id is not None:
            sequences[current_id] = current_seq
    return sequences

def filter_sequences(sequences, min_length=20):
    filtered_sequences = {}
    removed_duplicates = []
    for seq_id, seq in sequences.items():
        if len(seq) >= min_length:
            is_duplicate = False
            for other_id, other_seq in sequences.items():
                if seq != other_seq and seq in other_seq:
                    is_duplicate = True
                    removed_duplicates.append(seq_id)
                    break
            if not is_duplicate:
                filtered_sequences[seq_id] = seq
    return filtered_sequences, removed_duplicates

def write_fasta(output_file, sequences):
    with open(output_file, 'w') as f:
        for seq_id, seq in sequences.items():
            f.write(f'>{seq_id}\n')
            f.write(f'{seq}\n')

def filter_and_save(input_file, output_file):
    sequences = parse_fasta(input_file)
    filtered_sequences, removed_duplicates = filter_sequences(sequences)
    write_fasta(output_file, filtered_sequences)
    if removed_duplicates:
        print("Removed sequences where the entirety of the query matches a portion of the target:")
        for seq_id in removed_duplicates:
            print(seq_id)

if __name__ == "__main__":
    input_file = input("Enter input file path: ")
    output_file = input("Enter output file path: ")
    filter_and_save(input_file, output_file)


Enter input file path: /content/all_patents.FASTA
Enter output file path: /content/all_patents_dups.FASTA
Removed sequences where the entirety of the query matches a portion of the target:
QHX49160.1 Sequence 10 from patent US 10415028
WVS58503.1 Sequence 51 from patent US 11820831
WVS58499.1 Sequence 47 from patent US 11820831
WVS58497.1 Sequence 45 from patent US 11820831
WVS58495.1 Sequence 43 from patent US 11820831
WVS58477.1 Sequence 25 from patent US 11820831
UOL82841.1 Sequence 51 from patent US 11192959
UOL82837.1 Sequence 47 from patent US 11192959
UOL82835.1 Sequence 45 from patent US 11192959
UOL82833.1 Sequence 43 from patent US 11192959
UOL82815.1 Sequence 25 from patent US 11192959


In [6]:
def parse_fasta(input_file):
    sequences = {}
    with open(input_file, 'r') as f:
        current_id = None
        current_seq = ''
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_id is not None:
                    sequences[current_id] = current_seq
                current_id = line[1:]
                current_seq = ''
            else:
                current_seq += line
        if current_id is not None:
            sequences[current_id] = current_seq
    return sequences

def filter_sequences(sequences, min_length=20):
    filtered_sequences = {}
    removed_duplicates = []
    unique_patent_sequence_names = set()
    all_patent_sequence_names = set()
    for seq_id, seq in sequences.items():
        if len(seq) >= min_length:
            is_duplicate = False
            for other_id, other_seq in sequences.items():
                if seq != other_seq and seq in other_seq:
                    is_duplicate = True
                    removed_duplicates.append(seq_id)
                    break
            if not is_duplicate:
                filtered_sequences[seq_id] = seq
                if "patent" in seq_id:
                    all_patent_sequence_names.add(seq_id)
                    _, seq_name = seq_id.split("patent", 1)
                    unique_patent_sequence_names.add(seq_name.strip())
    return filtered_sequences, removed_duplicates, unique_patent_sequence_names

def write_fasta(output_file, sequences):
    with open(output_file, 'w') as f:
        for seq_id, seq in sequences.items():
            f.write(f'>{seq_id}\n')
            f.write(f'{seq}\n')

def filter_and_save(input_file, output_file):
    sequences = parse_fasta(input_file)
    filtered_sequences, removed_duplicates, unique_patent_sequence_names = filter_sequences(sequences)
    write_fasta(output_file, filtered_sequences)
    if removed_duplicates:
        print("Removed sequences where the entirety of the query matches a portion of the target:")
        for seq_id in removed_duplicates:
            print(seq_id)
    if unique_patent_sequence_names:
        print("Unique sequence names starting from 'patent' in the final set:")
        for name in unique_patent_sequence_names:
            print(name)

if __name__ == "__main__":
    input_file = input("Enter input file path: ")
    output_file = input("Enter output file path: ")
    filter_and_save(input_file, output_file)


Enter input file path: /content/all_patents.FASTA
Enter output file path: /content/all_patents_test.FASTA
Removed sequences where the entirety of the query matches a portion of the target:
QHX49160.1 Sequence 10 from patent US 10415028
WVS58503.1 Sequence 51 from patent US 11820831
WVS58499.1 Sequence 47 from patent US 11820831
WVS58497.1 Sequence 45 from patent US 11820831
WVS58495.1 Sequence 43 from patent US 11820831
WVS58477.1 Sequence 25 from patent US 11820831
UOL82841.1 Sequence 51 from patent US 11192959
UOL82837.1 Sequence 47 from patent US 11192959
UOL82835.1 Sequence 45 from patent US 11192959
UOL82833.1 Sequence 43 from patent US 11192959
UOL82815.1 Sequence 25 from patent US 11192959
Unique sequence names starting from 'patent' in the final set:
US 11192959
US 10415028
US 11820831


Now with printed numbers of original and removed sequences, and unique patents


In [7]:
def parse_fasta(input_file):
    sequences = {}
    with open(input_file, 'r') as f:
        current_id = None
        current_seq = ''
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_id is not None:
                    sequences[current_id] = current_seq
                current_id = line[1:]
                current_seq = ''
            else:
                current_seq += line
        if current_id is not None:
            sequences[current_id] = current_seq
    return sequences

def filter_sequences(sequences, min_length=20):
    original_count = len(sequences)
    filtered_sequences = {}
    removed_duplicates = []
    removed_length = []
    unique_patent_sequence_names = set()
    for seq_id, seq in sequences.items():
        if len(seq) >= min_length:
            is_duplicate = False
            for other_id, other_seq in sequences.items():
                if seq != other_seq and seq in other_seq:
                    is_duplicate = True
                    removed_duplicates.append(seq_id)
                    break
            if not is_duplicate:
                filtered_sequences[seq_id] = seq
                if "patent" in seq_id:
                    _, seq_name = seq_id.split("patent", 1)
                    unique_patent_sequence_names.add(seq_name.strip())
            else:
                removed_length.append(seq_id)
    remaining_count = len(filtered_sequences)
    removed_duplicates_count = len(removed_duplicates)
    removed_length_count = len(removed_length)
    return filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count

def write_fasta(output_file, sequences):
    with open(output_file, 'w') as f:
        for seq_id, seq in sequences.items():
            f.write(f'>{seq_id}\n')
            f.write(f'{seq}\n')

def filter_and_save(input_file, output_file):
    sequences = parse_fasta(input_file)
    filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count = filter_sequences(sequences)
    write_fasta(output_file, filtered_sequences)
    print("Number of original sequences:", original_count)
    print("Number removed based on identical sequences:", removed_duplicates_count)
    print("Number removed based on length:", removed_length_count)
    print("Number remaining:", remaining_count)

if __name__ == "__main__":
    input_file = input("Enter input file path: ")
    output_file = input("Enter output file path: ")
    filter_and_save(input_file, output_file)


Enter input file path: /content/all_patents.FASTA
Enter output file path: /content/all_patents_1092_out.FASTA
Number of original sequences: 1092
Number removed based on identical sequences: 120
Number removed based on length: 120
Number remaining: 559


Reintroducing unique patent numbers list

In [10]:
def parse_fasta(input_file):
    sequences = {}
    with open(input_file, 'r') as f:
        current_id = None
        current_seq = ''
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_id is not None:
                    sequences[current_id] = current_seq
                current_id = line[1:]
                current_seq = ''
            else:
                current_seq += line
        if current_id is not None:
            sequences[current_id] = current_seq
    return sequences

def filter_sequences(sequences, min_length=20):
    original_count = len(sequences)
    filtered_sequences = {}
    removed_duplicates = []
    removed_length = []
    unique_patent_sequence_names = set()
    for seq_id, seq in sequences.items():
        if len(seq) >= min_length:
            is_duplicate = False
            for other_id, other_seq in sequences.items():
                if seq_id != other_id and seq == other_seq:
                    is_duplicate = True
                    removed_duplicates.append(seq_id)
                    break
            if not is_duplicate:
                filtered_sequences[seq_id] = seq
                if "patent" in seq_id:
                    _, seq_name = seq_id.split("patent", 1)
                    unique_patent_sequence_names.add(seq_name.strip())
            else:
                removed_length.append(seq_id)
    remaining_count = original_count - len(removed_duplicates) - len(removed_length)
    removed_duplicates_count = len(removed_duplicates)
    removed_length_count = len(removed_length)
    return filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names

def write_fasta(output_file, sequences):
    with open(output_file, 'w') as f:
        for seq_id, seq in sequences.items():
            f.write(f'>{seq_id}\n')
            f.write(f'{seq}\n')

def filter_and_save(input_file, output_file):
    sequences = parse_fasta(input_file)
    filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names = filter_sequences(sequences)
    write_fasta(output_file, filtered_sequences)
    print("Number of original sequences:", original_count)
    print("Number removed based on identical sequences:", removed_duplicates_count)
    print("Number removed based on length:", removed_length_count)
    print("Number remaining:", remaining_count)
    print("List of unique patent numbers:")
    for patent_number in unique_patent_sequence_names:
        print(patent_number)

if __name__ == "__main__":
    input_file = input("Enter input file path: ")
    output_file = input("Enter output file path: ")
    filter_and_save(input_file, output_file)


Enter input file path: /content/all_patents_test.FASTA
Enter output file path: /content/all_patents_test2.FASTA
Number of original sequences: 275
Number removed based on identical sequences: 0
Number removed based on length: 0
Number remaining: 275
List of unique patent numbers:
US 9540625
US 7378091
US 8466263
US 10487153
US 8591885
US 5912333
US 10415028
US 10053702
US 10920213
US 9968885
US 5972684
US 9359446


Combining scripts


In [12]:
def parse_fasta(input_file):
    sequences = {}
    with open(input_file, 'r') as f:
        current_id = None
        current_seq = ''
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_id is not None:
                    sequences[current_id] = current_seq
                current_id = line[1:]
                current_seq = ''
            else:
                current_seq += line
        if current_id is not None:
            sequences[current_id] = current_seq
    return sequences

def filter_sequences(sequences, min_length=20):
    original_count = len(sequences)
    filtered_sequences = {}
    removed_duplicates = []
    removed_length = []
    unique_patent_sequence_names = set()
    for seq_id, seq in sequences.items():
        if len(seq) >= min_length:
            is_duplicate = False
            for other_id, other_seq in sequences.items():
                if seq_id != other_id and seq in other_seq:
                    is_duplicate = True
                    removed_duplicates.append(seq_id)
                    break
            if not is_duplicate:
                filtered_sequences[seq_id] = seq
                if "patent" in seq_id:
                    _, seq_name = seq_id.split("patent", 1)
                    unique_patent_sequence_names.add(seq_name.strip())
            else:
                removed_length.append(seq_id)
    remaining_count = len(filtered_sequences)
    removed_duplicates_count = len(removed_duplicates)
    removed_length_count = len(removed_length)
    return filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names

def write_fasta(output_file, sequences):
    with open(output_file, 'w') as f:
        for seq_id, seq in sequences.items():
            f.write(f'>{seq_id}\n')
            f.write(f'{seq}\n')

def filter_and_save(input_file, output_file):
    sequences = parse_fasta(input_file)
    filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names = filter_sequences(sequences)
    write_fasta(output_file, filtered_sequences)
    print("Number of original sequences:", original_count)
    print("Number removed based on identical sequences:", removed_duplicates_count)
    print("Number removed based on length:", removed_length_count)
    print("Number remaining:", remaining_count)
    print("List of unique patent numbers:")
    for patent_number in unique_patent_sequence_names:
        print(patent_number)

if __name__ == "__main__":
    input_file = input("Enter input file path: ")
    output_file = input("Enter output file path: ")
    filter_and_save(input_file, output_file)


Enter input file path: /content/all_patents_1092.FASTA
Enter output file path: /content/all_patents_1092_out.FASTA
Number of original sequences: 1092
Number removed based on identical sequences: 443
Number removed based on length: 443
Number remaining: 236
List of unique patent numbers:
US 9540625
US 7378091
US 8466263
US 10487153
US 8591885
US 5912333
US 10053702
US 10920213
US 9968885
US 5972684
US 9359446


In [19]:
import webbrowser

def parse_fasta(input_file):
    sequences = {}
    with open(input_file, 'r') as f:
        current_id = None
        current_seq = ''
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_id is not None:
                    sequences[current_id] = current_seq
                current_id = line[1:]
                current_seq = ''
            else:
                current_seq += line
        if current_id is not None:
            sequences[current_id] = current_seq
    return sequences

def filter_sequences(sequences, min_length=20):
    original_count = len(sequences)
    filtered_sequences = {}
    removed_duplicates = []
    removed_length = []
    unique_patent_sequence_names = set()
    for seq_id, seq in sequences.items():
        if len(seq) >= min_length:
            is_duplicate = False
            for other_id, other_seq in sequences.items():
                if seq_id != other_id and seq in other_seq:
                    is_duplicate = True
                    removed_duplicates.append(seq_id)
                    break
            if not is_duplicate:
                filtered_sequences[seq_id] = seq
                if "patent" in seq_id:
                    _, seq_name = seq_id.split("patent", 1)
                    unique_patent_sequence_names.add(seq_name.strip())
            else:
                removed_length.append(seq_id)
    remaining_count = len(filtered_sequences)
    removed_duplicates_count = len(removed_duplicates)
    removed_length_count = len(removed_length)
    return filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names

def write_fasta(output_file, sequences):
    with open(output_file, 'w') as f:
        for seq_id, seq in sequences.items():
            f.write(f'>{seq_id}\n')
            f.write(f'{seq}\n')

def scrape_patent_name(patent_number):
    # Construct the URL for the patent search
    url = f"https://patents.google.com/?q={patent_number.replace(' ', '')}"
    # Send a GET request to the URL
    response = requests.get(url)
    # Parse the HTML content of the response
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the span element by ID and class
    span_element = soup.find('span', id='htmlContent', class_='style-scope raw-html')
    # Extract the text of the span element
    patent_name = span_element.text.strip() if span_element else "Patent name not found"
    return patent_name

def filter_and_save(input_file, output_file):
    sequences = parse_fasta(input_file)
    filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names = filter_sequences(sequences)
    write_fasta(output_file, filtered_sequences)
    print("Number of original sequences:", original_count)
    print("Number removed based on identical sequences:", removed_duplicates_count)
    print("Number removed based on length:", removed_length_count)
    print("Number remaining:", remaining_count)
    print("List of unique patent numbers:")
    for patent_number in unique_patent_sequence_names:
        print(patent_number.replace(' ', ''))
        patent_url = f"https://patents.google.com/?q={patent_number.replace(' ', '')}"
        print(f"Patent: {patent_number} - URL: {patent_url}")
if __name__ == "__main__":
    input_file = input("Enter input file path: ")
    output_file = input("Enter output file path: ")
    filter_and_save(input_file, output_file)


Enter input file path: /content/all_patents_1092.FASTA
Enter output file path: /content/all_patents_1092_2.FASTA
Number of original sequences: 1092
Number removed based on identical sequences: 443
Number removed based on length: 443
Number remaining: 236
List of unique patent numbers:
US9540625
Patent: US 9540625 - URL: https://patents.google.com/?q=US9540625
US7378091
Patent: US 7378091 - URL: https://patents.google.com/?q=US7378091
US8466263
Patent: US 8466263 - URL: https://patents.google.com/?q=US8466263
US10487153
Patent: US 10487153 - URL: https://patents.google.com/?q=US10487153
US8591885
Patent: US 8591885 - URL: https://patents.google.com/?q=US8591885
US5912333
Patent: US 5912333 - URL: https://patents.google.com/?q=US5912333
US10053702
Patent: US 10053702 - URL: https://patents.google.com/?q=US10053702
US10920213
Patent: US 10920213 - URL: https://patents.google.com/?q=US10920213
US9968885
Patent: US 9968885 - URL: https://patents.google.com/?q=US9968885
US5972684
Patent: US 

In [20]:
from bs4 import BeautifulSoup

def parse_fasta(input_file):
    sequences = {}
    with open(input_file, 'r') as f:
        current_id = None
        current_seq = ''
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_id is not None:
                    sequences[current_id] = current_seq
                current_id = line[1:]
                current_seq = ''
            else:
                current_seq += line
        if current_id is not None:
            sequences[current_id] = current_seq
    return sequences

def filter_sequences(sequences, min_length=20):
    original_count = len(sequences)
    filtered_sequences = {}
    removed_duplicates = []
    removed_length = []
    unique_patent_sequence_names = set()
    for seq_id, seq in sequences.items():
        if len(seq) >= min_length:
            is_duplicate = False
            for other_id, other_seq in sequences.items():
                if seq_id != other_id and seq in other_seq:
                    is_duplicate = True
                    removed_duplicates.append(seq_id)
                    break
            if not is_duplicate:
                filtered_sequences[seq_id] = seq
                if "patent" in seq_id:
                    _, seq_name = seq_id.split("patent", 1)
                    unique_patent_sequence_names.add(seq_name.strip())
            else:
                removed_length.append(seq_id)
    remaining_count = len(filtered_sequences)
    removed_duplicates_count = len(removed_duplicates)
    removed_length_count = len(removed_length)
    return filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names

def write_fasta(output_file, sequences):
    with open(output_file, 'w') as f:
        for seq_id, seq in sequences.items():
            f.write(f'>{seq_id}\n')
            f.write(f'{seq}\n')

def scrape_patent_name(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find the span element by ID
    span_element = soup.find('span', id='htmlContent')
    # Extract the text of the span element
    patent_name = span_element.text.strip() if span_element else "Patent name not found"
    return patent_name

def filter_and_save(input_file, output_file):
    sequences = parse_fasta(input_file)
    filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names = filter_sequences(sequences)
    write_fasta(output_file, filtered_sequences)
    print("Number of original sequences:", original_count)
    print("Number removed based on identical sequences:", removed_duplicates_count)
    print("Number removed based on length:", removed_length_count)
    print("Number remaining:", remaining_count)
    print("List of unique patent numbers:")
    for patent_number in unique_patent_sequence_names:
        # Scrape the patent name from the provided HTML content
        patent_name = scrape_patent_name(patent_number)
        print(f"Patent: {patent_number} - Name: {patent_name}")

if __name__ == "__main__":
    input_file = input("Enter input file path: ")
    output_file = input("Enter output file path: ")
    filter_and_save(input_file, output_file)


Enter input file path: /content/all_patents_1092.FASTA
Enter output file path: /content/all_patents_1092_21.FASTA
Number of original sequences: 1092
Number removed based on identical sequences: 443
Number removed based on length: 443
Number remaining: 236
List of unique patent numbers:
Patent: US 9540625 - Name: Patent name not found
Patent: US 7378091 - Name: Patent name not found
Patent: US 8466263 - Name: Patent name not found
Patent: US 10487153 - Name: Patent name not found
Patent: US 8591885 - Name: Patent name not found
Patent: US 5912333 - Name: Patent name not found
Patent: US 10053702 - Name: Patent name not found
Patent: US 10920213 - Name: Patent name not found
Patent: US 9968885 - Name: Patent name not found
Patent: US 5972684 - Name: Patent name not found
Patent: US 9359446 - Name: Patent name not found


In [18]:
import requests
from bs4 import BeautifulSoup

def parse_fasta(input_file):
    sequences = {}
    with open(input_file, 'r') as f:
        current_id = None
        current_seq = ''
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_id is not None:
                    sequences[current_id] = current_seq
                current_id = line[1:]
                current_seq = ''
            else:
                current_seq += line
        if current_id is not None:
            sequences[current_id] = current_seq
    return sequences

def filter_sequences(sequences, min_length=20):
    original_count = len(sequences)
    filtered_sequences = {}
    removed_duplicates = []
    removed_length = []
    unique_patent_sequence_names = set()
    for seq_id, seq in sequences.items():
        if len(seq) >= min_length:
            is_duplicate = False
            for other_id, other_seq in sequences.items():
                if seq_id != other_id and seq in other_seq:
                    is_duplicate = True
                    removed_duplicates.append(seq_id)
                    break
            if not is_duplicate:
                filtered_sequences[seq_id] = seq
                if "patent" in seq_id:
                    _, seq_name = seq_id.split("patent", 1)
                    unique_patent_sequence_names.add(seq_name.strip())
            else:
                removed_length.append(seq_id)
    remaining_count = len(filtered_sequences)
    removed_duplicates_count = len(removed_duplicates)
    removed_length_count = len(removed_length)
    return filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names

def write_fasta(output_file, sequences):
    with open(output_file, 'w') as f:
        for seq_id, seq in sequences.items():
            f.write(f'>{seq_id}\n')
            f.write(f'{seq}\n')

def scrape_patent_name(patent_number):
    # Construct the URL for the patent search
    url = f"https://patents.google.com/?q={patent_number.replace(' ', '')}"
    # Send a GET request to the URL
    response = requests.get(url)
    # Parse the HTML content of the response
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the title tag in the HTML
    title_element = soup.find('title')
    # Extract the text of the title
    patent_name = title_element.text if title_element else "Patent name not found"
    return patent_name

def filter_and_save(input_file, output_file):
    sequences = parse_fasta(input_file)
    filtered_sequences, removed_duplicates_count, removed_length_count, remaining_count, original_count, unique_patent_sequence_names = filter_sequences(sequences)
    write_fasta(output_file, filtered_sequences)
    print("Number of original sequences:", original_count)
    print("Number removed based on identical sequences:", removed_duplicates_count)
    print("Number removed based on length:", removed_length_count)
    print("Number remaining:", remaining_count)
    print("List of unique patent numbers:")
    for patent_number in unique_patent_sequence_names:
        # Scrape the patent name from Google Patents
        patent_name = scrape_patent_name(patent_number)
        print(f"Patent: {patent_number} - Name: {patent_name}")



if __name__ == "__main__":
    input_file = input("Enter input file path: ")
    output_file = input("Enter output file path: ")
    filter_and_save(input_file, output_file)


Enter input file path: /content/all_patents_1092.FASTA
Enter output file path: /content/all_patents_1092_2.FASTA
Number of original sequences: 1092
Number removed based on identical sequences: 443
Number removed based on length: 443
Number remaining: 236
List of unique patent numbers:
Patent: US 9540625 - Name: Google Patents
Patent: US 7378091 - Name: Google Patents
Patent: US 8466263 - Name: Google Patents
Patent: US 10487153 - Name: Google Patents
Patent: US 8591885 - Name: Google Patents
Patent: US 5912333 - Name: Google Patents
Patent: US 10053702 - Name: Google Patents
Patent: US 10920213 - Name: Google Patents
Patent: US 9968885 - Name: Google Patents
Patent: US 5972684 - Name: Google Patents
Patent: US 9359446 - Name: Google Patents
