**Preface:**
This notebook takes a filtered set of UniProt IDs and uses it to filter the full AFDB clusters dataset. This allows structural clusters based on the AFDB to be generated, so that structural diversity can be explored within a more focused set of enzymes.

In [3]:
# Mount Google Drive so the '/content/drive/...' paths used in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Use this cell to determine the number of CPU cores you've been assigned, and use that number to set the number of processes: pool = multiprocessing.Pool(processes=**your number of cores**). If you have 8 cores, put 8 here. Typically a high-RAM CPU runtime will have 8 cores on an AMD EPYC 7B12, and a free CPU runtime will have 2 cores on an Intel(R) Xeon(R). It is highly recommended to purchase Colab Pro to run a high-RAM runtime; otherwise this notebook will take an extremely long time to run.

In [None]:
import multiprocessing
# Report how many CPU cores this runtime exposes.
print(f"You have {multiprocessing.cpu_count()} cores in this runtime.")

This first cell takes the UniProt IDs as input, in the form of a TSV file. To get this file, go to UniProt and use the advanced filters to generate the dataset. By default UniProt adds several columns besides the UniProt ID; uncheck all of them before downloading the dataset.

In [None]:
import multiprocessing
import pandas as pd
import gzip
import time
from datetime import datetime

# File paths (Google Drive locations; edit these to match your own Drive layout)
# TSV of UniProt IDs used as the filter
uniprotkb_file_path = '/content/drive/MyDrive/NREL/uniprotkb_go_0016787_AND_length_200_TO_2024_06_03.tsv'
# Gzipped TSV of the AFDB clusters dataset that gets filtered
afdb_clusters_file_path = '/content/drive/MyDrive/NREL/5-allmembers-repId-entryId-cluFlag-taxId.tsv.gz'

#Generates output file with datetime to prevent overwriting previous runs
def generate_unique_filename(base_path, extension):
    """Build a file name that embeds the current local date and time.

    Args:
        base_path: Path prefix, without timestamp or extension.
        extension: File extension, without the leading dot.

    Returns:
        A string of the form ``<base_path>_<YYYYMMDD_HHMMSS>.<extension>``.
    """
    # Second-resolution stamp, e.g. '20240603_234046'
    stamp = format(datetime.now(), '%Y%m%d_%H%M%S')
    return "{}_{}.{}".format(base_path, stamp, extension)
# Timestamped destination for the filtered AFDB rows
output_file_path = generate_unique_filename('/content/drive/MyDrive/NREL/FilteredAFDB', 'tsv')

# Create (or truncate) the file at output_file_path so that it exists before rows are appended to it
with open(output_file_path, 'w'):
    pass


In [None]:
try:
    # Load the smaller dataset (uniprotkb) into a pandas DataFrame and keep its
    # first column as a set so membership tests against the large file are fast.
    uniprotkb_df = pd.read_csv(uniprotkb_file_path, sep='\t', header=None)
    uniprotkb_set = set(uniprotkb_df[0])

    def filter_large_file(chunk, uniprotkb_set, output_file_path=None):
        """Return the rows of ``chunk`` whose column 1 is in ``uniprotkb_set``.

        Rows with a missing value in column 3 are dropped first.

        Args:
            chunk: DataFrame read with ``header=None`` (integer column labels).
            uniprotkb_set: Set of identifiers to keep.
            output_file_path: Optional. When given, the filtered rows are also
                appended to this file. ``run_parallel`` does not use it:
                appends issued by several processes at once are not guaranteed
                to stay intact, so all writing is done by the parent process.

        Returns:
            The filtered DataFrame.
        """
        # Filter out rows where the 4th column is missing
        chunk = chunk[pd.notna(chunk[3])]

        # The identifier to match lives in column 1, not column 0
        filtered_chunk = chunk[chunk[1].isin(uniprotkb_set)]

        if output_file_path is not None:
            with open(output_file_path, 'a') as f:
                filtered_chunk.to_csv(f, sep='\t', index=False, header=False)

        return filtered_chunk

    def run_parallel(processes=8):
        """Filter the AFDB clusters file chunk by chunk using a process pool.

        Workers only filter; the parent process writes every result to
        ``output_file_path`` in input order, so rows from different chunks
        cannot be interleaved.

        Args:
            processes: Number of worker processes. Adjust to the number of
                cores in your runtime. Also caps how many chunks are in flight
                at once, which bounds memory use.

        Raises:
            Any exception raised inside a worker is re-raised here by
            ``AsyncResult.get()`` instead of being silently dropped.
        """
        # Pool(processes=None) means "use all cores"; mirror that for the cap
        max_in_flight = processes or multiprocessing.cpu_count()
        pending = []  # AsyncResult objects, oldest first

        with multiprocessing.Pool(processes=processes) as pool, \
                gzip.open(afdb_clusters_file_path, 'rt') as file, \
                open(output_file_path, 'a') as out:

            def write_oldest_result():
                # Blocks until the oldest task finishes; re-raises worker errors
                filtered_chunk = pending.pop(0).get()
                filtered_chunk.to_csv(out, sep='\t', index=False, header=False)
                print("Processed a chunk")

            # Read the larger dataset in chunks; adjust chunk size as needed.
            for chunk in pd.read_csv(file, sep='\t', header=None, chunksize=10000000):
                pending.append(pool.apply_async(filter_large_file, args=(chunk, uniprotkb_set)))
                if len(pending) >= max_in_flight:
                    write_oldest_result()

            # Drain whatever is still running
            while pending:
                write_oldest_result()

            # Close the pool and wait for all processes to finish
            pool.close()
            pool.join()

    def check_and_correct_output_file(output_file_path, afdb_clusters_file_path):
        """Repair output rows that have fewer than 4 non-missing values.

        Each incomplete row is replaced by the first row of the AFDB clusters
        file that has the same value in column 1. The reference file is
        streamed in chunks rather than loaded whole. Rows without a match are
        reported and left unchanged.

        Args:
            output_file_path: Filtered TSV to check; rewritten if corrected.
            afdb_clusters_file_path: Reference TSV (may be gzip-compressed;
                pandas infers the compression from the file extension).
        """
        try:
            output_df = pd.read_csv(output_file_path, sep='\t', header=None)
        except pd.errors.EmptyDataError:
            # No row matched the filter, so there is nothing to verify
            print("Output file is empty; nothing to check")
            return

        # Find rows with less than 4 non-missing columns
        incomplete_rows = output_df[output_df.count(axis=1) < 4]

        if not incomplete_rows.empty:
            wanted = set(incomplete_rows[1].dropna())
            replacements = {}

            # Stream the reference file and keep only the rows we need
            reference_chunks = pd.read_csv(
                afdb_clusters_file_path, sep='\t', header=None, chunksize=10000000
            )
            for reference_chunk in reference_chunks:
                hits = reference_chunk[reference_chunk[1].isin(wanted)]
                for values in hits.itertuples(index=False, name=None):
                    # setdefault keeps the first occurrence of each identifier
                    replacements.setdefault(values[1], values)

            for index, row in incomplete_rows.iterrows():
                identifier = row[1]
                correct_row = replacements.get(identifier)
                if correct_row is None:
                    print(f"No reference row found for {identifier!r}; leaving it unchanged")
                    continue
                output_df.loc[index] = list(correct_row)

            # Write the corrected DataFrame to the output file
            output_df.to_csv(output_file_path, sep='\t', index=False, header=False)

        print("Checked and corrected the output file")

    if __name__ == '__main__':
        multiprocessing.freeze_support()

        # Measure the execution time
        start_time = time.time()

        # Run the filtering function in parallel
        run_parallel()

        # Check and correct the output file
        check_and_correct_output_file(output_file_path, afdb_clusters_file_path)

        # Calculate the execution time
        execution_time = time.time() - start_time
        print(f"Execution time: {execution_time} seconds")

except Exception as e:
    print(f"An error occurred: {str(e)}")

**The following function removes all singletons, if you want singletons to be included do not run this cell.**

In [None]:
# Removing singletons
def remove_lines_with_4_in_third_column(filename):
    """Drop every row whose third column equals 4 and rewrite the file in place.

    The file is read as a header-less, tab-separated table. According to the
    notebook text, the rows removed here are the singletons.

    The filter is a single vectorised comparison rather than a row-by-row
    ``drop``, so the run time stays linear in the number of rows. A single
    summary line is printed in place of per-row progress messages.

    Args:
        filename: Path of the TSV file to filter. It is overwritten.
    """
    try:
        df = pd.read_csv(filename, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # An empty file has no rows to remove; leave it untouched
        print("File is empty; nothing to remove")
        return

    total_rows = len(df)

    # Keep only the rows where the third column is not 4
    kept = df[df[2] != 4]

    # Write the filtered DataFrame back to the file
    kept.to_csv(filename, sep='\t', index=False, header=False)

    print(f"Removed {total_rows - len(kept)} of {total_rows} rows")

# Call the function on the output file (the file at output_file_path is rewritten in place)
remove_lines_with_4_in_third_column(output_file_path)

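Due to issues, these file paths have to be updated manually. In particular, set `filtered_AFDB_path` to the timestamped `FilteredAFDB_<YYYYMMDD_HHMMSS>.tsv` file written by the cells above.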

In [4]:
import multiprocessing
import pandas as pd
import gzip
import time
from datetime import datetime
# Gzipped TSV of the all-vs-all similarity dataset that gets filtered
all_vs_all_clusters_path = '/content/drive/MyDrive/NREL/6-all-vs-all-similarity-queryId_targetId_eValue.tsv.gz'
# Filtered AFDB file from an earlier run; update the timestamp in this name to match your own run
filtered_AFDB_path = '/content/drive/MyDrive/NREL/FilteredAFDB_20240603_234046.tsv'
# Destination TSV for the filtered all-vs-all rows
allvall_output_path = '/content/drive/MyDrive/NREL/Filtered_allvsall_alkaline.tsv'


Before running, ensure the number of processes is correct. This is expected to take ~35 min when using 8 processes. There are ~1.1 billion lines in this dataset.

In [5]:
try:
    # Load the smaller dataset into a pandas DataFrame and keep column 1 as a
    # set so membership tests against the large file are fast.
    filtered_AFDB_df = pd.read_csv(filtered_AFDB_path, sep='\t', header=None)
    filtered_AFDB_set = set(filtered_AFDB_df[1])  # Use column 1 for filtering

    def filter_large_file(chunk, filtered_AFDB_set):
        """Return the rows of ``chunk`` whose column 1 is in ``filtered_AFDB_set``.

        Args:
            chunk: DataFrame read with ``header=None`` (integer column labels).
            filtered_AFDB_set: Set of identifiers to keep.

        Returns:
            The filtered DataFrame.
        """
        return chunk[chunk[1].isin(filtered_AFDB_set)]  # Use column 1 for filtering

    def run_parallel(processes=8):
        """Filter the all-vs-all file chunk by chunk using a process pool.

        Up to ``processes`` chunks are kept in flight at a time, so workers
        filter earlier chunks while the parent process reads the next one.
        Results are written by the parent process in input order.

        NOTE: the output file is opened in append mode, so re-running this
        cell with the same ``allvall_output_path`` adds the rows again; delete
        or rename the previous file first.

        Args:
            processes: Number of worker processes. Adjust to the number of
                cores in your runtime. Also caps how many chunks are in flight
                at once; lower it (or the chunk size) if memory runs short.

        Raises:
            Any exception raised inside a worker is re-raised here by
            ``AsyncResult.get()``.
        """
        # Pool(processes=None) means "use all cores"; mirror that for the cap
        max_in_flight = processes or multiprocessing.cpu_count()
        pending = []  # AsyncResult objects, oldest first

        with multiprocessing.Pool(processes=processes) as pool, \
                gzip.open(all_vs_all_clusters_path, 'rt') as file, \
                open(allvall_output_path, 'a') as out:

            def write_oldest_result():
                # Blocks until the oldest task finishes; re-raises worker errors
                filtered_chunk = pending.pop(0).get()
                filtered_chunk.to_csv(out, sep='\t', index=False, header=False)

            # Read the larger dataset in chunks and filter in parallel
            for chunk in pd.read_csv(file, sep='\t', header=None, chunksize=10000000):
                pending.append(pool.apply_async(filter_large_file, args=(chunk, filtered_AFDB_set)))
                # Only wait once the window is full, so the pool stays busy
                if len(pending) >= max_in_flight:
                    write_oldest_result()

            # Drain whatever is still running
            while pending:
                write_oldest_result()

            # Close the pool and wait for all processes to finish
            pool.close()
            pool.join()

    if __name__ == '__main__':
        multiprocessing.freeze_support()

        # Measure the execution time
        start_time = time.time()

        # Run the filtering function in parallel
        run_parallel()

        # Calculate the execution time
        execution_time = time.time() - start_time
        print(f"Execution time: {execution_time} seconds")

except Exception as e:
    print(f"An error occurred: {str(e)}")

Execution time: 2104.625983953476 seconds
