
To create index vectors of all PFAM entries from the final_pfams.fasta file, we'll need to parse the FASTA file and extract the PFAM IDs or information of interest.

In [1]:
import Bio
# Show the version of the installed Bio package so the recorded outputs can be tied to it.
print(Bio.__version__)

1.83


In [2]:
# Input FASTA file and destination of the PFAM index table, assigned as a pair.
fasta_path, output_path = (
    "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/final_pfams.fasta",
    "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/pfam_index_vectors.txt",
)

In [3]:
# Preview the FASTA file
def preview_fasta(file_path, num_records=5):
    """
    Preview the first few records of a FASTA file.

    Args:
        file_path: Location of the FASTA file, handed to ``SeqIO.parse``.
        num_records: Maximum number of records to return. Zero or a
            negative value returns an empty list.

    Returns:
        list: One ``(record.id, record.description, str(record.seq))``
        tuple per record, in file order.
    """
    from itertools import islice

    # Clamp so that a non-positive request yields nothing; the previous
    # "append, then test the counter" loop always returned at least one record.
    limit = max(num_records, 0)
    # islice stops pulling from the parser once `limit` records were produced.
    return [
        (record.id, record.description, str(record.seq))
        for record in islice(SeqIO.parse(file_path, "fasta"), limit)
    ]

In [4]:
# Create PFAM index vectors
def create_pfam_index(fasta_file):
    """
    Creates a mapping of PFAM IDs to unique indices.

    Headers in this dataset look like
    ``A0A1I4YJU4_9ENTR/160-195 A0A1I4YJU4.1 PF10417.11;1-cysPrx_C;``.
    The first whitespace-separated token is therefore the sequence ID, not
    the PFAM accession, so the accession is located with the same
    ``PF\\d{5}`` pattern used by ``extract_pfam_ids``.

    Args:
        fasta_file: Location of the FASTA file, handed to ``SeqIO.parse``.

    Returns:
        dict: PFAM ID (``PFxxxxx``, version suffix dropped) -> index.
        Indices start at 0 and follow the order in which each ID is first
        seen in the file.
    """
    import re

    pfam_index = {}

    for record in SeqIO.parse(fasta_file, "fasta"):
        for pfam_id in re.findall(r"PF\d{5}", record.description):
            # setdefault leaves an already-indexed ID untouched; for a new ID
            # the current size of the mapping is the next free index.
            pfam_index.setdefault(pfam_id, len(pfam_index))

    return pfam_index

In [5]:
# Save PFAM index vectors to a file
def save_pfam_index(index_dict, output_file):
    """
    Write the PFAM index mapping to ``output_file``.

    Each entry of ``index_dict`` becomes one line of the form
    ``<PFAM ID><tab><index>``. A confirmation message is printed afterwards.
    """
    with open(output_file, "w") as handle:
        handle.writelines(
            f"{pfam_id}\t{index}\n" for pfam_id, index in index_dict.items()
        )
    print(f"PFAM index vectors saved to {output_file}")

In [6]:
from Bio import SeqIO  # Import SeqIO from the Bio package

# Path to the FASTA file; this re-assigns fasta_path with the same value
# already set in the earlier path-definition cell.
fasta_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/final_pfams.fasta"

# Function to preview the FASTA file
def preview_fasta(file_path, num_records=5):
    """
    Preview the first few records of a FASTA file.

    Args:
        file_path: Location of the FASTA file, handed to ``SeqIO.parse``.
        num_records: Maximum number of records to return. Zero or a
            negative value returns an empty list.

    Returns:
        list: One ``(record.id, record.description, str(record.seq))``
        tuple per record, in file order.
    """
    from itertools import islice

    # A non-positive request must produce no records; the earlier loop
    # appended a record before checking the counter and so returned one anyway.
    limit = max(num_records, 0)
    preview = []
    for record in islice(SeqIO.parse(file_path, "fasta"), limit):
        preview.append((record.id, record.description, str(record.seq)))
    return preview

# Main Execution
if __name__ == "__main__":
    print("Previewing FASTA file...")
    fasta_preview = preview_fasta(fasta_path)
    # Every preview entry is an (id, description, sequence) triple.
    for seq_id, description, sequence in fasta_preview:
        print(f"ID: {seq_id}\nDescription: {description}\nSequence: {sequence}\n")


Previewing FASTA file...
ID: A0A1I4YJU4_9ENTR/160-195
Description: A0A1I4YJU4_9ENTR/160-195 A0A1I4YJU4.1 PF10417.11;1-cysPrx_C;
Sequence: ALQFHEEHGEVCPAQWHKGQEGMGASPEGVAKYLSE

ID: A0A261DC17_9RICK/184-418
Description: A0A261DC17_9RICK/184-418 A0A261DC17.1 PF12574.10;120_Rick_ant;
Sequence: AALVNKSIAKPEELDDLNKFRAYFENEQNKETISGLLKEDQNLKHALEQVEIAGYKNVHTQFAGRFSTMEWKDGGVENANGITIKKQIVRDANGHEIATLSEANHQINPPHTVQKSDGTSVAISNYRTIDFPIKLDNNGPMHLSLAVKDQYGKNIAASNAVYFTAHYDDAGKLIEVSSPHPVKFTGNSPDAVGYIEHGGKIYTLPVTQEKYRSMMQEVAKNLGQGVNISPSIESI

ID: A6LL01_THEM4/23-486
Description: A6LL01_THEM4/23-486 A6LL01.1 PF09847.11;12TM_1;
Sequence: TVKGNFFRQILQYIIGSVPLGLIVYFFTIDLFEKIYNVDPLVARYMYLMWSSMLSLFFVIGFIGLGMYSLSRNEEVELLLTMPISRTVISAYQIFSATISQIYTLSFFIFISLAYFVSTNQNVLLGILKIVLHIWFLISFSSVIAVLIGGRTSKSFTKRFYTIVLLLSVFFYFFIIAMTDVDVSEMENLVKMFIFSTKDYNFLAWSLISNKTLGYSLISSIFLSILFLVISKKVGFEPVQVKRKERYQIAGTGSILKALFKKDLKAAIRYEQFLYFILYPLGFGIFMMFINNQGVSPIFYTIPIFTFYVAFETGILTISEVSKIEVVSTYPITFKKLMMPKLLIPVGLNFLLLLLVFVISLFFNAVSIFLVLSMIFS

### Creating Index Vectors for PFAM IDs

Extract All PFAM IDs:

The extract_pfam_ids function uses a regular expression (r'PF\d{5}') to find all PFxxxxx patterns in the headers.

A set ensures that only unique IDs are retained.

Create Index Vectors:

The create_index_vectors function maps each unique PFAM ID to a sequential index starting from 0.

Save Results:

The save_index_vectors function writes the mappings to a file, with each line formatted as PFxxxxx <tab> index.

### Output

Console Output (Sample):

Extracting all unique PFAM IDs...

Creating index vectors for PFAM IDs...

Saving PFAM index vectors to file...

PFAM index vectors saved to /Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/pfam_index_vectors.txt

Sample of PFAM index vectors:

PF00001: 0

PF00002: 1

PF00003: 2

PF00004: 3

PF00005: 4

PF00006: 5

PF00007: 6

PF00008: 7

PF00009: 8

PF00010: 9




In [7]:
from Bio import SeqIO
import re

# Input FASTA file and destination of the PFAM index table, assigned as a pair.
fasta_path, output_path = (
    "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/final_pfams.fasta",
    "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/pfam_index_vectors.txt",
)

# Function to extract all unique PFAM IDs
def extract_pfam_ids(fasta_file):
    """
    Collect every distinct PFxxxxx accession found in the record
    descriptions of a FASTA file and return them as a sorted list.
    """
    # The set comprehension de-duplicates accessions across all headers.
    accessions = {
        accession
        for record in SeqIO.parse(fasta_file, "fasta")
        for accession in re.findall(r'PF\d{5}', record.description)
    }
    return sorted(accessions)

# Function to create index mappings for PFAM IDs
def create_index_vectors(pfam_ids):
    """
    Assign each PFAM ID its position in ``pfam_ids`` (counting from 0)
    and return the result as a dictionary.
    """
    mapping = {}
    for position, accession in enumerate(pfam_ids):
        mapping[accession] = position
    return mapping

# Function to save the index vectors to a file
def save_index_vectors(index_dict, output_file):
    """
    Write the PFAM index mapping to ``output_file``, one
    ``<PFAM ID><tab><index>`` line per entry, then print a confirmation.
    """
    with open(output_file, "w") as handle:
        handle.writelines(
            f"{pfam_id}\t{index}\n" for pfam_id, index in index_dict.items()
        )
    print(f"PFAM index vectors saved to {output_file}")

# Main Execution
if __name__ == "__main__":
    print("Extracting all unique PFAM IDs...")
    pfam_ids = extract_pfam_ids(fasta_path)

    print("Creating index vectors for PFAM IDs...")
    pfam_index_vectors = create_index_vectors(pfam_ids)

    print("Saving PFAM index vectors to file...")
    save_index_vectors(pfam_index_vectors, output_path)

    print("Sample of PFAM index vectors:")
    # Show only the first 10 entries of the mapping.
    for pfam_id, index in list(pfam_index_vectors.items())[:10]:
        print(f"{pfam_id}: {index}")


Extracting all unique PFAM IDs...
Creating index vectors for PFAM IDs...
Saving PFAM index vectors to file...
PFAM index vectors saved to /Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/pfam_index_vectors.txt
Sample of PFAM index vectors:
PF00001: 0
PF00002: 1
PF00003: 2
PF00004: 3
PF00005: 4
PF00006: 5
PF00007: 6
PF00008: 7
PF00009: 8
PF00010: 9


### Validating Printed Index Vectors

1. Validate Unique Indices:

Collect all indices (index_dict.values()).

Check if the count of indices matches the count of unique indices.

2. Check for Duplicates:

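Collect all PFAM IDs (index_dict.keys()).

Ensure the count of PFAM IDs matches the count of unique PFAM IDs. Note that dictionary keys are unique by construction, so this check cannot fail when index_dict is a regular dict; duplicate PFAM IDs have to be caught before the dictionary is built.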

3. Report Issues:

If duplicates or missing values are found, they are logged for debugging.

### Example Output

1. Validation Passed:

Validation passed: All PFAM IDs are mapped to unique indices.

Validation passed: No duplicate PFAM IDs found.

Overall validation passed: Index vectors are correctly mapped.

2. Validation Failed:

Validation failed: Duplicate indices detected.

Duplicate Indices: {3, 7}

Validation failed: Duplicate PFAM IDs detected.

Duplicate PFAM IDs: {'PF00001', 'PF00007'}

Overall validation failed: Issues found in the index mappings.

In [8]:
def validate_pfam_index_vectors(index_dict):
    """
    Validates the integrity of PFAM index mappings.

    Prints one line per check (plus the offending values when a check
    fails) and a final overall verdict.

    Args:
        index_dict (dict): Dictionary of PFAM IDs and their indices.

    Returns:
        None
    """
    from collections import Counter

    # One counting pass per check instead of calling list.count() for every
    # element, which was quadratic in the size of the mapping.
    index_counts = Counter(index_dict.values())
    duplicate_indices = {index for index, seen in index_counts.items() if seen > 1}

    if not duplicate_indices:
        print("Validation passed: All PFAM IDs are mapped to unique indices.")
    else:
        print("Validation failed: Duplicate indices detected.")
        print(f"Duplicate Indices: {duplicate_indices}")

    # Keys of a regular dict are unique by construction, so this check can
    # only fail for a mapping type whose keys() may repeat values.
    id_counts = Counter(index_dict.keys())
    duplicate_ids = {pfam_id for pfam_id, seen in id_counts.items() if seen > 1}

    if not duplicate_ids:
        print("Validation passed: No duplicate PFAM IDs found.")
    else:
        print("Validation failed: Duplicate PFAM IDs detected.")
        print(f"Duplicate PFAM IDs: {duplicate_ids}")

    # Report overall validation results
    if not duplicate_indices and not duplicate_ids:
        print("Overall validation passed: Index vectors are correctly mapped.")
    else:
        print("Overall validation failed: Issues found in the index mappings.")

# Example usage
if __name__ == "__main__":
    # pfam_index_vectors is the PFAM ID -> index dictionary built earlier in the
    # notebook by create_index_vectors(); that cell must have run before this one.
    print("Validating PFAM index vectors...")
    validate_pfam_index_vectors(pfam_index_vectors)


Validating PFAM index vectors...
Validation passed: All PFAM IDs are mapped to unique indices.
Validation passed: No duplicate PFAM IDs found.
Overall validation passed: Index vectors are correctly mapped.


To properly parse the MiBIG_complete_dataset.txt file, which currently has all data in a single column due to incorrect delimiter handling, we can explicitly specify the correct delimiter when loading the file into a DataFrame.

Steps to Parse the Data Properly
1. Identify the Correct Delimiter: Based on the review, the delimiter separating fields in our file is a comma (,).
2. Load the Data with the Correct Delimiter: Use the sep="," argument in pandas.read_csv() to parse the file correctly.
3. Split the Semicolon-Separated PFAM Domains: Use str.split(";") on the PFAM_Domains column to separate the domains into a list for each row.
4. Normalize PFAM IDs: Ensure the PFAM IDs are correctly extracted and handled.



In [9]:
### Parse and Process the Data

import pandas as pd

# Location of the MiBIG dataset
mibig_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/MiBIG_complete_dataset.txt"

# The file has no header row, so the three column names are supplied here.
column_names = ["Cluster_ID", "Type", "PFAM_Domains"]

# Read the comma-separated file, then turn each semicolon-separated
# PFAM_Domains string into a list of PFAM IDs.
data = pd.read_csv(mibig_path, sep=",", header=None, names=column_names).assign(
    PFAM_Domains=lambda frame: frame["PFAM_Domains"].str.split(";")
)

# Preview the parsed dataset
print("Parsed MiBIG Dataset:", data.head(), sep="\n")


Parsed MiBIG Dataset:
     Cluster_ID        Type                                       PFAM_Domains
0  BGC0000001.1  Polyketide  [PF02353, PF01135, PF01269, PF13489, PF01596, ...
1  BGC0000002.1  Polyketide  [PF00749, PF00201, PF04101, PF13579, PF03033, ...
2  BGC0000003.1  Polyketide  [PF00755, PF08659, PF00107, PF13489, PF10294, ...
3  BGC0000004.1  Polyketide  [PF07690, PF06609, PF00083, PF00975, PF00550, ...
4  BGC0000005.1  Polyketide      [PF00135, PF10340, PF07859, PF12146, PF00975]


Verify Unique PFAM Domains

Count the distinct PFAM IDs that occur anywhere in the dataset.
The same domain can appear in several BGCs, so the IDs are collected into a set and each one is counted once.

In [10]:
# Merge the per-cluster domain lists into one set of distinct PFAM IDs.
# Rows whose PFAM_Domains value is NaN are dropped before merging.
unique_pfams = set().union(*data["PFAM_Domains"].dropna())

print(f"Total unique PFAM IDs: {len(unique_pfams)}")


Total unique PFAM IDs: 3685


In [11]:
# Build the set of distinct PFAM IDs, looking only at rows whose
# PFAM_Domains value is an actual list.
unique_pfams = set()
for domains in data["PFAM_Domains"]:
    if isinstance(domains, list):
        unique_pfams.update(domains)

print(f"Total unique PFAM IDs: {len(unique_pfams)}")


Total unique PFAM IDs: 3685


In [12]:
# pfam_index_vectors must already exist (built from final_pfams.fasta).
indexed_pfams = set(pfam_index_vectors)  # PFAM IDs from final_pfams.fasta
mibig_pfams = unique_pfams  # PFAM IDs from MiBIG dataset

# IDs present in both sources, and MiBIG IDs that have no index yet.
common_pfams = mibig_pfams & indexed_pfams
missing_pfams = mibig_pfams - indexed_pfams

print(
    f"Total PFAM IDs in MiBIG: {len(mibig_pfams)}",
    f"Total PFAM IDs in final_pfams.fasta: {len(indexed_pfams)}",
    f"Common PFAM IDs: {len(common_pfams)}",
    f"Missing PFAM IDs in final_pfams.fasta: {len(missing_pfams)}",
    sep="\n",
)

# List the missing IDs only when there are any.
if missing_pfams:
    print(f"Missing PFAM IDs: {missing_pfams}")


Total PFAM IDs in MiBIG: 3685
Total PFAM IDs in final_pfams.fasta: 19092
Common PFAM IDs: 3685
Missing PFAM IDs in final_pfams.fasta: 0


In [13]:
# Add missing PFAM IDs to pfam_index_vectors
# default=-1 makes numbering start at 0 when the mapping is empty; a bare
# max() over an empty sequence raises ValueError.
next_index = max(pfam_index_vectors.values(), default=-1) + 1  # Start from the next available index
# Iterate in sorted order: a set of strings has no stable iteration order
# across interpreter runs (string hashing is randomized), so looping over the
# set directly could give the same PFAM ID a different index on every run.
for missing_pfam in sorted(missing_pfams):
    pfam_index_vectors[missing_pfam] = next_index
    next_index += 1

print(f"Updated total PFAM indices: {len(pfam_index_vectors)}")


Updated total PFAM indices: 19092


In [14]:
# Path to final_pfams.fasta (re-assigned with the same value used by the
# earlier path-definition cells).
fasta_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/final_pfams.fasta"

# Function to extract and index PFAM IDs
def create_pfam_index(fasta_file):
    """
    Build a PFAM ID -> index mapping from the headers of a FASTA file.

    Headers in this dataset look like
    ``A0A1I4YJU4_9ENTR/160-195 A0A1I4YJU4.1 PF10417.11;1-cysPrx_C;``.
    Splitting such a header on ";" never yields a piece that *starts* with
    "PF" (the first piece starts with the sequence ID), so the previous
    ``startswith("PF")`` filter found nothing and the mapping came out empty.
    The accession is now located with the ``PF\\d{5}`` pattern, matching
    ``extract_pfam_ids``.

    Args:
        fasta_file: Location of the FASTA file, handed to ``SeqIO.parse``.

    Returns:
        dict: PFAM ID (``PFxxxxx``, version suffix dropped) -> index, where
        indices follow the sorted order of the IDs and start at 0.
    """
    pfam_ids = set()
    for record in SeqIO.parse(fasta_file, "fasta"):
        # The pattern stops after five digits, so "PF10417.11" yields "PF10417".
        pfam_ids.update(re.findall(r"PF\d{5}", record.description))
    # Create index mapping
    return {pfam_id: idx for idx, pfam_id in enumerate(sorted(pfam_ids))}

# Recreate pfam_index_vectors
# NOTE(review): the cell that follows rebinds pfam_index_vectors from the saved
# index file, so the mapping produced here is replaced before it is used.
pfam_index_vectors = create_pfam_index(fasta_path)


In [15]:
# Location of the index table written earlier in the notebook
index_file_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/pfam_index_vectors.txt"

# Rebuild the PFAM ID -> index mapping from the "<PFAM ID>\t<index>" lines.
pfam_index_vectors = {}
with open(index_file_path, "r") as index_file:
    pfam_index_vectors.update(
        (accession, int(position))
        for accession, position in (raw.strip().split("\t") for raw in index_file)
    )

print(f"Loaded {len(pfam_index_vectors)} PFAM IDs from {index_file_path}")



Loaded 19092 PFAM IDs from /Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/pfam_index_vectors.txt


In [16]:
# IDs known to the reloaded index versus IDs seen in the MiBIG dataset.
indexed_pfams = set(pfam_index_vectors)  # Reloaded PFAM IDs
mibig_pfams = unique_pfams  # Extracted earlier from MiBIG

# IDs present in both sources, and MiBIG IDs that have no index yet.
common_pfams = mibig_pfams & indexed_pfams
missing_pfams = mibig_pfams - indexed_pfams

# Display results
print(
    f"Total PFAM IDs in MiBIG: {len(mibig_pfams)}",
    f"Total PFAM IDs in final_pfams.fasta: {len(indexed_pfams)}",
    f"Common PFAM IDs: {len(common_pfams)}",
    f"Missing PFAM IDs: {len(missing_pfams)}",
    sep="\n",
)

# Show at most ten of the missing IDs, if there are any.
if missing_pfams:
    print(f"Sample of missing PFAM IDs: {list(missing_pfams)[:10]}")


Total PFAM IDs in MiBIG: 3685
Total PFAM IDs in final_pfams.fasta: 19092
Common PFAM IDs: 3685
Missing PFAM IDs: 0


In [17]:
# Add missing PFAM IDs to the index
# default=-1 makes numbering start at 0 when the mapping is empty; a bare
# max() over an empty sequence raises ValueError.
next_index = max(pfam_index_vectors.values(), default=-1) + 1
# Iterate in sorted order: a set of strings has no stable iteration order
# across interpreter runs (string hashing is randomized), so looping over the
# set directly could give the same PFAM ID a different index on every run.
for missing_pfam in sorted(missing_pfams):
    pfam_index_vectors[missing_pfam] = next_index
    next_index += 1

print(f"Updated total PFAM IDs: {len(pfam_index_vectors)}")


Updated total PFAM IDs: 19092


In [18]:
# Write the (possibly extended) index as "<PFAM ID>\t<index>" lines.
updated_index_file = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/updated_pfam_index_vectors.txt"
with open(updated_index_file, "w") as index_out:
    index_out.writelines(
        f"{pfam_id}\t{index}\n" for pfam_id, index in pfam_index_vectors.items()
    )

print(f"Updated PFAM index vectors saved to {updated_index_file}")


Updated PFAM index vectors saved to /Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/updated_pfam_index_vectors.txt


### Map MiBIG PFAM IDs to Indices

In [19]:
def _as_domain_list(value):
    """Return ``value`` unchanged if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _domains_to_indices(domains):
    """Look up the index of every PFAM ID that has one; unknown IDs are skipped."""
    return [pfam_index_vectors[pfam] for pfam in domains if pfam in pfam_index_vectors]


# Non-list entries (such as NaN) become empty lists so every row can be mapped.
data["PFAM_Domains"] = data["PFAM_Domains"].apply(_as_domain_list)

# New column holding the index of each PFAM ID, in the same order as the IDs.
data["PFAM_Indices"] = data["PFAM_Domains"].apply(_domains_to_indices)

# Preview the updated dataset
print("Dataset with PFAM Indices:", data.head(), sep="\n")


Dataset with PFAM Indices:
     Cluster_ID        Type  \
0  BGC0000001.1  Polyketide   
1  BGC0000002.1  Polyketide   
2  BGC0000003.1  Polyketide   
3  BGC0000004.1  Polyketide   
4  BGC0000005.1  Polyketide   

                                        PFAM_Domains  \
0  [PF02353, PF01135, PF01269, PF13489, PF01596, ...   
1  [PF00749, PF00201, PF04101, PF13579, PF03033, ...   
2  [PF00755, PF08659, PF00107, PF13489, PF10294, ...   
3  [PF07690, PF06609, PF00083, PF00975, PF00550, ...   
4      [PF00135, PF10340, PF07859, PF12146, PF00975]   

                                        PFAM_Indices  
0  [2178, 1082, 1204, 12544, 1509, 12881, 12697, ...  
1  [719, 196, 3788, 12632, 2786, 933, 11773, 7197...  
2  [725, 7927, 104, 12544, 9464, 1149, 12881, 753...  
3  [7035, 6070, 81, 933, 528, 13779, 670, 15177, ...  
4                      [132, 9505, 7197, 11236, 933]  


In [20]:
# Write the processed frame (without the row index) to CSV.
# NOTE(review): PFAM_Domains and PFAM_Indices hold Python lists at this point;
# pandas presumably writes such cells as their string form, e.g.
# "['PF02353', 'PF01135']", with embedded commas — any reader of this file has
# to parse that form back into a list. TODO confirm against the saved file.
processed_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/processed_mibig_with_indices.csv"
data.to_csv(processed_path, index=False)
print(f"Processed MiBIG dataset saved to {processed_path}")


Processed MiBIG dataset saved to /Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/processed_mibig_with_indices.csv


In [21]:
import numpy as np
from scipy.sparse import lil_matrix

import ast
import csv

# Paths to MiBIG and PFAM index files
mibig_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/processed_mibig_with_indices.csv"
pfam_index_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/updated_pfam_index_vectors.txt"

# Load PFAM index vectors (one "<PFAM ID>\t<index>" pair per line)
pfam_index_vectors = {}
with open(pfam_index_path, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines such as a trailing newline
        pfam_id, index = line.split("\t")
        pfam_index_vectors[pfam_id] = int(index)

# Total number of PFAM domains
num_domains = len(pfam_index_vectors)


def _parse_pfam_domains(field):
    """
    Turn one PFAM_Domains cell of the processed CSV into a list of PFAM IDs.

    The column was written from a list-valued DataFrame column, so a cell
    normally looks like "['PF02353', 'PF01135']". A plain
    semicolon-separated string ("PF02353;PF01135") is accepted as well.
    """
    text = (field or "").strip()
    if not text:
        return []
    if text.startswith("["):
        return [str(item) for item in ast.literal_eval(text)]
    return [item for item in text.split(";") if item]


# Read the CSV with a real CSV parser. Splitting lines on "," cuts through the
# quoted list cells (they contain commas), so no PFAM ID ever matched and the
# matrix stayed all-zero; it also treated the header line as data row 0.
with open(mibig_path, "r", newline="") as f:
    cluster_domains = [
        _parse_pfam_domains(row.get("PFAM_Domains"))
        for row in csv.DictReader(f)
    ]

# One matrix row per cluster actually present in the file (no hard-coded count).
num_rows = len(cluster_domains)

# Create sparse matrix for PFAM embeddings
pfam_sparse = lil_matrix((num_rows, num_domains), dtype=np.int8)

# Populate the sparse matrix: 1 where a cluster contains the PFAM domain
for row_idx, domains in enumerate(cluster_domains):
    for pfam in domains:
        if pfam in pfam_index_vectors:
            pfam_sparse[row_idx, pfam_index_vectors[pfam]] = 1

# An all-zero matrix means the PFAM_Domains column was not parsed correctly.
assert num_rows == 0 or pfam_sparse.nnz > 0, "No PFAM domain was matched; check the PFAM_Domains column format"

# Convert to dense format
pfam_dense = pfam_sparse.toarray()
print(f"Shape of PFAM embeddings (pfam_dense): {pfam_dense.shape}")


Shape of PFAM embeddings (pfam_dense): (2024, 19092)


# NOTE(review): esm_embeddings is first assigned by the torch.load cell further
# down, so running this in notebook order on a fresh kernel raises NameError.
# The label also says esm_dense while the value printed is esm_embeddings.shape.
# TODO: move below the ESM loading cell or remove.
print(f"Shape of PFAM embeddings (pfam_dense): {pfam_dense.shape}")
print(f"Shape of ESM embeddings (esm_dense): {esm_embeddings.shape}")


In [22]:
import torch
# Load ESM embeddings
esm_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/esm1b_pfam_embs.pt"
# weights_only=True limits unpickling to tensors and plain containers, which is
# all this file needs (the result is used directly as a tensor below) and
# avoids executing arbitrary pickled code. map_location="cpu" guarantees the
# tensor lives on the CPU, which .numpy() requires.
esm_embeddings = torch.load(esm_path, map_location="cpu", weights_only=True)
esm_dense = esm_embeddings.numpy()

print(f"Shape of ESM embeddings (esm_dense): {esm_dense.shape}")


Shape of ESM embeddings (esm_dense): (19450, 1280)


  esm_embeddings = torch.load(esm_path)


In [23]:
# Debugging shapes
print(f"Shape of PFAM embeddings (pfam_dense): {pfam_dense.shape}")
print(f"Shape of ESM embeddings (esm_dense): {esm_dense.shape}")
print(f"Number of rows in MiBIG dataset: {data.shape[0]}")

# NOTE(review): pfam_dense has one row per line of the processed MiBIG file and
# one column per PFAM index, whereas the recorded run shows esm_dense with
# 19450 rows — presumably one per PFAM entry, TODO confirm. If so, the rows of
# the two matrices describe different things, and zero-padding the shorter one
# only makes the shapes agree; it does not pair a cluster with its embedding.
# Verify the intended row alignment before using the hybrid matrix.
# Check for mismatched sizes
if pfam_dense.shape[0] > esm_dense.shape[0]:
    # Pad ESM embeddings to match PFAM embeddings
    difference = pfam_dense.shape[0] - esm_dense.shape[0]
    print(f"Padding ESM embeddings with {difference} rows to match PFAM embeddings...")
    # Appends `difference` all-zero rows at the bottom; columns are untouched.
    esm_dense = np.pad(esm_dense, ((0, difference), (0, 0)), mode="constant", constant_values=0)

elif esm_dense.shape[0] > pfam_dense.shape[0]:
    # Pad PFAM embeddings to match ESM embeddings
    difference = esm_dense.shape[0] - pfam_dense.shape[0]
    print(f"Padding PFAM embeddings with {difference} rows to match ESM embeddings...")
    # Appends `difference` all-zero rows at the bottom; columns are untouched.
    pfam_dense = np.pad(pfam_dense, ((0, difference), (0, 0)), mode="constant", constant_values=0)

# Validate alignment
# This only checks that the row counts are equal after padding.
assert pfam_dense.shape[0] == esm_dense.shape[0], "PFAM and ESM embeddings still do not align after adjustment!"

# Concatenate PFAM and ESM embeddings
# Column-wise concatenation: the PFAM columns come first, then the ESM columns.
hybrid_embeddings = np.hstack([pfam_dense, esm_dense])

# Output results
print(f"Hybrid embedding shape: {hybrid_embeddings.shape}")

# Save hybrid embeddings
# Rebinds output_path, which earlier cells used for the PFAM index file.
output_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/hybrid_pfam_esm_embeddings.npy"
np.save(output_path, hybrid_embeddings)

print(f"Hybrid embeddings saved to {output_path}")


Shape of PFAM embeddings (pfam_dense): (2024, 19092)
Shape of ESM embeddings (esm_dense): (19450, 1280)
Number of rows in MiBIG dataset: 2024
Padding PFAM embeddings with 17426 rows to match ESM embeddings...
Hybrid embedding shape: (19450, 20372)
Hybrid embeddings saved to /Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/hybrid_pfam_esm_embeddings.npy
