### Steps to Include All PFAM Domains

In [1]:
#Importing packages

import Bio
print(Bio.__version__)
from Bio import SeqIO  # Import SeqIO from the Bio package
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import torch
print(torch.__version__)

1.83
2.5.1


In [2]:
with open("/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/final_pfams.fasta", "r") as f:
    for i in range(10):  # Display the first 10 lines
        print(next(f).strip())


>A0A1I4YJU4_9ENTR/160-195 A0A1I4YJU4.1 PF10417.11;1-cysPrx_C;
ALQFHEEHGEVCPAQWHKGQEGMGASPEGVAKYLSE
>A0A261DC17_9RICK/184-418 A0A261DC17.1 PF12574.10;120_Rick_ant;
AALVNKSIAKPEELDDLNKFRAYFENEQNKETISGLLKEDQNLKHALEQVEIAGYKNVHT
QFAGRFSTMEWKDGGVENANGITIKKQIVRDANGHEIATLSEANHQINPPHTVQKSDGTS
VAISNYRTIDFPIKLDNNGPMHLSLAVKDQYGKNIAASNAVYFTAHYDDAGKLIEVSSPH
PVKFTGNSPDAVGYIEHGGKIYTLPVTQEKYRSMMQEVAKNLGQGVNISPSIESI
>A6LL01_THEM4/23-486 A6LL01.1 PF09847.11;12TM_1;
TVKGNFFRQILQYIIGSVPLGLIVYFFTIDLFEKIYNVDPLVARYMYLMWSSMLSLFFVI
GFIGLGMYSLSRNEEVELLLTMPISRTVISAYQIFSATISQIYTLSFFIFISLAYFVSTN


In [3]:
# Define the path to the uploaded FASTA file
fasta_file_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/final_pfams.fasta"

# Parse the FASTA file
fasta_records = list(SeqIO.parse(fasta_file_path, "fasta"))

# Extract summary information
num_sequences = len(fasta_records)
sequence_lengths = [len(record.seq) for record in fasta_records]

# Display summary statistics
fasta_summary = {
    "Total Sequences": num_sequences,
    "Shortest Sequence Length": min(sequence_lengths),
    "Longest Sequence Length": max(sequence_lengths),
    "Average Sequence Length": sum(sequence_lengths) / num_sequences
}
fasta_summary


{'Total Sequences': 19450,
 'Shortest Sequence Length': 4,
 'Longest Sequence Length': 1818,
 'Average Sequence Length': 153.0196915167095}

In [4]:
# Custom FASTA parser
def parse_fasta(file_path):
    sequences = {}
    with open(file_path, "r") as file:
        header = None
        sequence = []
        for line in file:
            line = line.strip()
            if line.startswith(">"):  # Header line
                if header:
                    sequences[header] = "".join(sequence)
                header = line[1:]  # Remove ">"
                sequence = []
            else:
                sequence.append(line)
        if header:
            sequences[header] = "".join(sequence)  # Add the last sequence
    return sequences

# Parse the provided FASTA file
fasta_data = parse_fasta(fasta_file_path)

# Extract summary statistics
num_sequences = len(fasta_data)
sequence_lengths = [len(seq) for seq in fasta_data.values()]

fasta_summary = {
    "Total Sequences": num_sequences,
    "Shortest Sequence Length": min(sequence_lengths),
    "Longest Sequence Length": max(sequence_lengths),
    "Average Sequence Length": sum(sequence_lengths) / num_sequences
}
fasta_summary


{'Total Sequences': 19450,
 'Shortest Sequence Length': 4,
 'Longest Sequence Length': 1818,
 'Average Sequence Length': 153.0196915167095}

In [5]:
# Extract PFAM IDs from FASTA headers and create index vectors
def create_pfam_index_vectors(fasta_data):
    # Extract PFAM IDs (assuming PFAM ID is part of the header, separated by whitespace or format-specific)
    pfam_ids = [header.split()[0] for header in fasta_data.keys()]  # Adjust this split if the format differs

    # Create a mapping of unique PFAM IDs to indices
    unique_pfam_ids = sorted(set(pfam_ids))
    pfam_to_index = {pfam_id: idx for idx, pfam_id in enumerate(unique_pfam_ids)}

    # Create index vectors for sequences
    index_vectors = [pfam_to_index[pfam_id] for pfam_id in pfam_ids]

    return pfam_to_index, index_vectors

# Generate PFAM index mappings and index vectors
pfam_to_index, index_vectors = create_pfam_index_vectors(fasta_data)

# Summary of results
pfam_index_summary = {
    "Total Unique PFAM IDs": len(pfam_to_index),
    "Total Index Vectors Created": len(index_vectors)
}
pfam_index_summary


{'Total Unique PFAM IDs': 19408, 'Total Index Vectors Created': 19450}

In [7]:
import torch

# Load the esm1b_pfam_embs.pt file
pfam_embeddings_path = "/Users/josephtsenum/Documents/PHA6935_AI_for_Drug_Discovery/Project/esm1b_pfam_embs.pt"  # Replace with the actual path
pfam_embeddings = torch.load(pfam_embeddings_path)

# Inspect the structure of the loaded embeddings
print(f"Type of Data: {type(pfam_embeddings)}")
if isinstance(pfam_embeddings, dict):
    print(f"Number of PFAM IDs: {len(pfam_embeddings)}")
    sample_key = list(pfam_embeddings.keys())[0]
    print(f"Sample Key: {sample_key}")
    print(f"Sample Embedding Shape: {pfam_embeddings[sample_key].shape}")


Type of Data: <class 'torch.Tensor'>


  pfam_embeddings = torch.load(pfam_embeddings_path)


In [8]:
# Inspect the shape and sample data from the tensor
print(f"Tensor Shape: {pfam_embeddings.shape}")

# View a sample embedding (e.g., the first row)
print(f"Sample Embedding (First Row): {pfam_embeddings[0][:10]}")  # Print first 10 dimensions


Tensor Shape: torch.Size([19450, 1280])
Sample Embedding (First Row): tensor([ 0.1017,  0.1174, -0.0780,  0.1481, -0.1972, -0.0667, -0.0279, -0.0513,
        -0.0196,  0.0401])


In [10]:
# Print the column names to verify the correct one
print(mibig_data.columns)


Index(['BGC0000001.1,Polyketide,PF02353;PF01135;PF01269;PF13489;PF01596;PF13847;PF13649;PF08241;PF00486;PF03704;PF00067;PF00196;PF13424;PF14559;PF13401;PF13191;PF00486;PF03704;PF13428;PF13424;PF07719;PF00515;PF13176;PF13432;PF14559;PF05593;PF00108;PF08545;PF08541;PF00550;PF00198;PF06500;PF12697;PF16197;PF00698;PF02801;PF14765;PF01370;PF02719;PF03435;PF00550;PF00109;PF08659;PF13561;PF00108;PF00106;PF08990;PF00109;PF16197;PF00698;PF00550;PF00108;PF02801;PF14765;PF08240;PF00107;PF13602;PF08659;PF00106;PF01370;PF08990;PF00109;PF00108;PF02801;PF16197;PF00698;PF00550;PF14246;PF00440;PF07690;PF00083;PF03209;PF00296;PF00496;PF00528;PF00528;PF13555;PF13191;PF13401;PF02463;PF13671;PF13304;PF00005;PF13481;PF08352;PF00067;PF06902;PF13459;PF13370;PF03358;PF02525;PF00561;PF00975;PF12697;PF12146;PF07859;PF13602;PF00107;PF00440'], dtype='object')


In [13]:
# Convert the column to strings and parse PFAM IDs
pfam_column = mibig_data.iloc[:, 0].astype(str)  # Assuming PFAM data is in the first column
pfam_strings = pfam_column.str.split(",", expand=True)[2]  # Extract third part (PFAM IDs)
pfam_strings = pfam_strings.str.split(";")  # Split PFAM IDs by semicolon into lists

# Display a sample of parsed PFAM strings
print(pfam_strings.head())


0    [PF00749, PF00201, PF04101, PF13579, PF03033, ...
1    [PF00755, PF08659, PF00107, PF13489, PF10294, ...
2    [PF07690, PF06609, PF00083, PF00975, PF00550, ...
3        [PF00135, PF10340, PF07859, PF12146, PF00975]
4    [PF07690, PF06609, PF00083, PF00975, PF12697, ...
Name: 2, dtype: object


In [14]:
import numpy as np

def create_row_embeddings(pfam_strings, pfam_to_index, pfam_embeddings):
    row_embeddings = []  # List to store embeddings for each row

    for pfam_ids in pfam_strings:
        # Initialize list for embeddings
        embeddings = []
        for pfam_id in pfam_ids:
            if pfam_id in pfam_to_index:
                # Get the index for this PFAM ID
                idx = pfam_to_index[pfam_id]
                embeddings.append(pfam_embeddings[idx].numpy())
            else:
                # Handle unknown PFAM IDs with a zero vector
                embeddings.append(np.zeros(1280))  # Shape (1280,)
        
        # Stack embeddings for this row
        row_embedding = np.stack(embeddings)  # Shape (len(PFAM_ids), 1280)
        row_embeddings.append(row_embedding)

    return row_embeddings

# Generate row embeddings
row_embeddings = create_row_embeddings(pfam_strings, pfam_to_index, pfam_embeddings)

# Example row embedding shape
print(f"Example Row Embedding Shape: {row_embeddings[0].shape}")


Example Row Embedding Shape: (217, 1280)


In [15]:
# Print the shape of embeddings for the first few rows
for i in range(5):  # Adjust range as needed
    print(f"Row {i + 1} Embedding Shape: {row_embeddings[i].shape}")


Row 1 Embedding Shape: (217, 1280)
Row 2 Embedding Shape: (26, 1280)
Row 3 Embedding Shape: (71, 1280)
Row 4 Embedding Shape: (5, 1280)
Row 5 Embedding Shape: (72, 1280)


In [16]:
# Inspect the embeddings of the first row
print(f"Row 1 Embedding Sample:\n{row_embeddings[0][:5]}")  # First 5 PFAM embeddings


Row 1 Embedding Sample:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [19]:
# Get the PFAM IDs for the first row
first_row_pfams = pfam_strings[0]
print(f"First Row PFAM IDs: {first_row_pfams}")


First Row PFAM IDs: ['PF00749', 'PF00201', 'PF04101', 'PF13579', 'PF03033', 'PF00975', 'PF12697', 'PF07859', 'PF00550', 'PF08659', 'PF00106', 'PF02719', 'PF00698', 'PF16197', 'PF00108', 'PF13561', 'PF01370', 'PF16363', 'PF14765', 'PF02801', 'PF00109', 'PF08990', 'PF08659', 'PF00106', 'PF13561', 'PF01370', 'PF13460', 'PF16363', 'PF04321', 'PF00698', 'PF16197', 'PF02801', 'PF00109', 'PF00550', 'PF00108', 'PF08990', 'PF00550', 'PF00106', 'PF01073', 'PF13602', 'PF00107', 'PF08240', 'PF00108', 'PF08659', 'PF04321', 'PF01370', 'PF16363', 'PF00698', 'PF14765', 'PF02719', 'PF02801', 'PF00109', 'PF05368', 'PF16197', 'PF08990', 'PF00550', 'PF08659', 'PF00106', 'PF16363', 'PF02719', 'PF00698', 'PF02801', 'PF00109', 'PF00108', 'PF01370', 'PF13561', 'PF03435', 'PF16197', 'PF00550', 'PF08659', 'PF00106', 'PF13561', 'PF00698', 'PF16197', 'PF02801', 'PF00109', 'PF01073', 'PF01370', 'PF02719', 'PF03435', 'PF16363', 'PF14765', 'PF00108', 'PF08990', 'PF00550', 'PF08659', 'PF00106', 'PF13561', 'PF01370', 

  first_row_pfams = pfam_strings[0]


In [20]:
# Check if the PFAM IDs exist in the mapping
missing_pfams = [pfam_id for pfam_id in first_row_pfams if pfam_id not in pfam_to_index]
print(f"Missing PFAM IDs in First Row: {missing_pfams}")



Missing PFAM IDs in First Row: ['PF00749', 'PF00201', 'PF04101', 'PF13579', 'PF03033', 'PF00975', 'PF12697', 'PF07859', 'PF00550', 'PF08659', 'PF00106', 'PF02719', 'PF00698', 'PF16197', 'PF00108', 'PF13561', 'PF01370', 'PF16363', 'PF14765', 'PF02801', 'PF00109', 'PF08990', 'PF08659', 'PF00106', 'PF13561', 'PF01370', 'PF13460', 'PF16363', 'PF04321', 'PF00698', 'PF16197', 'PF02801', 'PF00109', 'PF00550', 'PF00108', 'PF08990', 'PF00550', 'PF00106', 'PF01073', 'PF13602', 'PF00107', 'PF08240', 'PF00108', 'PF08659', 'PF04321', 'PF01370', 'PF16363', 'PF00698', 'PF14765', 'PF02719', 'PF02801', 'PF00109', 'PF05368', 'PF16197', 'PF08990', 'PF00550', 'PF08659', 'PF00106', 'PF16363', 'PF02719', 'PF00698', 'PF02801', 'PF00109', 'PF00108', 'PF01370', 'PF13561', 'PF03435', 'PF16197', 'PF00550', 'PF08659', 'PF00106', 'PF13561', 'PF00698', 'PF16197', 'PF02801', 'PF00109', 'PF01073', 'PF01370', 'PF02719', 'PF03435', 'PF16363', 'PF14765', 'PF00108', 'PF08990', 'PF00550', 'PF08659', 'PF00106', 'PF13561', 

In [21]:
# Print some keys from pfam_to_index
print(f"Sample Keys from pfam_to_index: {list(pfam_to_index.keys())[:10]}")


Sample Keys from pfam_to_index: ['1105L_ASFB7/10-82', '1A_PZSVT/429-597', '2B_PEBV/1-117', '2B_TAV/7-91', '3601L_ASFB7/98-310', '5054R_ASFB7/87-280', 'A0A010R8G7_9PEZI/247-784', 'A0A010RL20_9PEZI/12-130', 'A0A010RUI5_9PEZI/166-439', 'A0A010RXA2_9PEZI/404-628']


In [22]:
# Check if the PFAM IDs exist in the mapping
missing_pfams = [pfam_id for pfam_id in first_row_pfams if pfam_id not in pfam_to_index]
print(f"Missing PFAM IDs in First Row: {missing_pfams}")
print(f"Number of Missing PFAM IDs: {len(missing_pfams)}")


Missing PFAM IDs in First Row: ['PF00749', 'PF00201', 'PF04101', 'PF13579', 'PF03033', 'PF00975', 'PF12697', 'PF07859', 'PF00550', 'PF08659', 'PF00106', 'PF02719', 'PF00698', 'PF16197', 'PF00108', 'PF13561', 'PF01370', 'PF16363', 'PF14765', 'PF02801', 'PF00109', 'PF08990', 'PF08659', 'PF00106', 'PF13561', 'PF01370', 'PF13460', 'PF16363', 'PF04321', 'PF00698', 'PF16197', 'PF02801', 'PF00109', 'PF00550', 'PF00108', 'PF08990', 'PF00550', 'PF00106', 'PF01073', 'PF13602', 'PF00107', 'PF08240', 'PF00108', 'PF08659', 'PF04321', 'PF01370', 'PF16363', 'PF00698', 'PF14765', 'PF02719', 'PF02801', 'PF00109', 'PF05368', 'PF16197', 'PF08990', 'PF00550', 'PF08659', 'PF00106', 'PF16363', 'PF02719', 'PF00698', 'PF02801', 'PF00109', 'PF00108', 'PF01370', 'PF13561', 'PF03435', 'PF16197', 'PF00550', 'PF08659', 'PF00106', 'PF13561', 'PF00698', 'PF16197', 'PF02801', 'PF00109', 'PF01073', 'PF01370', 'PF02719', 'PF03435', 'PF16363', 'PF14765', 'PF00108', 'PF08990', 'PF00550', 'PF08659', 'PF00106', 'PF13561', 

In [23]:
# Print some keys from the pfam_to_index mapping
print(f"Sample Keys from pfam_to_index: {list(pfam_to_index.keys())[:10]}")


Sample Keys from pfam_to_index: ['1105L_ASFB7/10-82', '1A_PZSVT/429-597', '2B_PEBV/1-117', '2B_TAV/7-91', '3601L_ASFB7/98-310', '5054R_ASFB7/87-280', 'A0A010R8G7_9PEZI/247-784', 'A0A010RL20_9PEZI/12-130', 'A0A010RUI5_9PEZI/166-439', 'A0A010RXA2_9PEZI/404-628']


In [24]:
# Normalize PFAM IDs in pfam_strings and mapping
pfam_strings = pfam_strings.apply(lambda x: [pfam_id.strip().upper() for pfam_id in x])
pfam_to_index = {pfam_id.strip().upper(): idx for pfam_id, idx in pfam_to_index.items()}


In [25]:
for missing_id in missing_pfams:
    pfam_to_index[missing_id] = len(pfam_to_index)  # Assign a new index
    pfam_embeddings = torch.cat([pfam_embeddings, torch.zeros(1, 1280)])  # Add a zero vector


In [26]:
# Normalize PFAM IDs in the dataset and the mapping
pfam_strings = pfam_strings.apply(lambda x: [pfam_id.strip().upper() for pfam_id in x])
pfam_to_index = {pfam_id.strip().upper(): idx for pfam_id, idx in pfam_to_index.items()}


In [27]:
# Check again for missing PFAM IDs after normalization
missing_pfams = [pfam_id for pfam_id in first_row_pfams if pfam_id not in pfam_to_index]
print(f"Missing PFAM IDs after normalization: {missing_pfams}")
print(f"Number of Missing PFAM IDs: {len(missing_pfams)}")


Missing PFAM IDs after normalization: []
Number of Missing PFAM IDs: 0


In [28]:
# Recreate embeddings for the first row
row_embeddings = create_row_embeddings(pfam_strings, pfam_to_index, pfam_embeddings)

# Inspect the first row embeddings
print(f"Row 1 Embedding Shape: {row_embeddings[0].shape}")
print(f"Row 1 Embedding Sample:\n{row_embeddings[0][:5]}")  # First 5 embeddings



Row 1 Embedding Shape: (217, 1280)
Row 1 Embedding Sample:
[[-0.13371986  0.19606242 -0.08810301 ...  0.00279715  0.03579494
   0.12748712]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [29]:
import re

# Extract PFAM IDs from the keys
new_pfam_to_index = {}
for key, idx in pfam_to_index.items():
    match = re.search(r"(PF\d+)", key)  # Look for PFAM ID pattern
    if match:
        pfam_id = match.group(1)  # Extract PFAM ID
        new_pfam_to_index[pfam_id] = idx

# Print a sample of the new mapping
print(f"Sample Keys from new_pfam_to_index: {list(new_pfam_to_index.keys())[:10]}")


Sample Keys from new_pfam_to_index: ['PF2', 'PF4', 'PF8', 'PF1', 'PF0', 'PF6', 'PF9', 'PF31', 'PF7', 'PF77']


In [30]:
# Recheck missing PFAM IDs
missing_pfams = [pfam_id for pfam_id in first_row_pfams if pfam_id not in new_pfam_to_index]
print(f"Missing PFAM IDs after updating mapping: {missing_pfams}")
print(f"Number of Missing PFAM IDs: {len(missing_pfams)}")


Missing PFAM IDs after updating mapping: []
Number of Missing PFAM IDs: 0


In [31]:
# Generate row embeddings using the updated mapping
row_embeddings = create_row_embeddings(pfam_strings, new_pfam_to_index, pfam_embeddings)

# Inspect the first row embeddings
print(f"Row 1 Embedding Shape: {row_embeddings[0].shape}")
print(f"Row 1 Embedding Sample:\n{row_embeddings[0][:5]}")  # First 5 embeddings


Row 1 Embedding Shape: (217, 1280)
Row 1 Embedding Sample:
[[-0.13371986  0.19606242 -0.08810301 ...  0.00279715  0.03579494
   0.12748712]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [32]:
import re

# Extract PFAM IDs from the keys (if they exist)
extracted_pfam_to_index = {}
for key, idx in pfam_to_index.items():
    match = re.search(r"(PF\d+)", key)  # Look for PFAM ID patterns like PF00749
    if match:
        pfam_id = match.group(1)
        extracted_pfam_to_index[pfam_id] = idx

# Print a sample of the new extracted mapping
print(f"Sample Keys from extracted_pfam_to_index: {list(extracted_pfam_to_index.keys())[:10]}")


Sample Keys from extracted_pfam_to_index: ['PF2', 'PF4', 'PF8', 'PF1', 'PF0', 'PF6', 'PF9', 'PF31', 'PF7', 'PF77']


In [33]:
# Check for missing PFAM IDs in the new mapping
missing_pfams = [pfam_id for pfam_id in first_row_pfams if pfam_id not in extracted_pfam_to_index]
print(f"Missing PFAM IDs after extraction: {missing_pfams}")
print(f"Number of Missing PFAM IDs: {len(missing_pfams)}")


Missing PFAM IDs after extraction: []
Number of Missing PFAM IDs: 0


In [34]:
# Generate row embeddings using the updated mapping
row_embeddings = create_row_embeddings(pfam_strings, extracted_pfam_to_index, pfam_embeddings)

# Inspect the first row embeddings
print(f"Row 1 Embedding Shape: {row_embeddings[0].shape}")
print(f"Row 1 Embedding Sample:\n{row_embeddings[0][:5]}")  # First 5 embeddings


Row 1 Embedding Shape: (217, 1280)
Row 1 Embedding Sample:
[[-0.13371986  0.19606242 -0.08810301 ...  0.00279715  0.03579494
   0.12748712]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [39]:
import re
import numpy as np

def create_row_embeddings_with_locations(pfam_strings, pfam_to_index, pfam_embeddings):
    row_embeddings = []  # List to store embeddings for each row

    for pfam_string in pfam_strings:
        # Ensure pfam_string is a semicolon-separated string
        if isinstance(pfam_string, list):  # If pfam_string is a list, join it
            pfam_string = ";".join(pfam_string)

        # Split the PFAM_string into individual PFAM IDs with locations
        pfam_entries = pfam_string.split(";")  # Assuming PFAM IDs and locations are semicolon-separated

        embeddings = []
        for entry in pfam_entries:
            # Extract PFAM ID and domain location (if available)
            match = re.match(r"(?P<pfam_id>PF\d+)(/(?P<location>\d+-\d+))?", entry)
            if match:
                pfam_id = match.group("pfam_id")
                location = match.group("location")  # Optional domain location

                # Construct the key with domain location if available
                key = f"{pfam_id}/{location}" if location else pfam_id

                # Check if the key exists in pfam_to_index
                if key in pfam_to_index:
                    idx = pfam_to_index[key]
                    embeddings.append(pfam_embeddings[idx].numpy())
                elif pfam_id in pfam_to_index:  # Fallback to PFAM ID without location
                    idx = pfam_to_index[pfam_id]
                    embeddings.append(pfam_embeddings[idx].numpy())
                else:
                    # Handle missing keys with zero vector
                    embeddings.append(np.zeros(1280))  # Shape (1280,)

        # Stack embeddings for this row
        row_embedding = np.stack(embeddings) if embeddings else np.zeros((0, 1280))  # Shape (len(PFAM_ids), 1280)
        row_embeddings.append(row_embedding)

    return row_embeddings

# Generate row embeddings with domain location
row_embeddings_with_locations = create_row_embeddings_with_locations(pfam_strings, pfam_to_index, pfam_embeddings)

# Inspect the first row
print(f"Row 1 Embedding Shape: {row_embeddings_with_locations[0].shape}")
print(f"Row 1 Embedding Sample:\n{row_embeddings_with_locations[0][:5]}")


Row 1 Embedding Shape: (217, 1280)
Row 1 Embedding Sample:
[[-0.13371986  0.19606242 -0.08810301 ...  0.00279715  0.03579494
   0.12748712]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [42]:
import pickle

# Save the embeddings using pickle
with open("row_embeddings_with_locations.pkl", "wb") as f:
    pickle.dump(row_embeddings_with_locations, f)

print("Row embeddings saved successfully to 'row_embeddings_with_locations.pkl'.")


Row embeddings saved successfully to 'row_embeddings_with_locations.pkl'.


In [45]:
# Reload embeddings
with open("row_embeddings_with_locations.pkl", "rb") as f:
    row_embeddings_with_locations = pickle.load(f)


In [47]:
import pandas as pd

# Assuming row_embeddings_with_locations is loaded
# Convert Row 1 embeddings to a DataFrame
row_1_embeddings_df = pd.DataFrame(row_embeddings_with_locations[0])

# Display the DataFrame
print(row_1_embeddings_df)


        0         1         2         3         4         5         6     \
0   -0.13372  0.196062 -0.088103 -0.003177  0.059648  0.059623 -0.231602   
1    0.00000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2    0.00000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3    0.00000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4    0.00000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
..       ...       ...       ...       ...       ...       ...       ...   
212  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
213  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
214  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
215  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
216  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

         7         8         9     ...      1270      1271      1272  \
0   -0.008725 -