In [1]:
!which python

/nfs/production/literature/santosh_tirunagari/transformers_env/bin/python


In [4]:
path_floret_model = '/nfs/production/literature/santosh_tirunagari/BACKUP/work/github/source_data/floret_embeddings/en_floret_model'

In [3]:
# ! python -m pip install floret 'spacy~=3.4.0' pandas --quiet

In [4]:
import spacy

# This is the spaCy pipeline with floret vectors
nlp_fl = spacy.load(path_floret_model)

In [5]:
word_1 = nlp_fl.vocab["sars"]
word_2 = nlp_fl.vocab["sars-cove-2"]

word_1.similarity(word_2)

0.43325987458229065

In [78]:
tokens = nlp_fl("sars cov 2")
    
print(tokens.text, tokens.vector[:3], tokens.vector_norm) # Only the first three components of the vector 
    
for token in tokens:
    print(token.text, token.vector[:3], token.vector_norm)

sars cov 2 [-3.5737886  2.3306377  4.0841546] 55.919779433640024
sars [ 2.3757248  -1.0899751   0.76489997] 42.306488
cov [-1.0599601  1.1186376  3.3393645] 57.941483
2 [-12.03713   6.96325   8.1482 ] 152.65103


In [10]:
# !pip install pronto

You should consider upgrading via the '/nfs/production/literature/santosh_tirunagari/transformers_env/bin/python -m pip install --upgrade pip' command.[0m


In [36]:
import csv
import pickle
import numpy as np
from pronto import Ontology
import spacy
import faiss
from tqdm import tqdm
import warnings
import gc
import re
import pandas

warnings.simplefilter("ignore")

# Load the spaCy model
nlp = spacy.load(path_floret_model)


def create_quantized_index(embeddings_np, d):
    """Create a trained IVFPQ index."""
    nlist = 1000
    m = 30
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
    index.train(embeddings_np)
    return index

#
def get_average_embeddings_batched(terms):
    """Return average embeddings for terms."""
    docs = list(nlp.pipe(terms))
    embeddings = []

    for doc in docs:
        # Filtering out tokens without vectors or with unexpected vector sizes
        valid_vectors = [token.vector for token in doc if token.has_vector and token.vector_norm != 0 and token.vector.shape[0] == 300]

        # If no valid vectors, append a zero vector
        if len(valid_vectors) == 0:
            embeddings.append(np.zeros((300,)))
        else:
            average_embedding = np.mean(valid_vectors, axis=0)
            embeddings.append(average_embedding)

    return embeddings

In [62]:
# Filenames

path__ = "/nfs/production/literature/santosh_tirunagari/BACKUP/"
INPUT_FILENAME = path__+"work/github/source_data/knowledge_base/bao/BAO.csv"
OUTPUT_PICKLE_FILENAME = path__+"/work/github/CAPITAL/normalisation/dictionary/bao.pkl"
OUTPUT_LIST = path__+"work/github/CAPITAL/normalisation/dictionary/bao_list.txt"
FAISS_INDEX_FILENAME = path__+"work/github/CAPITAL/normalisation/dictionary/bao_terms.index"
# OUTPUT_INDEXED_TERMS_FILENAME = path__+"work/github/ML_annotations/normalisation/dictionary/bao_indexed_terms.pkl"


In [63]:
def process_column_content(s):
    """Clean and strip unwanted characters."""
    return re.sub(r'\(.*?\)|\".*?\"|\[.*?\]', '', s).strip().lower()


df = pd.read_csv(input_filename, usecols=['Class ID', 'Preferred Label', 'Synonyms', 'Definitions', 'alternative term'], 
                 sep=',', engine='python', on_bad_lines='skip')


term_to_id = {}
embeddings = []  
indexed_terms = []

In [64]:
# Each Class ID's variations will be treated as separate entries
print("Loading ontology...")

flattened_data = []
for _, row in df.iterrows():
    term_id = row['Class ID']
    for col in ['Preferred Label', 'Synonyms', 'Definitions', 'alternative term']:
        term_name = row[col]
        if pd.notnull(term_name):  # Check if the term_name is not NaN
            flattened_data.append((term_id, term_name))

# Convert flattened data to a DataFrame for easier manipulation
flattened_df = pd.DataFrame(flattened_data, columns=['Class ID', 'Term Name'])


Loading ontology...


In [65]:
flattened_df

Unnamed: 0,Class ID,Term Name
0,http://purl.obolibrary.org/obo/CHEBI_50444,adenosine phosphodiesterase inhibitor
1,http://purl.obolibrary.org/obo/CHEBI_131787,dopamine receptor D2 antagonist
2,http://purl.obolibrary.org/obo/CHEBI_131787,D2R antagonist|D2 receptor antagonist
3,http://purl.obolibrary.org/obo/CHEBI_131789,RUNX1 inhibitor
4,http://purl.obolibrary.org/obo/CHEBI_131789,acute myeloid leukemia 1 protein inhibitors|co...
...,...,...
16050,http://purl.obolibrary.org/obo/CHEBI_77962,food antioxidant
16051,http://purl.obolibrary.org/obo/CHEBI_77962,food antioxidants
16052,http://purl.obolibrary.org/obo/DOID_3953,adrenal gland cancer
16053,http://purl.obolibrary.org/obo/DOID_3953,neoplasm of adrenal gland|adrenal neoplasm|adr...


In [66]:
BATCH_SIZE = 100
term_batches = []
id_batches = []
current_batch_terms = []
current_batch_ids = []

for _, row in tqdm(flattened_df.iterrows(), total=flattened_df.shape[0], desc="Processing terms"):
    term_id = row['Class ID']
    term_name = row['Term Name']

    # Assuming process_column_content is a function you've defined to process the term_name
    term_name = process_column_content(term_name)

    # Check for empty or single character terms and skip them
    if not term_name or len(term_name) <= 1:
        continue

    current_batch_terms.append(term_name)
    current_batch_ids.append(term_id)

    if len(current_batch_terms) == BATCH_SIZE:
        term_batches.append(current_batch_terms)
        id_batches.append(current_batch_ids)
        current_batch_terms = []
        current_batch_ids = []

# Catch any remaining terms not added to a batch
if current_batch_terms:
    term_batches.append(current_batch_terms)
    id_batches.append(current_batch_ids)

for term_batch, id_batch in tqdm(zip(term_batches, id_batches), total=len(term_batches),
                                 desc="Generating Embeddings"):
    batch_embeddings = get_average_embeddings_batched(term_batch)

    for term, term_id, embedding in zip(term_batch, id_batch, batch_embeddings):
        norm = np.linalg.norm(embedding)

        # Check if the embedding is a zero vector
        if norm == 0:
            print(f"Term '{term}' with ID '{term_id}' has a zero vector.")

        # Normalizing the vector
        normalized_embedding = embedding if norm == 0 else embedding / norm
        embeddings.append(normalized_embedding)
        term_to_id[term] = term_id
        indexed_terms.append(term)

        # Clear out the current batch to free up memory
    del term_batch, id_batch, batch_embeddings
    gc.collect()

d = 300
embeddings_np = np.array(embeddings).astype('float32')
index = create_quantized_index(embeddings_np, d)
index.add(embeddings_np)

# Free up memory after using embeddings_np
del embeddings, embeddings_np
gc.collect()

print("Saving quantized faiss index...")
faiss.write_index(index, FAISS_INDEX_FILENAME)

# print("Saving term to ID mapping...")
# with open(OUTPUT_PICKLE_FILENAME, "wb") as outfile:
#     pickle.dump(term_to_id, outfile)

print("Saving term to ID mapping and indexed terms...")
with open(OUTPUT_PICKLE_FILENAME, "wb") as outfile:
    pickle.dump({"term_to_id": term_to_id, "indexed_terms": indexed_terms}, outfile)


print("Writing terms to a txt file...")
with open(OUTPUT_LIST, "w") as txt_file:
    for term in term_to_id.keys():
        txt_file.write(term + "\n")

Processing terms: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16055/16055 [00:01<00:00, 14756.85it/s]
Generating Embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 161/161 [01:06<00:00,  2.41it/s]


Saving quantized faiss index...
Saving term to ID mapping and indexed terms...
Writing terms to a txt file...


# TESTING

In [54]:
import numpy as np

def get_average_embedding(term):
    tokens = term.split()
    # Get embeddings for each token
    embeddings = [nlp_fl.vocab[token].vector for token in tokens if token in nlp_fl.vocab]
    # Compute the average embedding
    average_embedding = np.mean(embeddings, axis=0)
    return average_embedding


word_1 = nlp.vocab["cyclothymic disorder"]
word_2 = nlp.vocab["Cyclothymic personality"]
word_4 = nlp.vocab["Affective personality disorder"]


word_1.similarity(word_2), word_1.similarity(word_4)

(0.5980432629585266, 0.605099081993103)

In [57]:
def get_average_embedding(term):
    tokens = term.split()
    embeddings = [nlp.vocab[token].vector for token in tokens if token in nlp.vocab]
    average_embedding = np.mean(embeddings, axis=0)
    return average_embedding

def cosine_similarity(vec1, vec2):
    # Compute cosine similarity between two vectors
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

word_1_vec = nlp.vocab["cyclothymic disorder"].vector
word_2_vec = nlp.vocab["Cyclothymic personality"].vector
word_3_vec = get_average_embedding("Affective personality disorder")
# For word_4, we get the vector of the entire phrase
word_4_doc = nlp("Affective personality disorder")
word_4_vec = word_4_doc.vector

print(cosine_similarity(word_1_vec, word_2_vec))
print(cosine_similarity(word_1_vec, word_3_vec))
print(cosine_similarity(word_1_vec, word_4_vec))

0.5980433
0.69048494
0.67023414


In [58]:
word_2_vec.shape

(300,)

In [67]:
import faiss
import pickle
import spacy
import numpy as np
from fuzzywuzzy import fuzz

# Load spaCy model
nlp = spacy.load(path_floret_model)

# def get_average_embedding(term):
#     """Get the average word embedding for a term."""
#     tokens = term.split()
#     valid_vectors = [nlp.vocab[token].vector for token in tokens if
#                      nlp.vocab[token].has_vector and nlp.vocab[token].vector.shape[0] == 300]

#     if len(valid_vectors) == 0:
#         return np.zeros((300,))

#     average_embedding = np.mean(valid_vectors, axis=0)
#     return average_embedding


def get_average_embeddings_batched(terms):
    """Return average embeddings for terms."""
    docs = list(nlp.pipe(terms))
    embeddings = []

    for doc in docs:
        # Filtering out tokens without vectors or with unexpected vector sizes
        valid_vectors = [token.vector for token in doc if token.has_vector and token.vector_norm != 0 and token.vector.shape[0] == 300]

        # If no valid vectors, append a zero vector
        if len(valid_vectors) == 0:
            embeddings.append(np.zeros((300,)))
        else:
            average_embedding = np.mean(valid_vectors, axis=0)
            embeddings.append(average_embedding)

    return embeddings

# Load the term to ID mapping and indexed terms
with open(OUTPUT_PICKLE_FILENAME, "rb") as infile:
    data = pickle.load(infile)
    term_to_id = data["term_to_id"]
    indexed_terms = data["indexed_terms"]

# Load the FAISS index
index = faiss.read_index(FAISS_INDEX_FILENAME)

def retrieve_similar_terms(query, k=5):
    """Retrieve top k similar terms given a query."""
    # Convert query to lowercase
    query = query.lower()
    
    # Get average embedding of the query
    query_embedding = get_average_embeddings_batched([query])
    
    norm = np.linalg.norm(query_embedding)
    query_embedding = query_embedding if norm == 0 else query_embedding / norm
    query_embedding = query_embedding.reshape(1, -1).astype('float32')

    # Search the index
    D, I = index.search(query_embedding, k)
    
    similar_terms = []
    for i in range(k):
        term = indexed_terms[I[0][i]]
        score = D[0][i]
        term_id = term_to_id[term]
        similar_terms.append((term, term_id, score))
    
    return similar_terms


def retrieve_similar_terms_with_fuzzy(query, k):
    """Retrieve k terms similar to the query."""
    query = query.lower()
    
    # Get average embedding of the query
    query_embedding = get_average_embeddings_batched([query])
    
    norm = np.linalg.norm(query_embedding)
    query_embedding = query_embedding if norm == 0 else query_embedding / norm
    query_embedding = query_embedding.reshape(1, -1).astype('float32')

    # Search the index
    D, I = index.search(query_embedding, k)
    
    # Retrieve the terms from the indexed_terms list
    candidate_terms = [indexed_terms[i] for i in I[0]]
    
    # Get fuzzy matching scores for these terms
    scores = [fuzz.ratio(query, term) for term in candidate_terms]
    
    # Pair up terms with their scores
    term_score_pairs = list(zip(candidate_terms, scores))
    
    # Rank these pairs based on scores
    ranked_term_score_pairs = sorted(term_score_pairs, key=lambda x: x[1], reverse=True)
    
    return ranked_term_score_pairs[:k]


In [93]:
# Example usage
query = "Kinomescan"# "nucleosome"
results = retrieve_similar_terms(query, 5)

for term, term_id, score in results:
    print(f"Term: {term}, ID: {term_id}, Score: {score}")

Term: wgs|whole genome sequencing|wgs |whole genome sequencing, ID: http://purl.obolibrary.org/obo/NCIT_C101294, Score: 0.7766202092170715
Term: whole genome sequencing, ID: http://purl.obolibrary.org/obo/NCIT_C101294, Score: 0.7983897924423218
Term: treefinder phylogenetic analysis, ID: http://www.bioassayontology.org/bao#BAO_0002209, Score: 0.8390119671821594
Term: computational phylogenetic analysis, ID: http://www.bioassayontology.org/bao#BAO_0002206, Score: 0.8426339626312256
Term: diverset, ID: http://www.bioassayontology.org/bao#BAO_0000735, Score: 0.8775628805160522


In [88]:
# For demonstration:
# query = "thermal shift " #"nucleosome"
results = retrieve_similar_terms_with_fuzzy(query, 10)
for term, score in results:
    print(f"Term: {term}, Score: {score}")

Term: thermal shift, Score: 96
Term: transmittance, Score: 44
Term: electrical current density, Score: 40
Term: solid state laser, Score: 32
Term: excitation filter, Score: 32
Term: light source, Score: 31
Term: emission wavelength, Score: 30
Term: resonant waveguide grating, Score: 25
Term: excitation wavelength, Score: 23
Term: impedance, Score: 17


In [148]:
import faiss
import pickle
import spacy
import numpy as np
from fuzzywuzzy import fuzz

# Load spaCy model
nlp = spacy.load(path_floret_model)


def get_average_embeddings_batched(terms):
    """Return average embeddings for terms."""
    docs = list(nlp.pipe(terms))
    embeddings = []

    for doc in docs:
        # Filtering out tokens without vectors or with unexpected vector sizes
        valid_vectors = [token.vector for token in doc if token.has_vector and token.vector_norm != 0 and token.vector.shape[0] == 300]

        # If no valid vectors, append a zero vector
        if len(valid_vectors) == 0:
            embeddings.append(np.zeros((300,)))
        else:
            average_embedding = np.mean(valid_vectors, axis=0)
            embeddings.append(average_embedding)

    return embeddings

# Load the term to ID mapping and indexed terms
with open("/home/stirunag/work/github/ML_annotations/normalisation/dictionary/uniprot_terms.pkl", "rb") as infile:
    data = pickle.load(infile)
    term_to_id = data["term_to_id"]
    indexed_terms = data["indexed_terms"]

# Load the FAISS index
index = faiss.read_index("/home/stirunag/work/github/ML_annotations/normalisation/dictionary/uniprot_terms.index")

def retrieve_similar_terms(query, k=5):
    """Retrieve top k similar terms given a query."""
    # Convert query to lowercase
    query = query.lower()
    
    # Get average embedding of the query
    query_embedding = get_average_embeddings_batched([query])
    
    norm = np.linalg.norm(query_embedding)
    query_embedding = query_embedding if norm == 0 else query_embedding / norm
    query_embedding = query_embedding.reshape(1, -1).astype('float32')

    # Search the index
    D, I = index.search(query_embedding, k)
    
    similar_terms = []
    for i in range(k):
        term = indexed_terms[I[0][i]]
        score = D[0][i]
        term_id = term_to_id[term]
        similar_terms.append((term, term_id, score))
    
    return similar_terms


def retrieve_similar_terms_with_fuzzy(query, k):
    """Retrieve k terms similar to the query."""
    query = query
    
    # Get average embedding of the query
    query_embedding = get_average_embeddings_batched([query])
    
    norm = np.linalg.norm(query_embedding)
    query_embedding = query_embedding if norm == 0 else query_embedding / norm
    query_embedding = query_embedding.reshape(1, -1).astype('float32')

    # Search the index
    D, I = index.search(query_embedding, k)
    
    # Retrieve the terms from the indexed_terms list
    candidate_terms = [indexed_terms[i] for i in I[0]]
    
    # Get fuzzy matching scores for these terms
    scores = [fuzz.ratio(query, term) for term in candidate_terms]
    
    # Pair up terms with their scores
    term_score_pairs = list(zip(candidate_terms, scores))
    
    # Rank these pairs based on scores
    ranked_term_score_pairs = sorted(term_score_pairs, key=lambda x: x[1], reverse=True)
    
    return ranked_term_score_pairs[:k]


In [157]:
# For demonstration:
query = "IL-1"
results = retrieve_similar_terms_with_fuzzy(query, 10)
for term, score in results:
    print(f"Term: {term}, Score: {score}")

Term: cil-1, Score: 89
Term: natSil-1, Score: 67
Term: epl-1, Score: 67
Term: blos-1, Score: 60
Term: Msil_2912, Score: 46
Term: Ccr1l1, Score: 40
Term: Meg1, Score: 25
Term: Atperox P61, Score: 13
Term: bax, Score: 0
Term: norpA, Score: 0


In [20]:
import faiss
import pickle
import spacy
import numpy as np
from fuzzywuzzy import fuzz

# Load spaCy model
nlp = spacy.load(path_floret_model)

# Define mapping of annotation type to corresponding file paths
file_mapping = {
    'CD': ('chebi_terms.index', 'chebi_terms.pkl'),
    'OG': ('NCBI_terms.index', 'NCBI_terms.pkl'),
    'DS': ('umls_terms.index', 'umls_terms.pkl'),
    'GP': ('uniprot_terms.index', 'uniprot_terms.pkl')
}

# Dictionary to hold the loaded data for each annotation type
loaded_data = {}

# Load all necessary files at the beginning
base_path = "/home/stirunag/work/github/CAPITAL/normalisation/dictionary/"
for annotation_type, (index_file, pkl_file) in file_mapping.items():
    with open(base_path + pkl_file, "rb") as infile:
        data = pickle.load(infile)
    index = faiss.read_index(base_path + index_file)
    loaded_data[annotation_type] = {
        "term_to_id": data["term_to_id"],
        "indexed_terms": data["indexed_terms"],
        "index": index,
        "indexed_terms_ids": [(term, data["term_to_id"][term]) for term in data["indexed_terms"]]
    }
    print(f"Loaded data for {annotation_type}")

def get_average_embeddings_batched(terms):
    docs = list(nlp.pipe(terms))
    embeddings = []
    for doc in docs:
        valid_vectors = [token.vector for token in doc if token.has_vector and token.vector_norm != 0 and token.vector.shape[0] == 300]
        embeddings.append(np.mean(valid_vectors, axis=0) if valid_vectors else np.zeros((300,)))
    return embeddings

def retrieve_similar_terms_with_fuzzy_batched(terms, annotation_type, k=3):
    data = loaded_data[annotation_type]
    term_to_id, indexed_terms, index, indexed_terms_ids = data["term_to_id"], data["indexed_terms"], data["index"], data["indexed_terms_ids"]

    # Map transformed terms to original terms
    original_to_transformed = {}
    transformed_terms = []

    # Check for entity groups that need transformation
    for term in terms:
        if annotation_type in ['CD', 'OG', 'DS']:
            transformed_term = term.lower()
            original_to_transformed[transformed_term] = term
            transformed_terms.append(transformed_term)
        else:
            original_to_transformed[term] = term
            transformed_terms.append(term)

    term_embeddings = get_average_embeddings_batched(transformed_terms)
    normalized_embeddings = [emb / np.linalg.norm(emb) if np.linalg.norm(emb) != 0 else emb for emb in term_embeddings]
    D, I = index.search(np.array(normalized_embeddings).astype('float32'), k)

    results = {}
    for idx, transformed_term in enumerate(transformed_terms):
        original_term = original_to_transformed[transformed_term]
        candidate_terms_and_ids = [indexed_terms_ids[i] for i in I[idx]]
        candidate_terms, candidate_ids = zip(*candidate_terms_and_ids)
        scores = [fuzz.ratio(transformed_term, c_term.lower()) for c_term in candidate_terms]
        results[original_term] = sorted(list(zip(candidate_terms, scores, candidate_ids)), key=lambda x: x[1], reverse=True)[:k]

    return results

Loaded data for CD
Loaded data for OG
Loaded data for DS
Loaded data for GP


In [34]:
# terms = ['hypertension', 'covid-19', 'Coronavirus']
# annotation_type = 'DS'

terms = ['p53', 'P53']
annotation_type = 'GP'


results = retrieve_similar_terms_with_fuzzy_batched(terms, annotation_type, k=2)
results

{'p53': [('p53', 100, 'Q8JN60'), ('Phosphoprotein p53', 29, 'P04637')],
 'P53': [('P53', 67, 'Q42578'), ('Atperox P53', 29, 'Q42578')]}

In [109]:
# !pip install fuzzywuzzy python-Levenshtein

Collecting fuzzywuzzy
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.22.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.22.0
  Downloading Levenshtein-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (172 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.9/172.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting rapidfuzz<4.0.0,>=2.3.0
  Downloading rapidfuzz-3.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: fuzzywuzzy, rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.22.0 fuzzywuzzy-0.18.0 python-Levenshtein-0.22.0 rapidfuzz-3.3.1

[1m[[0m[34;49mnotic

In [38]:
# !pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [97]:
import numpy as np

def Embed(A, L):
    T = len(A)
    K = T - L + 1
    N = len(A)
    X = np.zeros((L, K))
    
    for i in range(K):
        X[:, i] = A[i:L+i]
    
    TM = np.hstack([X[:, 0].reshape(-1, 1), X])
    TM = TM.T
    
    return TM, K

# Example of use:
A = np.array([1,2,3,4,5,6,7,8,9,10])
L = 3
TM, K = Embed(A, L)
print("TM:")
print(TM[1::])
print("K:", K)


TM:
[[ 1.  2.  3.]
 [ 2.  3.  4.]
 [ 3.  4.  5.]
 [ 4.  5.  6.]
 [ 5.  6.  7.]
 [ 6.  7.  8.]
 [ 7.  8.  9.]
 [ 8.  9. 10.]]
K: 8


In [101]:
import numpy as np


def Hankelize(TM):
    N, M = TM.shape

    # Prepare an array to store summation and count of each diagonal
    summation = np.zeros(N + M - 1)
    count = np.zeros(N + M - 1)

    # Iterate over the TM matrix to populate the summation and count arrays
    for i in range(N):
        for j in range(M):
            summation[i + j] += TM[i, j]
            count[i + j] += 1

    # Element-wise division to get the average
    HM = summation / count

    return HM


# Example of use:

result = Hankelize(TM[1::])
print(result)


[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
