In [31]:
import os
import numpy as np

# Directory that holds the input data files; a relative path, so it is
# resolved against the notebook's current working directory.
DATA_DIR = '../data'

### Try TF-IDF

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Simulate one domain record as a plain dictionary for simplicity.
data = {
    # Accession of the record. Fixed from 'c100075' (digit "1" typed in place
    # of the letter "l"); the rest of the notebook refers to 'cl00075'.
    'Domain_ID': 'cl00075',
    'Short_Name': 'HATPase Superfamily',
    'Full_Name': 'Histidine kinase-like ATPase domain',
    'Description': 'This superfamily includes the histidine kinase-like ATPase (HATPase) domains of several ATP-binding proteins such as histidine kinase, DNA gyrase B, topoisomerases, heat shock protein 90 (HSP90), phytochrome-like ATPases and DNA mismatch repair proteins. Domains belonging to this superfamily are also referred to as GHKL (gyrase, heat-shock protein 90, histidine kinase, MutL) ATPase domains.',
    'Curated_CD_Hierarchy': ['cd16915', 'cd16916', 'cd16917']
}

# TF-IDF on the textual features. Each of the three text fields is passed as
# its own document, so the matrix has one row per field and one column per
# distinct term found across the fields.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(
    [data['Short_Name'], data['Full_Name'], data['Description']]
)

print(tfidf_matrix.shape)  # (number of documents, number of terms)
vectorizer.get_feature_names_out()

# One-hot encoding of CDs
# from sklearn.preprocessing import OneHotEncoder
# cd_encoder = OneHotEncoder(handle_unknown='ignore') 
# cd_matrix = cd_encoder.fit_transform(df[['Curated_CD_Hierarchy']])

# # ... More advanced techniques for text embeddings and hierarchies

# # Concatenate the results (assuming column alignment):
# import numpy as np
# feature_vector = np.concatenate([tfidf_matrix.toarray(), cd_matrix.toarray()], axis=1)


(3, 38)


array(['90', 'also', 'and', 'are', 'as', 'atp', 'atpase', 'atpases',
       'belonging', 'binding', 'dna', 'domain', 'domains', 'ghkl',
       'gyrase', 'hatpase', 'heat', 'histidine', 'hsp90', 'includes',
       'kinase', 'like', 'mismatch', 'mutl', 'of', 'phytochrome',
       'protein', 'proteins', 'referred', 'repair', 'several', 'shock',
       'such', 'superfamily', 'the', 'this', 'to', 'topoisomerases'],
      dtype=object)

### Try Hugging Face's tokenizers with Word2Vec embeddings

In [33]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')  # Download the punkt tokenizer models if you haven't already
from nltk.tokenize import word_tokenize

# Initialize a BPE tokenizer. `unk_token` is set explicitly so that input the
# trained vocabulary cannot represent is emitted as "<unk>" instead of being
# silently dropped from the token sequence. "<unk>" is also registered as a
# special token in the trainer below, so it is guaranteed to be in the vocab.
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

# Customize pre-tokenization: words are isolated by the Whitespace
# pre-tokenizer before BPE merges are learned or applied.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer on the corpus file, located through DATA_DIR (defined in
# the configuration cell) rather than a second hard-coded copy of the path.
trainer = trainers.BpeTrainer(
    vocab_size=1000,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
tokenizer.train(files=[os.path.join(DATA_DIR, "cl00075")], trainer=trainer)

# Persist the trained tokenizer, then reload it from disk so the rest of the
# notebook works with exactly what a later session would load.
tokenizer.save("my_tokenizer.json")
tokenizer = Tokenizer.from_file("my_tokenizer.json")

def tokenize_sentence(text):
    """Tokenize `text` with the module-level `tokenizer`.

    Args:
        text: The string to encode.

    Returns:
        The `tokens` attribute of the encoding returned by
        `tokenizer.encode(text)`.
    """
    return tokenizer.encode(text).tokens

# Tokenize the corpus text file: every line (stripped of surrounding
# whitespace) becomes one token list, in file order. The file is located
# through DATA_DIR instead of repeating the hard-coded path.
with open(os.path.join(DATA_DIR, "cl00075"), "r", encoding="utf-8") as corpus_file:
    tokenized_corpus = [tokenize_sentence(line.strip()) for line in corpus_file]

# Train the Word2Vec model on the tokenized lines.
# min_count=1 keeps tokens that occur only once in the corpus.
# NOTE(review): per the gensim documentation, training with more than one
# worker thread is not bit-for-bit reproducible between runs; use workers=1
# if the exact vectors need to be repeatable.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Save the trained Word2Vec model
model.save("word2vec_model.bin")

# Debug aid (disabled): dump every tokenized corpus line.
# print(tokenized_corpus)

# Example (disabled): tokens most similar to 'cl00075'.
# similar_words = model.wv.most_similar('cl00075')
# print(similar_words)

# Look up the learned vector for the token 'cl00075' and print it with its type.
# NOTE(review): this lookup presumably raises KeyError when the BPE tokenizer
# did not keep 'cl00075' as a single token - confirm, or guard with
# `'cl00075' in model.wv` first.
word_vector = model.wv['cl00075']
print(word_vector, type(word_vector))

# Example (disabled): similarity query that combines several positive tokens.
# result = model.wv.most_similar(positive=['cl00075', 'HATPase'], negative=[])
# print(result)

# Show how the trained tokenizer splits an example domain name into tokens.
domain_name = 'Histidine kinase-like ATPase domain'
enco = tokenizer.encode(domain_name)
print(enco.tokens)

def vectorize_sentence(sentence, word2vec_model):
    """Embed a sentence as the mean of its known token vectors.

    The sentence is split with `tokenize_sentence`; tokens that are missing
    from `word2vec_model.wv` are skipped.

    Args:
        sentence: Text to embed.
        word2vec_model: Object exposing `wv` (supports `in` and indexing by
            token) and `vector_size`.

    Returns:
        A NumPy array of length `word2vec_model.vector_size` holding the
        average of the vectors of all known tokens, or all zeros when no
        token of the sentence is known.
    """
    keyed_vectors = word2vec_model.wv
    known_vectors = [
        keyed_vectors[piece]
        for piece in tokenize_sentence(sentence)
        if piece in keyed_vectors
    ]

    # Accumulate in a zero-initialised buffer, one vector at a time.
    pooled = np.zeros(word2vec_model.vector_size)
    for vector in known_vectors:
        pooled += vector

    # Nothing to average: hand back the untouched zero vector.
    if not known_vectors:
        return pooled

    pooled /= len(known_vectors)
    return pooled

# Print the averaged Word2Vec embedding of the example domain name.
print(vectorize_sentence(domain_name, model))




[ 1.3406968e-03  6.6458713e-03  1.0026930e-02  9.0288976e-03
 -8.0215409e-03  6.3489946e-03 -5.5392282e-03 -7.7658350e-04
  3.1031377e-04  6.5166638e-03  4.4480595e-03  4.5155901e-03
  9.4607985e-03  4.7957644e-04 -6.0302089e-03 -6.2805046e-03
  6.4116628e-03 -5.2691679e-03 -3.0001425e-03  3.9094579e-03
 -2.2884510e-03 -5.9413384e-03 -2.2987670e-03  1.1743887e-03
  2.1965255e-03  6.0816943e-03 -5.1992629e-03  2.9741039e-03
  7.2287843e-03  2.1823258e-03  5.4821507e-03 -4.8716874e-03
  6.2371404e-03 -7.6808650e-03  3.4848591e-03 -9.2068017e-03
 -2.4689827e-03 -9.0560261e-03 -1.5378548e-03 -5.5116522e-03
 -3.9607170e-03  1.1452249e-03  2.7757741e-03 -1.5183004e-03
 -8.1028361e-03 -5.9405873e-03  7.6286914e-04 -3.9096614e-03
 -9.3703652e-03 -7.6693407e-04  6.6344640e-03  5.9785587e-03
 -9.9103302e-03  3.0971617e-03 -6.0076900e-03 -9.1484087e-03
  1.9001009e-04 -3.9398295e-04 -7.0375144e-03 -6.1901347e-03
 -2.3787369e-03  7.1782465e-03 -7.5053875e-03  7.6678963e-03
 -4.9450807e-04  1.17

[nltk_data] Downloading package punkt to /home/mingzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
