# Word embedding and PCA

In [None]:
# import packages
import pandas as pd
import os
import numpy as np
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel
import torch
from sklearn.decomposition import PCA

In [None]:
# import internal modules
import file_path_management as fpath
import public_library as plib
import parameters as params

## Predefined functions:

In [None]:
# Load the pretrained BERT tokenizer and model once, at module level.
# `bert_tokenizer` and `bert_model` are read as globals by the functions
# defined further down (segment splitting and segment embedding), so these
# names must stay in sync with those functions.
model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

In [None]:
def complete_sent(sent):
    """Return ``sent`` guaranteed to end with sentence-final punctuation.

    A string that already ends with '.', '?' or '!' is returned unchanged;
    any other string (including the empty string) gets a single '.' appended.
    No whitespace is stripped before the check.
    """
    terminators = ('.', '?', '!')
    return sent if sent.endswith(terminators) else sent + '.'
# # --test--
# text = "distribution of the dopamine innervation in the macaque and human thalamus,"
# complete_sentence = complete_sent(text)
# print(complete_sentence)
# # --test--

In [None]:
def split_text_into_short_segments(text, max_tokens=400):
    """Split ``text`` into consecutive segments of whole sentences.

    The text is normalised with ``plib.process_text(..., lower=True)``, split
    into sentences with NLTK's ``sent_tokenize``, and the sentences are then
    greedily packed into segments: a sentence is added to the current segment
    as long as the BERT token count of the result does not exceed
    ``max_tokens``; otherwise the current segment is closed and the sentence
    starts a new one.

    Args:
        text: The raw text to split.
        max_tokens: Upper bound on the number of tokens (as counted by
            ``bert_tokenizer.tokenize``) per segment.

    Returns:
        A list of non-empty segment strings, in document order. A single
        sentence that is longer than ``max_tokens`` on its own is kept as one
        oversized segment rather than being cut mid-sentence.
    """
    text = plib.process_text(text, lower=True)
    sentences = sent_tokenize(text)

    segments = []
    current_segment = ""

    for sentence in sentences:
        # Every sentence is terminated so that joined sentences stay separable.
        sentence = complete_sent(sentence)

        # Token count the segment would have if this sentence were added.
        potential_segment = (current_segment + " " + sentence).strip()
        tokens = bert_tokenizer.tokenize(potential_segment)

        if len(tokens) > max_tokens:
            # Close the current segment. It is still empty when the very first
            # sentence is already over the limit; skip it in that case so no
            # empty segment is emitted (it would otherwise be embedded and
            # averaged into the document vector downstream).
            if current_segment:
                segments.append(current_segment.strip())
            current_segment = sentence
        else:
            current_segment = potential_segment

    # Flush whatever is left after the last sentence.
    if current_segment:
        segments.append(current_segment.strip())

    return segments
# # --test--
# text = "distribution of the dopamine innervation in the macaque and human thalamus. fax: +34 91 497 53 15. we recently defined the thalamic dopaminergic system in primates; it arises from numerous dopaminergic cell groups and selectively targets numerous thalamic nuclei. given the central position of the thalamus in subcortical and cortical interplay, and the functional relevance of dopamine neuromodulation in the brain, detailing dopamine distribution in the thalamus should supply important information. to this end we performed immunohistochemistry for dopamine and the dopamine transporter in the thalamus of macaque monkeys and humans to generate maps, in the stereotaxic coronal plane, of the distribution of dopaminergic axons. the dopamine innervation of the thalamus follows the same pattern in both species and is most dense in midline limbic nuclei, the mediodorsal and lateral posterior association nuclei, and in the ventral lateral and ventral anterior motor nuclei. this distribution suggests that thalamic dopamine has a prominent role in emotion, attention, cognition and complex somatosensory and visual processing, as well as in motor control. most thalamic dopaminergic axons are thin and varicose and target both the neuropil and small blood vessels, suggesting that, besides neuronal modulation, thalamic dopamine may have a direct influence on microcirculation. the maps provided here should be a useful reference in future experimental and neuroimaging studies aiming at clarifying the role of the thalamic dopaminergic system in health and in conditions involving brain dopamine, including parkinsons disease, drug addiction and schizophrenia. 
keywords dopamine thalamus monkey human primate dopamine transporter parkinson schizophrenia addiction introduction the thalamus is made up of multiple nuclei relaying information from subcortical centers or from other cortices to the cerebral cortex (sherman and guillery, 2005), as well as the striatum, the nucleus accumbens and the amygdala (steriade et al., 1997). in addition to specific subcortical and cortical afferents, the primate thalamus receives axons containing the neuromodulators acetylcholine (heckers et al., 1992), histamine (manning et al., 1996), serotonin (morrison and foote, 1986; lavoie and parent, 1991), and the catecholamines adrenaline (rico and cavada, 1998a), noradrenaline (morrison and foote, 1986; ginsberg et al., 1993) and dopamine (snchez-gonzlez et al., 2005). until recently, the existence of significant dopamine innervation in the primate thalamus has been largely ignored, probably because dopamine innervation of the rodent thalamus is very scant (groenewegen, 1988; papadopoulos and parnavelas, 1990). however, fragmentary data scattered through the literature endorse the presence of dopamine innervation in the primate thalamus. postmortem biochemical studies showed the presence of dopamine in the thalamus of macaques (brown et al., 1979; goldman-rakic and brown, 1981; pifl et al., 1990, 1991) and human subjects (oke and adams, 1987). later, receptor binding and in situ hybridization analyses detected the presence of dopamine d2-like (joyce et al., 1991; kessler et al., 1993; hall et al., 1996; langer et al., 1999; rieck et al., 2004) and d3-like receptors (gurevich and joyce, 1999) in several human thalamic nuclei. 
positron emission tomography (pet) radioligand studies have also demonstrated the presence of the dopamine transporter (dat) (wang et al., 1995; halldin et al., 1996; helfenbein et al., 1999; brownell et al., 2003) and of d2-like receptors (farde et al., 1997; langer et al., 1999; okubo et al., 1999; brownell et al., 2003; rieck et al., 2004) in the human and macaque thalamus. in the course of pet studies focusing on schizophrenia, d2- and d3-like radioligand binding was also found in the thalamus of control subjects (talvik et al., 2003; yasuno et al., 2004). finally, an immunohistochemical study using anti-dat antibodies detected the presence of dopaminergic axons in the mediodorsal nucleus (md) of the macaque thalamus (melchitzky and lewis, 2001)."
# segments = split_text_into_short_segments(text, max_tokens=400)
# for i, segment in enumerate(segments):
#     print(i, segment)
# # --test--

In [None]:
def embed_text_segment(text_segment):
    """Embed one text segment with the module-level BERT model.

    The segment is encoded by ``bert_tokenizer`` (PyTorch tensors, with
    truncation and padding enabled), passed through ``bert_model`` with
    gradient tracking disabled, and the model's ``last_hidden_state`` is
    averaged over dimension 1.

    Returns:
        A ``torch.Tensor`` holding the mean of ``last_hidden_state`` along
        dim 1.
        NOTE(review): presumably shape (1, hidden_size) for a single string,
        averaged over every token position the tokenizer emits -- confirm
        against the transformers API.
    """
    encoded = bert_tokenizer(
        text_segment,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
        return hidden_states.mean(dim=1)
# # --test--
# text_segment = "distribution of the dopamine innervation in the macaque and human thalamus. fax: +34 91 497 53 15. we recently defined the thalamic dopaminergic system in primates; it arises from numerous dopaminergic cell groups and selectively targets numerous thalamic nuclei. given the central position of the thalamus in subcortical and cortical interplay, and the functional relevance of dopamine neuromodulation in the brain, detailing dopamine distribution in the thalamus should supply important information."
# text_segment_embedding = embed_text_segment(text_segment)
# print(text_segment_embedding.shape)
# print(text_segment_embedding)
# # --test--

In [None]:
def embedd_text(text):
    """Embed a whole text as the mean of its segment embeddings.

    The text is normalised with ``plib.process_text(..., lower=True)``, cut
    into segments of at most 400 tokens by ``split_text_into_short_segments``,
    each segment is embedded with ``embed_text_segment``, and the segment
    embeddings are stacked and averaged along the new leading dimension.

    Returns:
        A ``torch.Tensor`` with the same shape as a single segment embedding.

    Note:
        ``torch.stack`` is called on the list of segment embeddings, so a text
        that yields no segments makes this function raise.
    """
    cleaned = plib.process_text(text, lower=True)

    # One embedding per segment (a segment is a run of whole sentences).
    per_segment = [
        embed_text_segment(segment)
        for segment in split_text_into_short_segments(cleaned, max_tokens=400)
    ]

    # Unweighted average over segments gives the document-level vector.
    return torch.stack(per_segment).mean(dim=0)
# # --test--
# text = "distribution of the dopamine innervation in the macaque and human thalamus. fax: +34 91 497 53 15. we recently defined the thalamic dopaminergic system in primates; it arises from numerous dopaminergic cell groups and selectively targets numerous thalamic nuclei. given the central position of the thalamus in subcortical and cortical interplay, and the functional relevance of dopamine neuromodulation in the brain, detailing dopamine distribution in the thalamus should supply important information."
# text_embedding = embedd_text(text)
# print(text_embedding.shape)
# print(text_embedding)
# # --test--

In [None]:
def get_t_a_k(index):
    """Look up title, abstract and keywords of one paper in the literature DB.

    The tab-separated file at ``fpath.poten_litera_db`` is read on every call
    and the first row whose ``INDEX`` column (cast to int) equals ``index`` is
    used.

    Args:
        index: Integer value to match against the ``INDEX`` column.

    Returns:
        A ``(title, abstract, keywords)`` tuple. A field that is missing in
        the file (NaN after parsing) is returned as the empty string.

    Raises:
        IndexError: If no row has the requested ``INDEX`` value.
    """
    db_path = fpath.poten_litera_db
    df = pd.read_csv(db_path, header=0, sep="\t")

    # Build the row filter once and reuse it for all three columns.
    matching_rows = df.loc[df["INDEX"].astype(int) == index]
    if matching_rows.empty:
        raise IndexError(f"no row with INDEX == {index} in {db_path}")

    # Read each column separately so that every value keeps its own dtype.
    fields = (
        matching_rows[column].values[0]
        for column in ("TITLE", "ABSTRACT", "KEYWORDS")
    )

    # Missing cells are parsed as NaN; callers expect "" instead.
    title, abstract, keywords = ("" if pd.isna(value) else value for value in fields)

    return title, abstract, keywords
# # --------------------Start of test code--------------------
# index = 2
# title, abstract, keywords = get_t_a_k(index)
# print(title)
# print(abstract)
# print(keywords)
# # ---------------------End of test code---------------------

In [None]:
def get_text(index, title, abstract, keywords):
    """Assemble the two texts that are embedded for one paper.

    Args:
        index: Paper index; ``"<index>.txt"`` is the file name looked up in
            ``fpath.processed_texts_of_length_500_folder`` and
            ``fpath.relevant_text_folder``.
        title, abstract, keywords: Metadata strings; ``""`` means "missing".

    Returns:
        A ``(text_tak_500, text_relevant)`` tuple:

        * ``text_tak_500`` is the processed "title. abstract. keywords." text
          when the abstract is non-empty; otherwise the processed content of
          the length-500 file, or ``""`` if that file does not exist.
        * ``text_relevant`` is the processed content of the relevant-text
          file, or ``""`` if that file does not exist.
    """
    def _read_processed(path):
        # A missing file just means there is no text of this kind.
        if not os.path.exists(path):
            return ""
        with open(path, "r", encoding='ascii') as handle:
            raw = handle.read()
        return plib.process_text(raw, lower=True)

    file_name = str(index) + ".txt"
    path_500 = os.path.join(fpath.processed_texts_of_length_500_folder, file_name)
    path_relevant = os.path.join(fpath.relevant_text_folder, file_name)

    # Title/abstract/keywords text is only built when there is an abstract;
    # each non-empty field is closed with punctuation and followed by a space.
    text_tak = ""
    if abstract != "":
        pieces = [
            complete_sent(part) + " "
            for part in (title, abstract, keywords)
            if part != ""
        ]
        text_tak = plib.process_text("".join(pieces), lower=True)

    text_500 = _read_processed(path_500)
    text_relevant = _read_processed(path_relevant)

    # Prefer the metadata text; fall back to the length-500 text.
    text_tak_500 = text_tak if text_tak != "" else text_500

    return text_tak_500, text_relevant
# # --------------------Start of test code--------------------
# index = 0
# title, abstract, keywords = get_t_a_k(index)
# text_tak_500, text_relevant = get_text(index, title, abstract, keywords)
# print(text_tak_500)
# print(text_relevant)
# # ---------------------End of test code---------------------

In [None]:
def save_embedding(data_array, index_list, name, save_path):
    """Write an array and its matching index list into ``save_path``.

    Two files are created:

    * ``<name>.npy`` -- ``data_array`` saved with ``np.save``.
    * ``<name>_index_list.txt`` -- the items of ``index_list`` formatted as
      text and joined by commas, with no trailing comma or newline.
    """
    array_file = os.path.join(save_path, name + '.npy')
    index_file = os.path.join(save_path, name + '_index_list.txt')

    np.save(array_file, data_array)

    with open(index_file, 'w') as f:
        f.write(",".join(f"{item}" for item in index_list))

In [None]:
def transform_count_list_and_save(input_file_name):
    """Apply a capped log transform to a saved count array and save the result.

    Loads ``<input_file_name>.npy`` from ``fpath.embedding_and_pca_folder``,
    maps every entry ``c`` to ``log10(min(c + 1, 10))`` and writes the result
    to ``trans_<input_file_name>.npy`` in the same folder. Under this mapping
    a count of 0 becomes 0 and any count of 9 or more becomes 1.
    """
    folder = fpath.embedding_and_pca_folder
    source_file = os.path.join(folder, input_file_name + '.npy')
    target_file = os.path.join(folder, 'trans_' + input_file_name + '.npy')

    counts = np.load(source_file)

    # Shift by one so that zero counts are defined under the log, then cap at
    # 10 so the transformed value never exceeds 1.
    capped = np.minimum(counts + 1, 10)

    np.save(target_file, np.log10(capped))

## Main program:

Q: How does BERT deal with words it has not seen before? <br>

A: BERT does not provide word-level representations; it provides subword representations. When BERT is presented with an unseen word, its tokenizer splits the word into multiple subwords, going down to single-character subwords if needed. This is how BERT handles out-of-vocabulary words. Other questions and answers on the site this answer comes from cover the implementation details of BERT's subword tokenization (the original links were not preserved in this note).

### Text embedding for testing_set_1000.csv

In [None]:
# test_1000 = fpath.poten_litera_testing_set_1000_labeled
# df_1000 = pd.read_csv(test_1000, header=0, sep=",")

# # tak_embeddings_1000 and tak_embeddings_index_list_1000
# tak_500_embeddings_1000 = []
# tak_500_embeddings_index_list_1000 = []

# # relevant_text_embeddings_1000 and relevant_text_embeddings_index_list_1000
# relevant_text_embeddings_1000 = []
# relevant_text_embeddings_index_list_1000 = []

# for ind in df_1000.index:
#     # get the text
#     index = int(df_1000.at[ind, "INDEX"])
#     title, abstract, keywords = get_t_a_k(index)
#     text_tak_500, text_relevant = get_text(index, title, abstract, keywords)
    
#     # embed text_tak_500
#     if text_tak_500 == text_tak_500 and text_tak_500 != "":
#         embedding = embedd_text(text_tak_500)
#         tak_500_embeddings_1000.append(embedding)
#         tak_500_embeddings_index_list_1000.append(index)
    
#     # embed relevant_text
#     if text_relevant == text_relevant and text_relevant != "":
#         embedding = embedd_text(text_relevant)
#         relevant_text_embeddings_1000.append(embedding)
#         relevant_text_embeddings_index_list_1000.append(index)

# # Save the results
# save_path = fpath.embedding_and_pca_folder
# # Save the tak_embeddings_1000
# save_embedding(tak_500_embeddings_1000, tak_500_embeddings_index_list_1000, 'tak_500_embeddings_1000', save_path)
# # Save the relevant_text_embeddings_1000
# save_embedding(relevant_text_embeddings_1000, relevant_text_embeddings_index_list_1000, 'relevant_text_embeddings_1000', save_path)

### Text embedding for poten_litera_db.csv

In [None]:
# db_path = fpath.poten_litera_db
# df_db = pd.read_csv(db_path, header=0, sep="\t")

# # tak embeddings and index list
# tak_embeddings_db = []
# tak_embeddings_index_list_db = []

# # relevant text embeddings and index list
# relevant_text_embeddings_db = []
# relevant_text_embeddings_index_list_db = []

# for ind in df_db.index:
#     # get the text
#     index = int(df_db.at[ind, "INDEX"])
#     title, abstract, keywords = get_t_a_k(index)
#     text_tak_500, text_relevant = get_text(index, title, abstract, keywords)
    
#     if text_tak_500 == text_tak_500 and text_tak_500 != "":
#         embedding = embedd_text(text_tak_500)
#         tak_embeddings_db.append(embedding)
#         tak_embeddings_index_list_db.append(index)
        
#     if text_relevant == text_relevant and text_relevant != "":
#         embedding = embedd_text(text_relevant)
#         relevant_text_embeddings_db.append(embedding)
#         relevant_text_embeddings_index_list_db.append(index)

# # Save the results
# save_path = fpath.embedding_and_pca_folder
# # Save the tak_embeddings_db
# save_embedding(tak_embeddings_db, tak_embeddings_index_list_db, 'tak_embeddings_db', save_path)
# # Save the relevant_text_embeddings_db     
# save_embedding(relevant_text_embeddings_db, relevant_text_embeddings_index_list_db, 'relevant_text_embeddings_db', save_path)

### Keywords count transformation for testing_set_1000.csv

In [None]:
# db_count_path = fpath.poten_litera_db_kw_count
# df_count = pd.read_csv(db_count_path, header=0, sep='\t')

# test_1000_path = fpath.poten_litera_testing_set_1000_labeled
# df_1000 = pd.read_csv(test_1000_path, header=0, sep=",")

# key_list = list(params.ranking_kw_groups.keys())

# count_list_tak_1000 = []
# count_list_tak_index_list_1000 = []
# count_list_500_1000 = []
# count_list_500_index_list_1000 = []
# count_list_full_text_1000 = []
# count_list_full_text_index_list_1000 = []

# for ind in df_1000.index:
#     index = int(df_1000.at[ind, "INDEX"])
    
#     # Get the ind in the df_count with the same index in the "INDEX" column
#     ind_in_df_count = df_count.index[df_count["INDEX"].astype(int) == index].tolist()[0]
    
#     # Get count_list_tak
#     count_tak = []
#     if df_count.at[ind_in_df_count, "MACAQUE_COUNT_IN_TAK"] == df_count.at[ind_in_df_count, "MACAQUE_COUNT_IN_TAK"]:
#         count_list_tak_index_list_1000.append(index)
#         for key in key_list:
#             count_tak.append(int(df_count.at[ind_in_df_count, key+"_COUNT_IN_TAK"]))
#         count_list_tak_1000.append(count_tak)
    
#     # Get count_list_500
#     count_500 = []
#     if df_count.at[ind_in_df_count, "MACAQUE_COUNT_IN_500"] == df_count.at[ind_in_df_count, "MACAQUE_COUNT_IN_500"]:
#         count_list_500_index_list_1000.append(index)
#         for key in key_list:
#             count_500.append(int(df_count.at[ind_in_df_count, key+"_COUNT_IN_500"]))
#         count_list_500_1000.append(count_500)

#     # Get count_list_full_text
#     count_full_text = []      
#     if df_count.at[ind_in_df_count, "MACAQUE_COUNT_IN_FULL_TEXT"] == df_count.at[ind_in_df_count, "MACAQUE_COUNT_IN_FULL_TEXT"]:
#         count_list_full_text_index_list_1000.append(index)
#         for key in key_list:
#             count_full_text.append(int(df_count.at[ind_in_df_count, key+"_COUNT_IN_FULL_TEXT"]))
#         count_list_full_text_1000.append(count_full_text)
    
# # Save the results
# save_path = fpath.embedding_and_pca_folder
# # Save the count_list_tak_1000
# count_list_tak_1000_array = np.array(count_list_tak_1000)
# save_embedding(count_list_tak_1000_array, count_list_tak_index_list_1000, 'count_list_tak_1000', save_path)
# # Save the count_list_500_1000
# count_list_500_1000_array = np.array(count_list_500_1000)
# save_embedding(count_list_500_1000_array, count_list_500_index_list_1000, 'count_list_500_1000', save_path)    
# # Save the count_list_full_text_1000
# count_list_full_text_1000_array = np.array(count_list_full_text_1000)
# save_embedding(count_list_full_text_1000_array, count_list_full_text_index_list_1000, 'count_list_full_text_1000', save_path)    

### Keywords count transformation for poten_litera_db.csv

In [None]:
# input_path = fpath.poten_litera_db_kw_count
# df_count = pd.read_csv(input_path, header=0, sep='\t')

# key_list = list(params.ranking_kw_groups.keys())

# count_list_tak_db = []
# count_list_tak_index_list_db = []
# count_list_500_db = []
# count_list_500_index_list_db = []
# count_list_full_text_db = []
# count_list_full_text_index_list_db = []

# for ind in df_count.index:
#     index = int(df_count.at[ind, "INDEX"])
    
#     # Get count_list_tak
#     count_tak = []
#     if df_count.at[ind, "MACAQUE_COUNT_IN_TAK"] == df_count.at[ind, "MACAQUE_COUNT_IN_TAK"]:
#         count_list_tak_index_list_db.append(index)
#         for key in key_list:
#             count_tak.append(int(df_count.at[ind, key+"_COUNT_IN_TAK"]))
#         count_list_tak_db.append(count_tak)
    
#     # Get count_list_500
#     count_500 = []
#     if df_count.at[ind, "MACAQUE_COUNT_IN_500"] == df_count.at[ind, "MACAQUE_COUNT_IN_500"]:
#         count_list_500_index_list_db.append(index)
#         for key in key_list:
#             count_500.append(int(df_count.at[ind, key+"_COUNT_IN_500"]))
#         count_list_500_db.append(count_500)

#     # Get count_list_full_text
#     count_full_text = []      
#     if df_count.at[ind, "MACAQUE_COUNT_IN_FULL_TEXT"] == df_count.at[ind, "MACAQUE_COUNT_IN_FULL_TEXT"]:
#         count_list_full_text_index_list_db.append(index)
#         for key in key_list:
#             count_full_text.append(int(df_count.at[ind, key+"_COUNT_IN_FULL_TEXT"]))
#         count_list_full_text_db.append(count_full_text)
    
# # Save the results
# save_path = fpath.embedding_and_pca_folder
# # Save the count_list_tak_db
# count_list_tak_db_array = np.array(count_list_tak_db)
# save_embedding(count_list_tak_db_array, count_list_tak_index_list_db, 'count_list_tak_db', save_path)
# # Save the count_list_500_db
# count_list_500_db_array = np.array(count_list_500_db)
# save_embedding(count_list_500_db_array, count_list_500_index_list_db, 'count_list_500_db', save_path)    
# # Save the count_list_full_text_db
# count_list_full_text_db_array = np.array(count_list_full_text_db)
# save_embedding(count_list_full_text_db_array, count_list_full_text_index_list_db, 'count_list_full_text_db', save_path)    

In [None]:
# # Transform the counts and save the array
# transform_count_list_and_save('count_list_tak')
# transform_count_list_and_save('count_list_500')
# transform_count_list_and_save('count_list_full_text')

### PCA

In [None]:
# from sklearn.preprocessing import StandardScaler

# def perform_pca_and_save(original_array_name, n_components):
#     save_path = fpath.embedding_and_pca_folder
#     original_path = os.path.join(save_path, original_array_name + '.npy')
#     pca_file_path = os.path.join(save_path, 'pca_' + original_array_name + str(n_components) + '.npy')
    
#     array = np.load(original_path)
    
#     # Standardizing the Data
#     scaler = StandardScaler()
#     data_scaled = scaler.fit_transform(array)
    
#     # PCA model and fit
#     pca = PCA(n_components=n_components)
#     pca.fit(array)
    
#     # Transform the data
#     transformed_data = pca.transform(array)
    
#     # save the pca
#     np.save(pca_file_path, transformed_data)

In [None]:
# def perform_pca_data_list(data_list, n_components):
#     for data_name in data_list:
#         perform_pca_and_save(data_name, n_components)

In [None]:
# data_list = [
#     'tak_500_embeddings_1000', 
#     'relevant_text_embeddings_1000',
     
#     'tak_embeddings_db', 
#     'relevant_text_embeddings_db', 
    
#     'count_list_tak_1000', 
#     'count_list_500_1000', 
#     'count_list_full_text_1000', 
    
#     'trans_count_list_tak_1000',
#     'trans_count_list_500_1000',
#     'trans_count_list_full_text_1000',
    
#     'count_list_tak_db', 
#     'count_list_500_db', 
#     'count_list_full_text_db', 
    
#     'trans_count_list_tak_db',
#     'trans_count_list_500_db',
#     'trans_count_list_full_text_db'
#     ]
# perform_pca_data_list(data_list, 2)
# perform_pca_data_list(data_list, 3)