# Word embedding and PCA

In [None]:
# import packages
import pandas as pd
import os
import numpy as np
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel
import torch
from sklearn.decomposition import PCA

In [None]:
# import internal modules
import file_path_management as fpath
import public_library as plib
import parameters as params

## Predefined functions:

In [None]:
# Load the pretrained BERT tokenizer and model for the checkpoint named below.
# Both are kept as module-level globals: `bert_tokenizer` is used by the
# helpers in this file for token counting and encoding, and `bert_model` for
# computing the hidden states that are pooled into embeddings.
model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

In [None]:
def complete_sent(sent):
    """Return ``sent`` guaranteed to end with sentence-final punctuation.

    A string that already ends with ``.``, ``?`` or ``!`` is returned
    unchanged; any other string (including the empty string) gets a single
    ``.`` appended.
    """
    terminators = ('.', '?', '!')
    return sent if sent.endswith(terminators) else sent + '.'
# # --test--
# text = "distribution of the dopamine innervation in the macaque and human thalamus,"
# complete_sentence = complete_sent(text)
# print(complete_sentence)
# # --test--

In [None]:
def split_text_into_short_segments(text, max_tokens=400):
    """Split ``text`` into segments made of whole sentences.

    The text is normalised with ``plib.process_text`` and split into
    sentences with ``sent_tokenize``. Sentences are then packed greedily:
    a sentence is added to the current segment as long as the segment,
    tokenised with ``bert_tokenizer``, does not exceed ``max_tokens`` tokens.

    Args:
        text: The raw text to split.
        max_tokens: Maximum number of BERT tokens allowed per segment.

    Returns:
        A list of non-empty segment strings, in document order. A single
        sentence that is longer than ``max_tokens`` on its own is not split
        further and is returned as its own (over-long) segment.
    """
    text = plib.process_text(text, lower=True)

    segments = []
    current_segment = ""

    for sentence in sent_tokenize(text):
        sentence = complete_sent(sentence)
        # Token count the segment would have if this sentence were added.
        candidate = (current_segment + " " + sentence).strip()
        if len(bert_tokenizer.tokenize(candidate)) > max_tokens:
            # Limit exceeded: flush what has been collected so far. The
            # current segment is still empty when the very first sentence
            # is already too long; never emit an empty segment in that case.
            if current_segment:
                segments.append(current_segment.strip())
            current_segment = sentence
        else:
            current_segment = candidate

    # Flush the trailing segment.
    if current_segment:
        segments.append(current_segment.strip())

    return segments
# # --test--
# text = "distribution of the dopamine innervation in the macaque and human thalamus. fax: +34 91 497 53 15. we recently defined the thalamic dopaminergic system in primates; it arises from numerous dopaminergic cell groups and selectively targets numerous thalamic nuclei. given the central position of the thalamus in subcortical and cortical interplay, and the functional relevance of dopamine neuromodulation in the brain, detailing dopamine distribution in the thalamus should supply important information. to this end we performed immunohistochemistry for dopamine and the dopamine transporter in the thalamus of macaque monkeys and humans to generate maps, in the stereotaxic coronal plane, of the distribution of dopaminergic axons. the dopamine innervation of the thalamus follows the same pattern in both species and is most dense in midline limbic nuclei, the mediodorsal and lateral posterior association nuclei, and in the ventral lateral and ventral anterior motor nuclei. this distribution suggests that thalamic dopamine has a prominent role in emotion, attention, cognition and complex somatosensory and visual processing, as well as in motor control. most thalamic dopaminergic axons are thin and varicose and target both the neuropil and small blood vessels, suggesting that, besides neuronal modulation, thalamic dopamine may have a direct influence on microcirculation. the maps provided here should be a useful reference in future experimental and neuroimaging studies aiming at clarifying the role of the thalamic dopaminergic system in health and in conditions involving brain dopamine, including parkinsons disease, drug addiction and schizophrenia. 
keywords dopamine thalamus monkey human primate dopamine transporter parkinson schizophrenia addiction introduction the thalamus is made up of multiple nuclei relaying information from subcortical centers or from other cortices to the cerebral cortex (sherman and guillery, 2005), as well as the striatum, the nucleus accumbens and the amygdala (steriade et al., 1997). in addition to specific subcortical and cortical afferents, the primate thalamus receives axons containing the neuromodulators acetylcholine (heckers et al., 1992), histamine (manning et al., 1996), serotonin (morrison and foote, 1986; lavoie and parent, 1991), and the catecholamines adrenaline (rico and cavada, 1998a), noradrenaline (morrison and foote, 1986; ginsberg et al., 1993) and dopamine (snchez-gonzlez et al., 2005). until recently, the existence of significant dopamine innervation in the primate thalamus has been largely ignored, probably because dopamine innervation of the rodent thalamus is very scant (groenewegen, 1988; papadopoulos and parnavelas, 1990). however, fragmentary data scattered through the literature endorse the presence of dopamine innervation in the primate thalamus. postmortem biochemical studies showed the presence of dopamine in the thalamus of macaques (brown et al., 1979; goldman-rakic and brown, 1981; pifl et al., 1990, 1991) and human subjects (oke and adams, 1987). later, receptor binding and in situ hybridization analyses detected the presence of dopamine d2-like (joyce et al., 1991; kessler et al., 1993; hall et al., 1996; langer et al., 1999; rieck et al., 2004) and d3-like receptors (gurevich and joyce, 1999) in several human thalamic nuclei. 
positron emission tomography (pet) radioligand studies have also demonstrated the presence of the dopamine transporter (dat) (wang et al., 1995; halldin et al., 1996; helfenbein et al., 1999; brownell et al., 2003) and of d2-like receptors (farde et al., 1997; langer et al., 1999; okubo et al., 1999; brownell et al., 2003; rieck et al., 2004) in the human and macaque thalamus. in the course of pet studies focusing on schizophrenia, d2- and d3-like radioligand binding was also found in the thalamus of control subjects (talvik et al., 2003; yasuno et al., 2004). finally, an immunohistochemical study using anti-dat antibodies detected the presence of dopaminergic axons in the mediodorsal nucleus (md) of the macaque thalamus (melchitzky and lewis, 2001)."
# segments = split_text_into_short_segments(text, max_tokens=400)
# for i, segment in enumerate(segments):
#     print(i, segment)
# # --test--

In [None]:
def embed_text_segment(text_segment):
    """Embed one text segment with BERT using mean pooling.

    The segment is encoded with ``bert_tokenizer`` (truncation enabled) and
    passed through ``bert_model`` without gradient tracking. The returned
    tensor is the mean of ``last_hidden_state`` over dimension 1; no
    attention mask is applied, so every position the tokenizer produced
    contributes to the mean.

    Args:
        text_segment: The text to embed.

    Returns:
        A ``torch.Tensor`` holding the pooled embedding.
    """
    encoded = bert_tokenizer(text_segment, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)

    return pooled
# # --test--
# text_segment = "distribution of the dopamine innervation in the macaque and human thalamus. fax: +34 91 497 53 15. we recently defined the thalamic dopaminergic system in primates; it arises from numerous dopaminergic cell groups and selectively targets numerous thalamic nuclei. given the central position of the thalamus in subcortical and cortical interplay, and the functional relevance of dopamine neuromodulation in the brain, detailing dopamine distribution in the thalamus should supply important information."
# text_segment_embedding = embed_text_segment(text_segment)
# print(text_segment_embedding.shape)
# print(text_segment_embedding)
# # --test--

In [None]:
def embedd_text(text):
    """Embed a whole document as the mean of its segment embeddings.

    The text is normalised with ``plib.process_text``, cut into segments of
    at most 400 tokens by ``split_text_into_short_segments``, and every
    segment is embedded with ``embed_text_segment``. The segment embeddings
    are stacked and averaged with equal weight per segment.

    Args:
        text: The document text.

    Returns:
        A ``torch.Tensor`` with the document embedding.

    Note:
        ``torch.stack`` does not accept an empty list, so a text that yields
        no segments raises instead of returning an embedding.
    """
    cleaned_text = plib.process_text(text, lower=True)
    segments = split_text_into_short_segments(cleaned_text, max_tokens=400)

    # One embedding per segment, in document order.
    segment_vectors = [embed_text_segment(segment) for segment in segments]

    # Unweighted average over the segment axis gives the document vector.
    stacked = torch.stack(segment_vectors)
    return torch.mean(stacked, dim=0)
# # --test--
# text = "distribution of the dopamine innervation in the macaque and human thalamus. fax: +34 91 497 53 15. we recently defined the thalamic dopaminergic system in primates; it arises from numerous dopaminergic cell groups and selectively targets numerous thalamic nuclei. given the central position of the thalamus in subcortical and cortical interplay, and the functional relevance of dopamine neuromodulation in the brain, detailing dopamine distribution in the thalamus should supply important information."
# text_embedding = embedd_text(text)
# print(text_embedding.shape)
# print(text_embedding)
# # --test--

In [None]:
def get_t_a_k(index):
    """Look up title, abstract and keywords for one database entry.

    The tab-separated database at ``fpath.poten_litera_db`` is read on every
    call, and the first row whose ``INDEX`` column (cast to ``int``) equals
    ``index`` is used.

    Args:
        index: Integer identifier to match against the ``INDEX`` column.

    Returns:
        A ``(title, abstract, keywords)`` tuple. Missing (NaN) values are
        returned as empty strings.

    Raises:
        IndexError: If no row has the requested ``INDEX``.
    """
    db_path = fpath.poten_litera_db
    df = pd.read_csv(db_path, header=0, sep="\t")

    # Build the row selector once instead of once per column.
    row_mask = df["INDEX"].astype(int) == index
    if not row_mask.any():
        raise IndexError(f"INDEX {index} not found in {db_path}")

    raw_values = [
        df.loc[row_mask, column].values[0]
        for column in ("TITLE", "ABSTRACT", "KEYWORDS")
    ]

    # Empty cells are read as NaN; normalise them to empty strings.
    title, abstract, keywords = (
        "" if pd.isna(value) else value for value in raw_values
    )

    return title, abstract, keywords
# # --------------------Start of test code--------------------
# index = 2
# title, abstract, keywords = get_t_a_k(index)
# print(title)
# print(abstract)
# print(keywords)
# # ---------------------End of test code---------------------

In [None]:
def get_text(index, title, abstract, keywords):
    """Assemble the texts available for the entry ``index``.

    Two texts are produced:

    * the "tak/500" text: title + abstract + keywords (each completed with
      ``complete_sent`` and followed by a space) when the abstract is not
      empty; otherwise the content of ``<index>.txt`` in
      ``fpath.processed_texts_of_length_500_folder`` if that file exists;
      otherwise ``""``.
    * the "relevant" text: the content of ``<index>.txt`` in
      ``fpath.relevant_text_folder`` if that file exists, otherwise ``""``.

    Every non-empty text is passed through ``plib.process_text`` with
    ``lower=True``.

    Args:
        index: Entry identifier; determines the ``<index>.txt`` file name.
        title: Title string, ``""`` if unavailable.
        abstract: Abstract string, ``""`` if unavailable.
        keywords: Keywords string, ``""`` if unavailable.

    Returns:
        A ``(text_tak_500, text_relevant)`` tuple of strings.
    """
    def _read_processed(path):
        # A missing file simply means "no text of this kind".
        if not os.path.exists(path):
            return ""
        with open(path, "r", encoding='ascii') as handle:
            raw = handle.read()
        return plib.process_text(raw, lower=True)

    file_name = str(index) + ".txt"
    path_500 = os.path.join(fpath.processed_texts_of_length_500_folder, file_name)
    path_relevant = os.path.join(fpath.relevant_text_folder, file_name)

    # Title/abstract/keywords text is only built when an abstract exists.
    text_tak = ""
    if abstract != "":
        present_fields = [field for field in (title, abstract, keywords) if field != ""]
        text_tak = "".join(complete_sent(field) + " " for field in present_fields)
        text_tak = plib.process_text(text_tak, lower=True)

    text_500 = _read_processed(path_500)
    text_relevant = _read_processed(path_relevant)

    # Prefer the title/abstract/keywords text; fall back to the 500-length text.
    if text_tak != "":
        return text_tak, text_relevant
    return text_500, text_relevant
# # --------------------Start of test code--------------------
# index = 0
# title, abstract, keywords = get_t_a_k(index)
# text_tak_500, text_relevant = get_text(index, title, abstract, keywords)
# print(text_tak_500)
# print(text_relevant)
# # ---------------------End of test code---------------------

## Main program:

Q: How does BERT deal with words it has not seen before? <br>

A: BERT does not provide word-level representations, but subword representations. This implies that when an unseen word is presented to BERT, it will slice it into multiple subwords, even reaching character subwords if needed. That is how it deals with unseen words. Therefore, BERT can handle out-of-vocabulary words. Some other questions and answers in this site can help you with the implementation details of BERT's subword tokenization, e.g. this, this or this.

### Text embedding for testing_set_1000.csv

In [None]:
# Embed the texts of every entry listed in the labeled testing set and save
# the embeddings together with the INDEX values they belong to.
test_1000 = fpath.poten_litera_testing_set_1000_labeled
df_1000 = pd.read_csv(test_1000, header=0, sep=",")

# tak embeddings and index list
tak_embeddings = []
tak_embeddings_index_list = []

# relevant text embeddings and index list
relevant_text_embeddings = []
relevant_text_embeddings_index_list = []

for ind in df_1000.index:
    # get the text
    index = int(df_1000.at[ind, "INDEX"])
    # NOTE: get_t_a_k re-reads the literature database on every call.
    title, abstract, keywords = get_t_a_k(index)
    text_tak_500, text_relevant = get_text(index, title, abstract, keywords)

    # The self-comparison filters out NaN; empty texts are skipped as well.
    if text_tak_500 == text_tak_500 and text_tak_500 != "":
        tak_embeddings.append(embedd_text(text_tak_500))
        tak_embeddings_index_list.append(index)

    if text_relevant == text_relevant and text_relevant != "":
        relevant_text_embeddings.append(embedd_text(text_relevant))
        relevant_text_embeddings_index_list.append(index)


def _save_embeddings(folder, stem, embeddings, index_list):
    """Save ``embeddings`` to ``<stem>.npy`` and ``index_list`` to
    ``<stem>_index_list.txt`` (one index per line) inside ``folder``.

    Returns the NumPy array that was written.
    """
    if embeddings:
        # Stack the tensors in torch and convert once, instead of letting
        # NumPy convert a Python list of tensors element by element.
        data_array = torch.stack(embeddings).detach().cpu().numpy()
    else:
        # Nothing was embedded: keep the empty-array result of np.array([]).
        data_array = np.array(embeddings)

    np.save(os.path.join(folder, stem + '.npy'), data_array)
    with open(os.path.join(folder, stem + '_index_list.txt'), 'w') as f:
        f.writelines("%s\n" % item for item in index_list)

    return data_array


# Save the results
datasets_folder = fpath.datasets_folder

# Save the tak embeddings
data_array_tak_embeddings = _save_embeddings(
    datasets_folder, 'tak_embeddings', tak_embeddings, tak_embeddings_index_list
)

# Save the relevant text embeddings
data_array_relevant_text_embeddings = _save_embeddings(
    datasets_folder,
    'relevant_text_embeddings',
    relevant_text_embeddings,
    relevant_text_embeddings_index_list,
)

### Text embedding for testing_set_1000.csv

### Keywords count transformation

In [None]:
# # Iterate the rows of poten_litera_db_kw_count and perform a function on the number of keywords in each row
# input_path = fpath.poten_litera_db_kw_count
# df = pd.read_csv(input_path, header=0, sep='\t')

# key_list = list(params.ranking_kw_groups.keys())

# count_list_tak = []
# count_list_tak_index_list = []
# count_list_500 = []
# count_list_500_index_list = []
# count_list_full_text = []
# count_list_full_text_index_list = []

# for ind in df.index:
#     index = int(df.at[ind, "INDEX"])
#     # print(index)
    
#     count_tak = []
#     count_500 = []
#     count_full_text = []
    
#     if df.at[ind, "MACAQUE_COUNT_IN_TAK"] == df.at[ind, "MACAQUE_COUNT_IN_TAK"]:
#         count_list_tak_index_list.append(index)
#         for key in key_list:
#             count_tak.append(int(df.at[ind, key+"_COUNT_IN_TAK"]))
#         count_list_tak.append(count_tak)
    
#     if df.at[ind, "MACAQUE_COUNT_IN_500"] == df.at[ind, "MACAQUE_COUNT_IN_500"]:
#         count_list_500_index_list.append(index)
#         for key in key_list:
#             count_500.append(int(df.at[ind, key+"_COUNT_IN_500"]))
#         count_list_500.append(count_500)
                
#     if df.at[ind, "MACAQUE_COUNT_IN_FULL_TEXT"] == df.at[ind, "MACAQUE_COUNT_IN_FULL_TEXT"]:
#         count_list_full_text_index_list.append(index)
#         for key in key_list:
#             count_full_text.append(int(df.at[ind, key+"_COUNT_IN_FULL_TEXT"]))
#         count_list_full_text.append(count_full_text)
    
# # Save the results
# datasets_folder = fpath.datasets_folder

# # Save the count_list_tak
# data_array_count_list_tak = np.array(count_list_tak)
# np.save(os.path.join(datasets_folder, 'count_list_tak.npy'), data_array_count_list_tak)    # .npy extension is added if not given
# with open(os.path.join(datasets_folder, 'count_list_tak_index_list.txt'), 'w') as f:
#     for item in count_list_tak_index_list:
#         f.write("%s\n" % item)

# # Save the count_list_500
# # for l in count_list_500:
# #     print(l)    
# data_array_count_list_500 = np.array(count_list_500)
# # print(count_list_500)
# np.save(os.path.join(datasets_folder, 'count_list_500.npy'), data_array_count_list_500)    # .npy extension is added if not given
# with open(os.path.join(datasets_folder, 'count_list_500_index_list.txt'), 'w') as f:
#     for item in count_list_500_index_list:
#         f.write("%s\n" % item)
        
# # Save the count_list_full_text     
# data_array_count_list_full_text = np.array(count_list_full_text)
# np.save(os.path.join(datasets_folder, 'count_list_full_text.npy'), data_array_count_list_full_text)    # .npy extension is added if not given
# with open(os.path.join(datasets_folder, 'count_list_full_text_index_list.txt'), 'w') as f:
#     for item in count_list_full_text_index_list:
#         f.write("%s\n" % item)

In [None]:
# def transform_and_save_array(input_file):
#     input_file_path = os.path.join(datasets_folder, input_file+'.npy')
#     output_file_path = os.path.join(datasets_folder, 'trans_'+input_file + '.npy')
    
#     # Load the array from the input .npy file
#     array = np.load(input_file_path)

#     # Apply the transformation
#     transformed_array = np.log(np.minimum(array + 1, 10)) / np.log(10)

#     # Save the transformed array to the output .npy file
#     np.save(output_file_path, transformed_array)

In [None]:
# # Transform the counts and save the array
# trans_count_list_tak = transform_and_save_array('count_list_tak')
# trans_count_list_500 = transform_and_save_array('count_list_500')
# trans_count_full_text = transform_and_save_array('count_list_full_text')

### PCA

In [None]:
from sklearn.preprocessing import StandardScaler

def perform_pca(original_array_name, pca_array_name, n_components):
    """Run PCA on a saved array and save the projected data.

    Loads ``<original_array_name>.npy`` from ``fpath.datasets_folder``, fits
    a ``PCA`` with ``n_components`` components on it and writes the
    transformed data to ``<pca_array_name>.npy`` in the same folder. The
    data is not standardised before the PCA.

    Args:
        original_array_name: File name (without ``.npy``) of the input array.
        pca_array_name: File name (without ``.npy``) for the PCA output.
        n_components: Number of principal components to keep.

    Returns:
        None. The result is only written to disk.
    """
    # Resolve the folder here so the function does not depend on a
    # notebook-level variable that is only set when another cell has run.
    folder = fpath.datasets_folder
    array = np.load(os.path.join(folder, original_array_name + '.npy'))

    # PCA expects (n_samples, n_features); flatten any extra trailing axes
    # so arrays with more than two dimensions can be used as well.
    if array.ndim > 2:
        array = array.reshape(array.shape[0], -1)

    # Fit the PCA and project the same data in one step.
    transformed_data = PCA(n_components=n_components).fit_transform(array)

    # save the pca
    np.save(os.path.join(folder, pca_array_name + '.npy'), transformed_data)

In [None]:
# # PCA on the tak_embeddings
# perform_pca('tak_embeddings', 'pca_tak_embeddings_2', 2)
# perform_pca('tak_embeddings', 'pca_tak_embeddings_3', 3)

# # PCA on the relevant_text_embeddings
# perform_pca('relevant_text_embeddings', 'pca_relevant_text_embeddings_2', 2)
# perform_pca('relevant_text_embeddings', 'pca_relevant_text_embeddings_3', 3)

# PCA on the raw and the transformed keyword-count arrays.
# Each entry is (input array name, output name prefix); every input is
# projected to 2 and then to 3 components, and the number of components is
# appended to the output prefix.
# NOTE: the last entry uses the prefix 'trans_pca_' whereas the other
# transformed arrays use 'pca_trans_'. It is kept as-is so the names of the
# output files do not change.
pca_jobs = [
    ('count_list_tak', 'pca_count_list_tak'),
    ('count_list_500', 'pca_count_list_500'),
    ('count_list_full_text', 'pca_count_list_full_text'),
    ('trans_count_list_tak', 'pca_trans_count_list_tak'),
    ('trans_count_list_500', 'pca_trans_count_list_500'),
    ('trans_count_list_full_text', 'trans_pca_count_list_full_text'),
]

for source_name, output_prefix in pca_jobs:
    for n_components in (2, 3):
        perform_pca(source_name, output_prefix + '_' + str(n_components), n_components)