# Word embedding and PCA

In [1]:
# import packages
import pandas as pd
import os
import math
import numpy as np
from matplotlib import pyplot as plt
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel
import torch
from sklearn.decomposition import PCA
import numpy as np

In [2]:
# import internal modules
import file_path_management as fpath
import public_library as plib
import parameters as params
import dataframe_columns as df_col



## Predefined functions:

In [3]:
# Load the pretrained BERT tokenizer and encoder once at module level;
# sentence_embedding() reads both of them as globals.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [4]:
def sentence_embedding(sentence):
    """Embed a sentence with BERT by mean-pooling its final-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``. Inputs longer than the
    tokenizer's maximum length are truncated (``truncation=True``).

    Args:
        sentence: A string, or a list of strings to embed as one batch.

    Returns:
        A ``torch.Tensor`` with one row per input sentence (a single row for
        a single string).
    """
    tokens = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        outputs = model(**tokens)
        hidden = outputs.last_hidden_state

        # Weight every token by its attention mask so that the [PAD]
        # positions added when batching sentences of different lengths do
        # not dilute the mean. For a single sentence the mask is all ones,
        # so this reduces to a plain mean over the token axis.
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = (hidden * mask).sum(dim=1) / token_counts

    return pooled

In [5]:
def text_embedding(text):
    """Embed a whole text as the average of its sentence embeddings.

    The text is split into sentences with NLTK's ``sent_tokenize``; each
    sentence is embedded with ``sentence_embedding`` and the results are
    averaged element-wise.

    Args:
        text: The document to embed.

    Returns:
        The mean of the per-sentence embeddings, with the same shape as a
        single ``sentence_embedding`` result.

    Raises:
        ValueError: If no sentence can be extracted from ``text`` (e.g. when
            it is empty).
    """
    # Split the text into sentences
    sentences = sent_tokenize(text)  # You may need to use a more robust sentence tokenizer
    if not sentences:
        # torch.stack() on an empty list fails with an opaque RuntimeError;
        # report the actual cause instead.
        raise ValueError("text_embedding() needs text containing at least one sentence")

    # Tokenize and embed each sentence
    sentence_embeddings = [sentence_embedding(sentence) for sentence in sentences]

    # Average pooling to obtain a single vector for the entire document:
    # stack along a new leading axis, then average that axis away.
    return torch.mean(torch.stack(sentence_embeddings), dim=0)

In [6]:
def complete_sent(sent):
    """Return ``sent`` terminated by sentence-ending punctuation.

    A string whose last character is ``.``, ``?`` or ``!`` is returned
    unchanged; any other string (including the empty one) gets ``.`` appended.
    """
    terminators = ('.', '?', '!')
    return sent if sent.endswith(terminators) else sent + '.'

In [7]:
def get_t_a_k(index):
    """Look up the title, abstract and keywords of one database entry.

    Reads the tab-separated file at ``fpath.poten_litera_db`` (re-read on
    every call) and uses the first row whose ``INDEX`` column equals
    ``index``.

    Args:
        index: Integer identifier to match against the ``INDEX`` column.

    Returns:
        A ``(title, abstract, keywords)`` tuple. Fields that are missing
        (NaN) in the file are returned as empty strings.

    Raises:
        IndexError: If no row has the requested ``INDEX``.
    """
    db_path = fpath.poten_litera_db
    df = pd.read_csv(db_path, header=0, sep="\t")

    # Build the row selector once instead of once per column.
    row_mask = df["INDEX"].astype(int) == index
    if not row_mask.any():
        raise IndexError(f"no row with INDEX == {index!r} in {db_path}")

    def first_value(column):
        # Value of `column` in the first matching row; NaN becomes "".
        value = df.loc[row_mask, column].values[0]
        return "" if pd.isna(value) else value

    title = first_value("TITLE")
    abstract = first_value("ABSTRACT")
    keywords = first_value("KEYWORDS")

    return title, abstract, keywords
# --------------------Start of test code--------------------
# index = 4
# title, abstract, keywords = get_t_a_k(index)
# print(title)
# print(abstract)
# print(keywords)
# ---------------------End of test code---------------------

In [8]:
def get_text(index, title, abstract, keywords):
    """Assemble the texts associated with one database entry.

    Args:
        index: Entry identifier; ``<index>.txt`` is looked up in
            ``fpath.processed_texts_of_length_500_folder`` and in
            ``fpath.relevant_text_folder``.
        title: Title string ("" when missing).
        abstract: Abstract string ("" when missing).
        keywords: Keywords string ("" when missing).

    Returns:
        ``(text_tak_500, text_relevant)``. ``text_tak_500`` is the processed
        title/abstract/keywords text when an abstract exists and that text is
        non-empty, otherwise the processed length-500 file content ("" if the
        file does not exist). ``text_relevant`` is the processed content of
        the relevant-text file, or "" if the file does not exist.
    """
    txt_file_name = str(index) + ".txt"
    txt_500_path = os.path.join(fpath.processed_texts_of_length_500_folder, txt_file_name)
    text_relevant_path = os.path.join(fpath.relevant_text_folder, txt_file_name)

    def load_processed(path):
        # A missing file simply contributes an empty text.
        if not os.path.exists(path):
            return ""
        with open(path, "r", encoding='ascii') as f:
            raw = f.read()
        return plib.process_text(raw, lower=True)

    # Title/abstract/keywords text: only built when an abstract is available;
    # each non-empty field is closed with punctuation and followed by a space.
    text_tak = ""
    if abstract != "":
        fields = [part for part in (title, abstract, keywords) if part != ""]
        joined = "".join(complete_sent(part) + " " for part in fields)
        text_tak = plib.process_text(joined, lower=True)

    text_500 = load_processed(txt_500_path)
    text_relevant = load_processed(text_relevant_path)

    # Prefer the title/abstract/keywords text; otherwise fall back to the
    # length-500 text (which is "" when that file is absent).
    text_tak_500 = text_tak if text_tak != "" else text_500

    return text_tak_500, text_relevant
# --------------------Start of test code--------------------
# index = 0
# title, abstract, keywords = get_t_a_k(index)
# text_tak_500, text_relevant = get_text(index, title, abstract, keywords)
# print(text_tak_500)
# print(text_relevant)
# ---------------------End of test code---------------------

## Main program:

In [9]:
# # Check if my words are within the BERT vocabulary
# db_path = fpath.poten_litera_db
# df = pd.read_csv(db_path, header=None, sep=",")
# df.columns = df_col.db_columns

# for ind in df.index:
#     index = int(df.at[ind, "INDEX"])
    
#     txt_file_name = str(index) + ".txt"
#     txt_500_path = os.path.join(fpath.processed_texts_of_length_500_folder, txt_file_name)
    
#     if os.path.exists(txt_500_path):
#         with open(txt_500_path, "r", encoding='ascii') as f:
#             txt_500 = f.read()
        
#         sentences = sent_tokenize(txt_500)
#         for sentence in sentences:
#             words_to_check = tokenizer(txt_500)
#             print(words_to_check)

#             # Check if each word is in the BERT vocabulary
#             for word in words_to_check:
#                 if word in tokenizer.get_vocab():
#                     # print(f"'{word}' is in the BERT vocabulary.")
#                     pass
#                 else:
#                     print(word)

### Text embedding

In [10]:
# test_1000 = fpath.poten_litera_testing_set_1000_labeled
# df_1000 = pd.read_csv(test_1000, header=0, sep=",")

# # tak embeddings and index list
# tak_embeddings = []
# tak_embeddings_index_list = []

# # relevant text embeddings and index list
# relevant_text_embeddings = []
# relevant_text_embeddings_index_list = []

# for ind in df_1000.index:
#     # get the text
#     index = int(df_1000.at[ind, "INDEX"])
#     title, abstract, keywords = get_t_a_k(index)
#     text_tak_500, text_relevant = get_text(index, title, abstract, keywords)
    
#     if text_tak_500 == text_tak_500 and text_tak_500 != "":
#         tak_embedding = text_embedding(text_tak_500)
#         tak_embeddings.append(tak_embedding)
#         tak_embeddings_index_list.append(index)
        
#     if text_relevant == text_relevant and text_relevant != "":
#         relevant_text_embedding = text_embedding(text_relevant)
#         relevant_text_embeddings.append(relevant_text_embedding)
#         relevant_text_embeddings_index_list.append(index)

# # Save the results
# datasets_folder = fpath.datasets_folder

# # Save the tak embeddings
# data_array_tak_embeddings = np.array(tak_embeddings)
# np.save(os.path.join(datasets_folder, 'tak_embeddings.npy'), data_array_tak_embeddings)    # .npy extension is added if not given
# with open(os.path.join(datasets_folder, 'tak_embeddings_index_list.txt'), 'w') as f:
#     for item in tak_embeddings_index_list:
#         f.write("%s\n" % item)

# # Save the relevant text embeddings     
# data_array_relevant_text_embeddings = np.array(relevant_text_embeddings)
# np.save(os.path.join(datasets_folder, 'relevant_text_embeddings.npy'), data_array_relevant_text_embeddings)    # .npy extension is added if not given
# with open(os.path.join(datasets_folder, 'relevant_text_embeddings_index_list.txt'), 'w') as f:
#     for item in relevant_text_embeddings_index_list:
#         f.write("%s\n" % item)

### Keywords count transformation

In [11]:
# Iterate the rows of poten_litera_db_kw_count and collect, for every text
# source (TAK, 500, full text), one vector of keyword counts per row, ordered
# like key_list, together with the INDEX of the row it came from
input_path = fpath.poten_litera_db_kw_count
df = pd.read_csv(input_path, header=0, sep='\t')

key_list = list(params.ranking_kw_groups.keys())

count_list_tak, count_list_tak_index_list = [], []
count_list_500, count_list_500_index_list = [], []
count_list_full_text, count_list_full_text_index_list = [], []

# One entry per text source:
# (column probed for a missing value, column suffix, count vectors, row indices)
count_sources = (
    ("MACAQUE_COUNT_IN_TAK", "_COUNT_IN_TAK",
     count_list_tak, count_list_tak_index_list),
    ("MACAQUE_COUNT_IN_500", "_COUNT_IN_500",
     count_list_500, count_list_500_index_list),
    ("MACAQUE_COUNT_IN_FULL_TEXT", "_COUNT_IN_FULL_TEXT",
     count_list_full_text, count_list_full_text_index_list),
)

for ind in df.index:
    index = int(df.at[ind, "INDEX"])

    for probe_column, suffix, vectors, index_list in count_sources:
        # NaN is the only value that is not equal to itself, so this keeps
        # only the rows that actually have counts for this source.
        if df.at[ind, probe_column] == df.at[ind, probe_column]:
            index_list.append(index)
            vectors.append([int(df.at[ind, key + suffix]) for key in key_list])

# Save the results
datasets_folder = fpath.datasets_folder

data_array_count_list_tak = np.array(count_list_tak)
data_array_count_list_500 = np.array(count_list_500)
data_array_count_list_full_text = np.array(count_list_full_text)

# One entry per text source: (array file, index-list file, array, row indices)
count_outputs = (
    ('count_list_tak.npy', 'count_list_tak_index_list.txt',
     data_array_count_list_tak, count_list_tak_index_list),
    ('count_list_500.npy', 'count_list_500_index_list.txt',
     data_array_count_list_500, count_list_500_index_list),
    ('count_list_full_text.npy', 'count_list_full_text_index_list.txt',
     data_array_count_list_full_text, count_list_full_text_index_list),
)

for array_file, index_file, data_array, index_list in count_outputs:
    np.save(os.path.join(datasets_folder, array_file), data_array)
    # One row index per line, in the same order as the rows of the array.
    with open(os.path.join(datasets_folder, index_file), 'w') as f:
        f.writelines("%s\n" % item for item in index_list)

In [12]:
def transform_and_save_array(input_file):
    """Log-scale a saved count array, save the result and return it.

    Loads ``<input_file>.npy`` from the module-level ``datasets_folder``,
    applies ``log10(min(x + 1, 10))`` element-wise (so 0 maps to 0 and every
    value of 9 or more maps to 1) and writes the result to
    ``trans_<input_file>.npy`` in the same folder.

    Args:
        input_file: Name of the array file, without the ``.npy`` extension.

    Returns:
        The transformed array that was written to disk.
    """
    input_file_path = os.path.join(datasets_folder, input_file+'.npy')
    output_file_path = os.path.join(datasets_folder, 'trans_'+input_file + '.npy')

    # Load the array from the input .npy file
    array = np.load(input_file_path)

    # Apply the transformation: cap at 10, then take the base-10 logarithm
    transformed_array = np.log(np.minimum(array + 1, 10)) / np.log(10)

    # Save the transformed array to the output .npy file
    np.save(output_file_path, transformed_array)

    # Callers assign the result, so hand the array back instead of None.
    return transformed_array

In [13]:
# Transform the counts and save the arrays; transform_and_save_array writes
# each result as 'trans_<name>.npy' inside datasets_folder.
# NOTE(review): the third variable is named trans_count_full_text, without
# the "list_" part used by the other two -- confirm before relying on it.
trans_count_list_tak = transform_and_save_array('count_list_tak')
trans_count_list_500 = transform_and_save_array('count_list_500')
trans_count_full_text = transform_and_save_array('count_list_full_text')

### PCA

In [14]:
from sklearn.preprocessing import StandardScaler

def perform_pca(original_array_name, pca_array_name, n_components):
    """Standardize a saved array, project it with PCA and save the projection.

    Loads ``<original_array_name>.npy`` from the module-level
    ``datasets_folder``, standardizes it with ``StandardScaler``, fits a PCA
    with ``n_components`` components on the standardized data and writes the
    projected data to ``<pca_array_name>.npy`` in the same folder.

    Args:
        original_array_name: Name of the input array file, without ``.npy``.
        pca_array_name: Name of the output array file, without ``.npy``.
        n_components: Number of principal components to keep.

    Returns:
        The projected array that was written to disk.
    """
    original_path = os.path.join(datasets_folder, original_array_name + '.npy')
    array = np.load(original_path)

    # Standardizing the Data: PCA itself only centers its input, so without
    # this step the features with the largest raw variance dominate the
    # components.
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(array)

    # Fit on, and project, the standardized data rather than the raw array;
    # otherwise the scaling above has no effect.
    pca = PCA(n_components=n_components)
    pca.fit(data_scaled)

    # Transform the data
    transformed_data = pca.transform(data_scaled)

    # save the pca
    pca_file_path = os.path.join(datasets_folder, pca_array_name + '.npy')
    np.save(pca_file_path, transformed_data)

    return transformed_data

In [15]:
# # PCA on the tak_embeddings
# perform_pca('tak_embeddings', 'pca_tak_embeddings_2', 2)
# perform_pca('tak_embeddings', 'pca_tak_embeddings_3', 3)

# # PCA on the relevant_text_embeddings
# perform_pca('relevant_text_embeddings', 'pca_relevant_text_embeddings_2', 2)
# perform_pca('relevant_text_embeddings', 'pca_relevant_text_embeddings_3', 3)

# Every PCA run as (input array name, output array name, number of components),
# executed in the order listed.
# NOTE(review): the outputs for trans_count_list_full_text are named
# 'trans_pca_...' whereas the other transformed arrays use 'pca_trans_...';
# the names are kept as they are -- confirm which one downstream code expects.
pca_jobs = (
    # raw keyword counts
    ('count_list_tak', 'pca_count_list_tak_2', 2),
    ('count_list_tak', 'pca_count_list_tak_3', 3),
    ('count_list_500', 'pca_count_list_500_2', 2),
    ('count_list_500', 'pca_count_list_500_3', 3),
    ('count_list_full_text', 'pca_count_list_full_text_2', 2),
    ('count_list_full_text', 'pca_count_list_full_text_3', 3),
    # transformed keyword counts
    ('trans_count_list_tak', 'pca_trans_count_list_tak_2', 2),
    ('trans_count_list_tak', 'pca_trans_count_list_tak_3', 3),
    ('trans_count_list_500', 'pca_trans_count_list_500_2', 2),
    ('trans_count_list_500', 'pca_trans_count_list_500_3', 3),
    ('trans_count_list_full_text', 'trans_pca_count_list_full_text_2', 2),
    ('trans_count_list_full_text', 'trans_pca_count_list_full_text_3', 3),
)

for source_array_name, target_array_name, component_count in pca_jobs:
    perform_pca(source_array_name, target_array_name, component_count)