# Word embedding and PCA

In [1]:
# import packages
import pandas as pd
import os
import math
import numpy as np
from matplotlib import pyplot as plt
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel
import torch
from sklearn.decomposition import PCA
import numpy as np

In [2]:
# import internal modules
import file_path_management as fpath
import public_library as plib
import parameters as params
import dataframe_columns as df_col



In [3]:
# Load the pretrained BERT tokenizer and the matching BERT model.
# Both are fetched by name via `from_pretrained`; the functions below read
# `tokenizer` and `model` as module-level globals.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

## Predefined functions:

In [4]:
def sentence_embedding(sentence):
    """Embed a sentence with BERT by mean-pooling its token vectors.

    The input is tokenized with the module-level ``tokenizer`` (with
    truncation and padding enabled) and run through the module-level
    ``model`` with gradient tracking disabled.  The last hidden states are
    then averaged over dim 1, weighting every position by the tokenizer's
    attention mask so that padding positions never contribute to the mean.

    Args:
        sentence: Text accepted by ``tokenizer`` (a single string in this
            notebook; a list of strings also works thanks to the masking).

    Returns:
        torch.Tensor: One pooled vector per input sequence, i.e. the
        result keeps the leading batch axis of ``last_hidden_state``.
    """
    tokens = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        outputs = model(**tokens)

    hidden_states = outputs.last_hidden_state

    # Broadcast the mask over the hidden axis.  For a single sentence the
    # mask is all ones, so this reduces to the plain mean over dim 1.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    # clamp guards against a division by zero for an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1)

    return summed / token_counts

In [5]:
def text_embedding(text):
    """Embed a whole text as the average of its sentence embeddings.

    The text is split into sentences with ``sent_tokenize``; every sentence
    is embedded with ``sentence_embedding`` and the results are averaged.

    Args:
        text: The document text to embed.

    Returns:
        torch.Tensor: The element-wise mean of the per-sentence embeddings;
        it has the same shape as one ``sentence_embedding`` result.
    """
    # Split the text into sentences
    sentences = sent_tokenize(text)  # You may need to use a more robust sentence tokenizer

    if not sentences:
        # An empty text yields no sentences, and torch.stack([]) would
        # raise.  Embed the raw text instead so the caller still receives
        # a vector of the usual shape.
        sentences = [text]

    # Embed each sentence separately
    per_sentence = [sentence_embedding(sentence) for sentence in sentences]

    # Average pooling to obtain a single vector for the entire document
    return torch.mean(torch.stack(per_sentence), dim=0)

## Main program:

In [6]:
# # Check if my words are within the BERT vocabulary
# db_path = fpath.poten_litera_db
# df = pd.read_csv(db_path, header=None, sep=",")
# df.columns = df_col.db_columns

# for ind in df.index:
#     index = int(df.at[ind, "INDEX"])
    
#     txt_file_name = str(index) + ".txt"
#     txt_500_path = os.path.join(fpath.processed_texts_of_length_500_folder, txt_file_name)
    
#     if os.path.exists(txt_500_path):
#         with open(txt_500_path, "r", encoding='ascii') as f:
#             txt_500 = f.read()
        
#         sentences = sent_tokenize(txt_500)
#         for sentence in sentences:
#             words_to_check = tokenizer(txt_500)
#             print(words_to_check)

#             # Check if each word is in the BERT vocabulary
#             for word in words_to_check:
#                 if word in tokenizer.get_vocab():
#                     # print(f"'{word}' is in the BERT vocabulary.")
#                     pass
#                 else:
#                     print(word)

In [7]:
def get_text(index, title, abstract, keywords):
    """Pick the longer of two processed texts available for one record.

    Two candidates are built:

    * ``text_tak``: title, abstract and keywords concatenated (each followed
      by one space), skipping any field that is missing.
    * ``text_500``: the content of ``<index>.txt`` inside
      ``fpath.processed_texts_of_length_500_folder``, if that file exists;
      otherwise the empty string.

    Both candidates are passed through ``plib.process_text(..., lower=True)``
    and the one with more whitespace-separated words is returned.  On a tie
    ``text_tak`` wins.

    Args:
        index: Record identifier; used to build the ``<index>.txt`` file name.
        title: Title of the record, or a missing value (NaN/None/NA).
        abstract: Abstract of the record, or a missing value.
        keywords: Keywords of the record, or a missing value.

    Returns:
        The selected processed text (may be empty when nothing is available,
        assuming ``plib.process_text`` maps ``""`` to ``""`` -- TODO confirm).

    Raises:
        UnicodeDecodeError: If the text file contains non-ASCII bytes (the
            file is opened with ``encoding='ascii'``).
    """
    txt_500_path = os.path.join(
        fpath.processed_texts_of_length_500_folder, f"{index}.txt"
    )

    # pd.notna covers NaN, None and pd.NA alike; the former `x == x` test let
    # None through (and then failed on string concatenation).  Formatting
    # through an f-string also tolerates non-string cell values.
    present_fields = (
        field for field in (title, abstract, keywords) if pd.notna(field)
    )
    text_tak = "".join(f"{field} " for field in present_fields)
    text_tak = plib.process_text(text_tak, lower=True)

    text_500 = ""
    if os.path.exists(txt_500_path):
        with open(txt_500_path, "r", encoding='ascii') as f:
            text_500 = f.read()
        text_500 = plib.process_text(text_500, lower=True)

    # Prefer whichever candidate carries more words; ties go to text_tak.
    if len(text_tak.split()) >= len(text_500.split()):
        return text_tak
    return text_500

### Text embedding

In [8]:
# Read the literature database.  The CSV is loaded with header=None, so the
# column names are assigned afterwards from df_col.db_columns.
db_path = fpath.poten_litera_db
df = pd.read_csv(db_path, header=None, sep=",")
df.columns = df_col.db_columns

# One document embedding per database row, kept in row order.
text_embeddings = []

# Walk the four relevant columns in lockstep instead of looking each cell up
# by row label.
record_fields = zip(df["INDEX"], df["TITLE"], df["ABSTRACT"], df["KEYWORDS"])
for raw_index, title, abstract, keywords in record_fields:
    text = get_text(int(raw_index), title, abstract, keywords)
    text_embeddings.append(text_embedding(text))

print(len(text_embeddings))
print(text_embeddings)

### Keywords count transformation

In [None]:
# Iterate the rows of poten_litera_db_kw_count and, for every row, collect
# the keyword counts found in the 500-word text together with a transformed
# version of those counts.
input_path = fpath.poten_litera_db_kw_count
df = pd.read_csv(input_path, header=0, sep=',')
df.columns = df_col.db_count_columns

key_list = list(params.ranking_kw_groups.keys())

# One inner list per row, one entry per keyword group (same order as key_list).
count_500_list = []
trans_count_500_list = []
# NOTE: the "<key>_COUNT_IN_FULL_TEXT" columns are currently not processed.

for ind in df.index:
    # Raw counts of each keyword group in the 500-word text
    count_500 = [df.at[ind, key + "_COUNT_IN_500"] for key in key_list]

    # Transform the counts: cap (count + 1) at 4 and take the base-4
    # logarithm, so 0 maps to 0.0 and every count >= 3 maps to 1.0.
    trans_count_500 = [math.log(min(count + 1, 4), 4) for count in count_500]

    # Store the row results; without this both result lists stay empty.
    count_500_list.append(count_500)
    trans_count_500_list.append(trans_count_500)

print(count_500_list)
print(trans_count_500_list)

### PCA

In [None]:
# Define the number of components you want to keep (e.g., 2 for 2D PCA)
n_components = 2

# Create PCA models (each fit_transform call below refits the model, so the
# same two objects can be reused for every dataset)
pca_2 = PCA(n_components=2)
pca_3 = PCA(n_components=3)

# dataset folder
datasets_folder = fpath.datasets_folder

# PCA on the embeddings
# PCA needs a 2-D (n_samples, n_features) matrix.  Each stored embedding may
# carry a leading singleton batch axis, so flatten every one to 1-D before
# stacking them row-wise.
data_array_embeddings = np.stack(
    [np.asarray(embedding).reshape(-1) for embedding in text_embeddings]
)
embeddings_dim_2 = pca_2.fit_transform(data_array_embeddings)
embeddings_dim_3 = pca_3.fit_transform(data_array_embeddings)

# Save the results (the 2-D result goes to its own file; it used to be
# written to the dim_3 file name and was overwritten right away)
np.save(os.path.join(datasets_folder, 'pca_embeddings_dim_2.npy'), embeddings_dim_2)    # .npy extension is added if not given
np.save(os.path.join(datasets_folder, 'pca_embeddings_dim_3.npy'), embeddings_dim_3)

# PCA on the count_500
data_array_count_500 = np.array(count_500_list)
count_500_dim_2 = pca_2.fit_transform(data_array_count_500)
count_500_dim_3 = pca_3.fit_transform(data_array_count_500)

# Save the results of the PCA
np.save(os.path.join(datasets_folder, 'pca_count_500_dim_2.npy'), count_500_dim_2)    # .npy extension is added if not given
np.save(os.path.join(datasets_folder, 'pca_count_500_dim_3.npy'), count_500_dim_3)

# PCA on the trans_count_500
data_array_trans_count_500 = np.array(trans_count_500_list)
trans_count_500_dim_2 = pca_2.fit_transform(data_array_trans_count_500)
trans_count_500_dim_3 = pca_3.fit_transform(data_array_trans_count_500)

# Save the results of the PCA
np.save(os.path.join(datasets_folder, 'pca_trans_count_500_dim_2.npy'), trans_count_500_dim_2)    # .npy extension is added if not given
np.save(os.path.join(datasets_folder, 'pca_trans_count_500_dim_3.npy'), trans_count_500_dim_3)

# # PCA on the count_full_text
# data_array_count_full_text = np.array(count_full_text_list)
# count_full_text_dim_2 = pca_2.fit_transform(data_array_count_full_text)
# count_full_text_dim_3 = pca_3.fit_transform(data_array_count_full_text)

# # Save the results of the PCA
# np.save(os.path.join(datasets_folder, 'pca_count_full_text_dim_2.npy'), count_full_text_dim_2)    # .npy extension is added if not given
# np.save(os.path.join(datasets_folder, 'pca_count_full_text_dim_3.npy'), count_full_text_dim_3)

# # PCA on the trans_count_full_text
# data_array_trans_count_full_text = np.array(trans_count_full_text_list)
# trans_count_full_text_dim_2 = pca_2.fit_transform(data_array_trans_count_full_text)
# trans_count_full_text_dim_3 = pca_3.fit_transform(data_array_trans_count_full_text)

# # Save the results of the PCA
# np.save(os.path.join(datasets_folder, 'pca_trans_count_full_text_dim_2.npy'), trans_count_full_text_dim_2)    # .npy extension is added if not given
# np.save(os.path.join(datasets_folder, 'pca_trans_count_full_text_dim_3.npy'), trans_count_full_text_dim_3)