Semantic = vector embeddings

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re

In [8]:
df = pd.read_csv('v4_final_training_scopus.csv')

In [9]:
df.head(2)

Unnamed: 0,author_id,Institution,article_id,total_citations,doi,title,abstract,keywords,full name
0,6,Uniwersytet Jagielloński,827305,1,10.34105/j.kmel.2021.13.016,Effects of synchronized and asynchronized e-fe...,Hong Kong Bao Long Accounting And Secretarial ...,"[Synchronized e-feedback interaction,Asynchron...","Indurkhya, Bipin"
1,7,Universität zu Lübeck,149492,1,10.3390/app11114927,Short-term load forecasting using an attended ...,The paper presents a new approach for the pred...,"[Encoder decoder,Recurrent neural network,Shor...","Grzegorzek, Marcin"


Clean text data

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy

In [11]:
def remove_special_characters(text):
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text


In [12]:
def to_lowercase(text):
    return text.lower()

In [13]:
# remove "this" and "that"
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [14]:
def tokenize_text(text):
    return word_tokenize(text)

In [15]:
def stem_text(tokens):
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in tokens]
    return stemmed_words

In [16]:
# Using NLTK
# def lemmatize_text(tokens):
#     lemmatizer = WordNetLemmatizer()
#     lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
#     return lemmatized_words

# Using spaCy (more accurate)
def lemmatize_text_spacy(text):
    # python -m spacy download en_core_web_sm
    # import spacy
    # nlp = spacy.load('en_core_web_sm')
    # print("Model loaded successfully!")
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    doc = nlp(text)
    lemmatized_words = [token.lemma_ for token in doc]
    return lemmatized_words

In [17]:
def clean_and_normalize_text(text, use_spacy=False):
    # Step 1: Remove special characters
    text = remove_special_characters(text)
    
    # Step 2: Convert to lowercase
    text = to_lowercase(text)
    
    # Step 3: Remove stopwords
    text = remove_stopwords(text)
    
    # Step 4: Tokenize
    tokens = tokenize_text(text)
    
    # Step 5: Stem or Lemmatize
    lemmatized_words = lemmatize_text_spacy(' '.join(tokens))

    # Join tokens back into a single string
    cleaned_text = ' '.join(lemmatized_words)
    return cleaned_text

In [33]:
#Example abstract
# abstract = "Hong Kong Bao Long Accounting And Secretarial! We conducted a comparative study to evaluate the efficacy of synchronous and asynchronous interaction modes when providing feedback for improving academic writing, achievement motivation, and critical thinking."

# # Clean and normalize
# cleaned_text = clean_and_normalize_text(abstract, use_spacy=True)
# print(cleaned_text)

hong kong bao long accounting secretarial conduct comparative study evaluate efficacy synchronous asynchronous interaction mode provide feedback improve academic writing achievement motivation critical thinking


In [19]:
# Combine title, abstract, and keywords into a single text
def combine_text(row):
    title = row['title']
    abstract = row['abstract']
    keywords = ' '.join(row['keywords'])  # Convert list of keywords to a single string
    combined_text = f"{title}. {abstract}. {keywords}"
    return combined_text

# Apply to the DataFrame
df['combined_text'] = df.apply(combine_text, axis=1)

# Display the combined text
print(df[['combined_text']])

                                           combined_text
0      Effects of synchronized and asynchronized e-fe...
1      Short-term load forecasting using an attended ...
2      Comparison of 2.4 ghz wifi ftm-and rssi-based ...
3      Recognition of typical locomotion activities b...
4      Sleep stage classification for child patients ...
...                                                  ...
34736  Ultra-broadband and highly sensitive surface p...
34737  A new type of ultra-broadband microstructured ...
34738  Design of Sub wavelength-Grating-Coupled Fano ...
34739  Numerical exploration of external sensing sche...
34740  Design and analysis of a gold-coated dual-core...

[34741 rows x 1 columns]


In [20]:
df.head(1)

Unnamed: 0,author_id,Institution,article_id,total_citations,doi,title,abstract,keywords,full name,combined_text
0,6,Uniwersytet Jagielloński,827305,1,10.34105/j.kmel.2021.13.016,Effects of synchronized and asynchronized e-fe...,Hong Kong Bao Long Accounting And Secretarial ...,"[Synchronized e-feedback interaction,Asynchron...","Indurkhya, Bipin",Effects of synchronized and asynchronized e-fe...


In [21]:
import torch

print(torch.cuda.device_count())
print(torch.cuda.get_device_name())

1
NVIDIA GeForce RTX 3060 Laptop GPU


In [36]:
print(spacy.prefer_gpu())
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

True


In [37]:
def lemmatize_text_spacy(text):
    doc = nlp(text)  # This will use the GPU if enabled
    lemmatized_words = [token.lemma_ for token in doc]
    return lemmatized_words

In [38]:
def clean_and_normalize_text(text, use_spacy=True):
    # Step 1: Remove special characters
    text = remove_special_characters(text)
    
    # Step 2: Convert to lowercase
    text = to_lowercase(text)
    
    # Step 3: Remove stopwords
    text = remove_stopwords(text)
    
    # Step 4: Tokenize
    tokens = tokenize_text(text)
    
    # Step 5: Stem or Lemmatize
    if use_spacy:
        lemmatized_words = lemmatize_text_spacy(' '.join(tokens))
    else:
        lemmatized_words = tokens  # Fallback to tokens if spaCy is not used

    # Join tokens back into a single string
    cleaned_text = ' '.join(lemmatized_words)
    return cleaned_text

# Batch process the 'combined_text' column
batch_size = 50  # Adjust based on your GPU memory
cleaned_texts = []

for i in range(0, len(df), batch_size):
    batch = df['combined_text'][i:i + batch_size].tolist()
    cleaned_batch = [clean_and_normalize_text(text, use_spacy=True) for text in batch]
    cleaned_texts.extend(cleaned_batch)

# Add the cleaned text to the DataFrame
df['cleaned_combined_text'] = cleaned_texts

In [39]:
print(df['cleaned_combined_text'].head(2))

0    effect synchronize asynchronize efeedback inte...
1    shortterm load forecasting use attend sequenti...
Name: cleaned_combined_text, dtype: object


In [41]:
df.head(2)

Unnamed: 0,author_id,Institution,article_id,total_citations,doi,title,abstract,keywords,full name,combined_text,cleaned_combined_text
0,6,Uniwersytet Jagielloński,827305,1,10.34105/j.kmel.2021.13.016,Effects of synchronized and asynchronized e-fe...,Hong Kong Bao Long Accounting And Secretarial ...,"[Synchronized e-feedback interaction,Asynchron...","Indurkhya, Bipin",Effects of synchronized and asynchronized e-fe...,effect synchronize asynchronize efeedback inte...
1,7,Universität zu Lübeck,149492,1,10.3390/app11114927,Short-term load forecasting using an attended ...,The paper presents a new approach for the pred...,"[Encoder decoder,Recurrent neural network,Shor...","Grzegorzek, Marcin",Short-term load forecasting using an attended ...,shortterm load forecasting use attend sequenti...


In [44]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [45]:
def get_bert_embedding(text, tokenizer, model, device):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    
    # Move inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Generate embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling over tokens
    
    return embeddings.cpu().numpy()  # Move embeddings back to CPU and convert to numpy array

# Generate BERT embeddings for all rows
df['bert_embedding'] = df['cleaned_combined_text'].apply(lambda x: get_bert_embedding(x, tokenizer, model, device))

# Print the first few embeddings
print(df['bert_embedding'].head())

  attn_output = torch.nn.functional.scaled_dot_product_attention(


0    [[-0.020778287, 0.25120834, 0.3591713, 0.03110...
1    [[-0.0071619516, 0.091283746, 0.43095434, 0.10...
2    [[-0.040520646, 0.17373112, 0.54977536, -0.088...
3    [[-0.0971694, 0.20765214, 0.5416962, -0.069820...
4    [[-0.19749704, 0.23164165, 0.45177722, -0.0930...
Name: bert_embedding, dtype: object


In [51]:
df.rename(columns={
    'full name':'full_name'
}, inplace=True)

In [52]:
df.head(1)

Unnamed: 0,author_id,Institution,article_id,total_citations,doi,title,abstract,keywords,full_name,combined_text,cleaned_combined_text,bert_embedding
0,6,Uniwersytet Jagielloński,827305,1,10.34105/j.kmel.2021.13.016,Effects of synchronized and asynchronized e-fe...,Hong Kong Bao Long Accounting And Secretarial ...,"[Synchronized e-feedback interaction,Asynchron...","Indurkhya, Bipin",Effects of synchronized and asynchronized e-fe...,effect synchronize asynchronize efeedback inte...,"[[-0.020778287, 0.25120834, 0.3591713, 0.03110..."


# Further processing for Neo4j GDS cosine similarity function

In [19]:
import pandas as pd
test = pd.read_csv('v5_final_training_scopus.csv')

In [3]:
test.head(1)

Unnamed: 0,author_id,Institution,article_id,total_citations,doi,title,abstract,keywords,full_name,combined_text,cleaned_combined_text,bert_embedding
0,6,Uniwersytet Jagielloński,827305,1,10.34105/j.kmel.2021.13.016,Effects of synchronized and asynchronized e-fe...,Hong Kong Bao Long Accounting And Secretarial ...,"[Synchronized e-feedback interaction,Asynchron...","Indurkhya, Bipin",Effects of synchronized and asynchronized e-fe...,effect synchronize asynchronize efeedback inte...,[[-2.07782872e-02 2.51208335e-01 3.59171301e...


In [4]:
print(test['full_name'].isnull().sum())
print(test['Institution'].isnull().sum())


1
81


In [22]:
for embedding in test['bert_embedding'].head(10):
    print(embedding)


[-2.07782872e-02, 2.51208335e-01, 3.59171301e-01, 3.11031342e-02, 1.48762930e-02, -6.71597570e-02, 3.05411994e-01, 2.98144855e-02, -2.65341960e-02, -2.29581267e-01, -9.24098492e-02, -3.47595930e-01, -5.91908954e-02, 2.18936741e-01, -8.31507444e-02, 3.28590721e-01, 1.98517591e-01, -1.39567509e-01, -7.69081637e-02, 2.61502881e-02, 2.82032907e-01, 2.00775206e-01, 6.67389529e-03, 1.96997285e-01, 4.19263422e-01, 9.96832401e-02, -5.66901527e-02, -2.98626810e-01, -2.46209875e-02, -1.92277461e-01, 4.79637474e-01, 3.48747335e-02, -5.41738123e-02, 9.68509316e-02, -1.35142878e-01, -8.49746075e-03, -6.00004010e-02, -2.02361727e-03, -4.04722523e-04, -8.52999836e-02, -2.62100279e-01, -2.34807104e-01, -9.62657258e-02, 2.42163111e-02, -1.13864779e-01, -4.91849959e-01, -4.25626129e-01, 8.68494287e-02, -6.31624311e-02, -2.45949805e-01, -2.34370232e-01, 2.59680063e-01, 3.90838161e-02, -9.82646048e-02, -2.47810125e-01, 3.60958010e-01, 4.92155924e-02, -4.06989127e-01, -1.50018111e-01, -2.08829820e-01, 1.08

In [23]:
# test['bert_embedding'] = test['bert_embedding'].apply(lambda x: x.replace(' ', ',') if pd.notnull(x) else '[]')

In [21]:
import re
import pandas as pd

def clean_embedding(embedding):
    if pd.isnull(embedding):
        return '[]'
    # Replace newlines and multiple spaces with a single space
    cleaned = re.sub(r'\s+', ' ', embedding.strip())
    # Ensure it is a valid JSON array by replacing spaces with commas
    cleaned = cleaned.replace(' ', ',')
    # Remove extra brackets if necessary
    if cleaned.startswith('[') and cleaned.endswith(']'):
        cleaned = cleaned[1:-1]
    # Remove any extra commas
    cleaned = re.sub(r',+', ', ', cleaned)
    cleaned = f'{cleaned}'
    cleaned = cleaned.replace('[, ', '[')
    return cleaned

# Apply the cleaning function to the bert_embedding column
test['bert_embedding'] = test['bert_embedding'].apply(clean_embedding)


In [59]:
test = test.dropna(subset=['full_name', 'Institution'])

In [26]:
# Save the cleaned dataframe to a new CSV file
test.to_csv('v6_final_training_scopus.csv', index=False)

Neo4j expects a clean [..] sqaure brackets and a comma to seperate values for the JSON parser

# Bert embedding consine similarity calculation

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import ast

In [23]:
import numpy as np
bert_embeddings = test['bert_embedding'].apply(lambda x: np.array(ast.literal_eval(x)))

In [24]:
embedding_1 = bert_embeddings.iloc[0]
embedding_2 = bert_embeddings.iloc[1]

similarity = cosine_similarity([embedding_1], [embedding_2])
print(f'Similarity: {similarity[0][0]}')

Similarity: 0.9318686528611834


In [29]:
from sklearn.decomposition import PCA

embeddings = np.vstack(bert_embeddings)
pca = PCA(n_components = 50)
pca_embeddings = pca.fit_transform(embeddings)
test['pca_embedding'] = list(pca_embeddings)
test.to_csv('v7_PCA_final_training_scopus.csv', index=False)


   author_id               Institution  article_id  total_citations  \
0          6  Uniwersytet Jagielloński      827305                1   

                           doi  \
0  10.34105/j.kmel.2021.13.016   

                                               title  \
0  Effects of synchronized and asynchronized e-fe...   

                                            abstract  \
0  Hong Kong Bao Long Accounting And Secretarial ...   

                                            keywords         full_name  \
0  [Synchronized e-feedback interaction,Asynchron...  Indurkhya, Bipin   

                                       combined_text  \
0  Effects of synchronized and asynchronized e-fe...   

                               cleaned_combined_text  \
0  effect synchronize asynchronize efeedback inte...   

                                      bert_embedding  \
0  [-2.07782872e-02, 2.51208335e-01, 3.59171301e-...   

                                       pca_embedding  
0  [0.2245909477109

In [19]:
# since its saved to csv, need to convert it to a list
test = pd.read_csv('v7_PCA_final_training_scopus.csv')


In [14]:
for embedding in test['pca_embedding'].head(3):
    print(embedding)

[ 0.22459095 -1.02234292  1.15186429 -0.26911293 -0.63117372 -0.69475715
 -0.09535893 -0.71158451  0.078894    0.08378371 -0.03151379 -0.14048062
 -0.1145968  -0.14185168  0.32362081 -0.09890457 -0.22665945  0.13610362
  0.01117077  0.09123554 -0.09413617 -0.08726883  0.11567183  0.16099877
 -0.23826552 -0.36195916 -0.25336243 -0.24208371 -0.06386271 -0.11877637
  0.18199437  0.11276205  0.10088152  0.08307612 -0.07520114 -0.05918994
 -0.07662181  0.27485253  0.1081127   0.2620721   0.06691791  0.26337762
 -0.17690343  0.20181255 -0.39149976 -0.21336916  0.02863723  0.05464993
  0.13188978 -0.02979055]
[-0.25959591 -0.0293495   1.31153266 -0.092652   -0.53967925  0.48373697
  0.42899501  0.07655914 -0.20373993 -0.28883177  0.31412267 -0.31802733
 -0.40303154 -0.03755157 -0.01473108 -0.32464714  0.16016947  0.28151775
 -0.00601207  0.08980997  0.35180044  0.07305629  0.24749308  0.15596934
 -0.15689167 -0.24202824 -0.09806507 -0.41608357 -0.23200726 -0.1417604
  0.28880281 -0.12649247 -

In [16]:
# Function to clean and convert string embeddings to numpy arrays
def clean_embedding_pca(embedding):
    if pd.isnull(embedding):
        return np.array([])
    # Remove spaces after brackets and convert to a list
    cleaned = re.sub(r'\[\s+', '[', embedding)
    cleaned = re.sub(r'\s+\]', ']', cleaned)
    cleaned = re.sub(r'(\d)\s+(-?\d)', r'\1, \2', cleaned)
    return np.array(ast.literal_eval(cleaned))



In [17]:
test['pca_embedding'] = test['pca_embedding'].apply(clean_embedding_pca)
pca_embeddings = np.vstack(test['pca_embedding'])


In [59]:
# import ast

# # Function to clean and convert string embeddings to numpy arrays
# def clean_embedding_2(embedding):
#     if pd.isnull(embedding):
#         return []
#     # Use ast.literal_eval to safely evaluate the string as a list
#     return np.array(ast.literal_eval(embedding))

In [18]:
similarity_matrix = cosine_similarity(pca_embeddings)

print("Similarity Matrix:")
print(similarity_matrix)

Similarity Matrix:
[[ 1.          0.43362078  0.1422256  ...  0.2444507  -0.16600406
  -0.1454945 ]
 [ 0.43362078  1.          0.48410892 ...  0.42858874 -0.22826684
   0.03774436]
 [ 0.1422256   0.48410892  1.         ...  0.41147236  0.21449504
   0.41235672]
 ...
 [ 0.2444507   0.42858874  0.41147236 ...  1.          0.11036008
   0.5682801 ]
 [-0.16600406 -0.22826684  0.21449504 ...  0.11036008  1.
   0.59990596]
 [-0.1454945   0.03774436  0.41235672 ...  0.5682801   0.59990596
   1.        ]]


  ret = a @ b


In [86]:
def recommend_articles(article_index, similarity_matrix, df, top_n=5):
    # Get similarity scores for the given article
    similarity_scores = similarity_matrix[article_index]
    
    # Get indices of articles sorted by similarity scores in descending order
    similar_article_indices = similarity_scores.argsort()[::-1]
    
    # Exclude the article itself and select top N similar articles
    similar_article_indices = similar_article_indices[similar_article_indices != article_index][:top_n]
    
    # Print the main article
    main_article = df.loc[df.index == article_index]
    print(f"Main Article:\nTitle: {main_article['title'].values[0]}\nAbstract: {main_article['abstract'].values[0]}\n")
    
    # Print the recommended articles
    print("Recommended Articles:")
    for index in similar_article_indices:
        article = df.loc[df.index == index]
        print(f"Title: {article['title'].values[0]}\nAbstract: {article['abstract'].values[0]}\n")

    return similar_article_indices

# Example: Recommend articles similar to the first article
article_index = 15
recommended_articles = recommend_articles(article_index, similarity_matrix, test)

# print(f"Recommended articles for article {article_index}: {recommended_articles}")


Main Article:
Title: Assessment of imaging protocol and patients radiation exposure in computed tomography colonography
Abstract: In the screening and identifying of colon and rectum malignancy computed tomography colonography (CTC) is a highly effective imaging technique albeit patients receiving a significant effective dose Accordingly patient dose evaluation is an important need seeking to ensure benefits outweigh the projected cancer risk ObjectiveFor CTC procedures carried out in the Radiology Department Medical Imaging Operation Services King Fahad Medical City (KFMC) evaluation is done using the current American College of Radiology (ACR) imaging protocol and concomitant patient-effective doses Study is carried out on a sample size of 55 CTC procedures involving 25 males (45%) and 30 females (55%) The patients were classified as followstwo groups based on CT machine; four groups based on the applied protocol; and three groups based on the procedure results All procedures were ca