### Extracting embeddings of all topics, determining neighbour topics and distances 

In [1]:
# Libraries for handling the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Tensorflow embeddings projector
import subprocess

# To calculated the euclidean distances between topics
from math import sqrt


In [2]:
# Loading datasets 
# Note: csv files can be found on pCloud
Science1900_2023 = pd.read_csv('./data/Science1900_2023.csv', index_col=0)  
topic_proba_data_Science1900_2023 = pd.read_csv('./data/topic_proba_data_Science1900_2023.csv',index_col=0)  

topified_vectorized_Science1900_2023 = pd.read_csv('./data/topified_vectorized_Science1900_2023.csv', index_col=0)  
topified_vectorized_Science1900_2023['fieldsOfStudy']= Science1900_2023['fieldsOfStudy']
topified_vectorized_Science1900_2023['s2FieldsOfStudy']= Science1900_2023['s2FieldsOfStudy']
topified_vectorized_Science1900_2023 = topified_vectorized_Science1900_2023.drop(['title','abstract','paperId','externalIds', 'publicationVenue', 'publicationTypes'], axis=1)
topified_vectorized_Science1900_2023['publicationDate'] = pd.to_datetime(topified_vectorized_Science1900_2023['publicationDate']) 
topified_vectorized_Science1900_2023['year'] = pd.to_datetime(topified_vectorized_Science1900_2023['year'], format ='%Y') 

# Exracting embeddings
embeddings_df = pd.DataFrame()
embeddings_df['x_vector'] = topified_vectorized_Science1900_2023['x_vector']
embeddings_df['y_vector'] = topified_vectorized_Science1900_2023['y_vector']
embeddings_df['z_vector'] = topified_vectorized_Science1900_2023['z_vector']
embeddings_df.to_csv('./data/embeddings.tsv', sep='\t',header=False, index=False)

# Extracting labels
topic_labels_df = pd.DataFrame()
topic_labels_df['label'] = topified_vectorized_Science1900_2023['topic_list']
topic_labels_df.reset_index(drop=True, inplace= True)
topic_labels_df.index += 1

# The saved file is for tensorflow projector and nothing else
topic_labels_df.to_csv('./data/labels.tsv', sep='\t',header=False, index=False)

In [28]:
# Uncomment if tensorflow projector is required
# # Visualization of the embeddings using tensorflow projector
# config = """
# embeddings {
#   tensor_path: "embeddings.tsv"
#   metadata_path: "labels.tsv"
# }
# """
# with open("./data/projector_config.pbtxt", "w") as f:
#     f.write(config)

# logdir = "./data"  # Directory containing embeddings, metadata, and projector_config.pbtxt
# # Launch TensorBoard with the specified log directory
# subprocess.Popen(["tensorboard", f"--logdir={logdir}"])

# # commands to kill running instances of the projector
# # sudo lsof -i -n -P
# # kill -9 PID
# # tensorboard_process.terminate() # This doesn't work even if you create the variable

In [3]:
# Expanding topic_labels_df to include nearest neighbors and their distances

# Filtering unclassified topics out
topified_vectorized_Science1900_2023 = topified_vectorized_Science1900_2023[ (topified_vectorized_Science1900_2023['topic_code'] >= 0)]

# Creating groups by topic code to model each topic separately

# Grouping by topic code and extracting embeddings to calculate the distances
grouped_by_topic_code_including_distances = topified_vectorized_Science1900_2023.groupby('topic_code')[['x_vector', 'y_vector', 'z_vector']].apply(lambda x: x.values.tolist()).reset_index()

# This group contains the labels
grouped_by_topic_code_including_topic_labels = topified_vectorized_Science1900_2023.groupby('topic_code')['topic_list'].agg(list).reset_index()

# Calculating centroids for each topic
length=len(grouped_by_topic_code_including_distances[0])
topic_centroids = []
topic_labels = []
for row in range(length):
    topic_centroids.append(np.mean(grouped_by_topic_code_including_distances[0][row],axis=0))
    topic_labels.append(grouped_by_topic_code_including_topic_labels['topic_list'][row][0])

# Creating df's for labels and centroids
topic_labels_df = pd.DataFrame(topic_labels,columns=['topic_labels'])
topic_centroids_df = pd.DataFrame(topic_centroids, columns=['x','y','z'])

# Saving to a file
topic_centroids_df.to_csv('./data/topic_centroids.tsv', sep='\t',header=False, index=False)

# Calculating the distances between topics ----------------------------------------------------------------------------------------------------------------------

topic_centroids_array =topic_centroids_df.to_numpy()
length_centroids = len(topic_centroids_array)
#n_neighbors=10

# Initiating the columns
for k in range(length_centroids-1):
        topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
        topic_labels_df[f'n_nb_distance_'+ str(k)] = 0

# Looping over all topics, calculating the distances and adding to topic_labels_df
for z in range(length_centroids):
    centroid_distances = []
    for i in range(length_centroids):
        #calculate euclidean distances between the vectors per centroid
        
        result = sqrt(
        (topic_centroids_array[z][0]-topic_centroids_array[i][0])**2+
        (topic_centroids_array[z][1]-topic_centroids_array[i][1])**2+
        (topic_centroids_array[z][2]-topic_centroids_array[i][2])**2)
        centroid_distances.append([i,result])
    #print(centroid_distances)
    #sort centroid_distances accendingly and keeping the topic_code as index
    centroid_distances_sorted_list = sorted(centroid_distances, key=lambda x: x[1])
    centroid_distances_sorted_df = pd.DataFrame(centroid_distances_sorted_list)
   # centroid_distances_sorted_df = pd.concat(centroid_distances_sorted_df)
    # print(centroid_distances_sorted_df)
    for k in range(length_centroids-1):
       # print('topic_labels: ', centroid_distances_sorted_df.iloc[k,0])
        topic_labels_df.iloc[z,2*k+1] = centroid_distances_sorted_df.iloc[k,0]
        #topic_labels_df.iloc[z,2*k+1] = centroid_distances_sorted_df.iloc[k,0]
        topic_labels_df.iloc[z,2*k+2] = centroid_distances_sorted_df.iloc[k,1]

# Expanding the df and saving into a csv file instead of tsv
topic_labels_df.to_csv('./data/topic_labels.csv', index=False)

  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
  topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
  topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
  topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
  topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
  topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
  topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
  topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
  topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
  topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
  topic_labels_df[f'n_nb_topic_'+ str(k)] = 0
  topic_labels_df[f'n_nb_distance_'+ str(k)] = 0
 

In [31]:
topic_labels_df

Unnamed: 0,topic_labels,n_nb_topic_0,n_nb_distance_0,n_nb_topic_1,n_nb_distance_1,n_nb_topic_2,n_nb_distance_2,n_nb_topic_3,n_nb_distance_3,n_nb_topic_4,...,n_nb_topic_112,n_nb_distance_112,n_nb_topic_113,n_nb_distance_113,n_nb_topic_114,n_nb_distance_114,n_nb_topic_115,n_nb_distance_115,n_nb_topic_116,n_nb_distance_116
0,0_species_soil_plant_plants,0,0,67,2.016517,77,2.086345,4,2.453928,7,...,61,7.598980,39,7.697668,44,7.747544,69,7.777791,57,7.844189
1,1_coli_escherichia_escherichia coli_strains,1,0,94,0.861683,98,1.495111,49,1.562618,97,...,80,8.631868,72,8.633318,68,8.794184,17,8.801515,33,8.839814
2,2_chromatography_liquid_liquid chromatography_...,2,0,77,0.806960,41,1.409208,67,1.479810,97,...,39,7.412360,57,7.435924,66,7.456084,69,7.604247,33,7.704082
3,3_political_politics_language_social,3,0,59,0.725362,5,1.325051,11,1.371638,51,...,44,7.984457,73,8.003182,24,8.007140,53,8.023838,45,8.081010
4,4_der_die_und_von,4,0,23,1.802350,55,1.841656,56,1.931493,51,...,69,7.786374,64,7.882692,44,7.916049,66,7.916175,57,8.006525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,113_channel_codes_channels_capacity,113,0,35,0.380656,72,0.606432,81,1.074211,56,...,21,9.260380,53,9.775409,73,9.809199,45,9.986404,24,10.065113
114,114_language_tasks_models_language models,114,0,81,0.927958,51,1.024840,72,1.252507,56,...,21,8.222646,73,8.687122,53,8.717820,24,8.841850,45,8.873300
115,115_acuity_visual_visual acuity_myopia,115,0,39,0.979359,84,1.065278,71,1.079725,69,...,15,8.734962,109,8.838273,27,9.264592,80,9.313633,17,9.760175
116,116_cells_axonal_astrocytes_neurons,116,0,43,0.502005,93,0.830149,100,0.831901,104,...,109,7.169085,15,7.485941,27,7.680585,80,8.054589,17,8.357058
