**Data Prepocessing**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize
import numpy as np
import nltk
from nltk.corpus import stopwords
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

nltk.download('stopwords')

# Load SimLex999 data
simlex_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/SimLex-999.txt", delimiter='\t')

# Load Brown Corpus data
brown_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/brown.csv')

print("Sample of Brown Corpus data before preprocessing:")
print(brown_data[['tokenized_text']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Sample of Brown Corpus data before preprocessing:
                                      tokenized_text
0  Furthermore , as an encouragement to revisioni...
1  The Unitarian clergy were an exclusive club of...
2  Ezra Stiles Gannett , an honorable representat...
3  Even so , Gannett judiciously argued , the Ass...
4  We today are not entitled to excoriate honest ...


In [None]:
# Function for text preprocessing
def preprocess_text(text):
    # Remove punctuation and convert to lowercase
    tokens = [word.lower() for word in text.split() if word.isalpha()]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    return tokens

# Apply text preprocessing to each sentence in the Brown Corpus
brown_data['preprocessed_text'] = brown_data['tokenized_text'].apply(preprocess_text)

# Display the dataset after preprocessing
print("\nAfter Preprocessing:")
print(brown_data['preprocessed_text'].head())



After Preprocessing:
0    [furthermore, encouragement, revisionist, thin...
1    [unitarian, clergy, exclusive, club, cultivate...
2    [ezra, stiles, gannett, honorable, representat...
3    [even, gannett, judiciously, argued, associati...
4    [today, entitled, excoriate, honest, men, beli...
Name: preprocessed_text, dtype: object


**Using GloVe**

In [None]:
import os

glove_dir='/content/drive/MyDrive/Colab Notebooks'

embeddings_index={}
f=open(os.path.join(glove_dir,'glove.6B.100d.txt'),encoding="utf8")
for line in f:
  values=line.split()
  word=values[0]
  coefs=np.asarray(values[1:], dtype='float32')
  embeddings_index[word]=coefs
f.close()

print('%s word  vectors'%len(embeddings_index))

# embeddings_index['good']

400000 word  vectors


In [None]:
# Create word embeddings for Brown Corpus using GloVe
embedding_dim = 100  # Assuming you are using GloVe with 100-dimensional vectors

def get_embedding(word):
    return embeddings_index.get(word, np.zeros(embedding_dim))

brown_data['word_embeddings'] = brown_data['preprocessed_text'].apply(lambda words: [get_embedding(word) for word in words])

# Display a few entries to verify the embeddings
print(brown_data[['preprocessed_text', 'word_embeddings']].head())

                                   preprocessed_text  \
0  [furthermore, encouragement, revisionist, thin...   
1  [unitarian, clergy, exclusive, club, cultivate...   
2  [ezra, stiles, gannett, honorable, representat...   
3  [even, gannett, judiciously, argued, associati...   
4  [today, entitled, excoriate, honest, men, beli...   

                                     word_embeddings  
0  [[-0.158, 0.15186, -0.098325, 0.31868, -0.0931...  
1  [[0.14219, 0.68774, -0.84726, -0.027145, -0.41...  
2  [[0.30293, 0.33103, -1.1076, -0.1375, -0.35773...  
3  [[-0.15308, 0.63194, 0.65512, -0.30706, -0.239...  
4  [[-0.19939, 0.37846, 0.52093, 0.28347, -0.1898...  


In [None]:
print(len(brown_data['preprocessed_text'][0]))

print(np.vstack(brown_data['word_embeddings'][0]).shape)


print(len(brown_data['preprocessed_text'][1]))
print(np.vstack(brown_data['word_embeddings'][1]).shape)


print(len(brown_data['preprocessed_text'][55160]))
print(np.vstack(brown_data['word_embeddings'][55160]).shape)

total_rows = len(brown_data['word_embeddings'])
print("Total number of rows in 'word_embeddings':", total_rows)

print(type(np.vstack(brown_data['word_embeddings'][0]).shape[0]))

14
(14, 100)
16
(16, 100)
9
(9, 100)
Total number of rows in 'word_embeddings': 57340
<class 'int'>


**Rearranging the Data Frame**

In [None]:
#THIS CODE TOOK 17MINUTES TO RUN, SO I HAVE SAVED IT AS CSV

# import pandas as pd

# # Assuming your original DataFrame is called brown_data
# # Create an empty DataFrame for the rearranged format
# rearranged_data = pd.DataFrame(columns=['preprocessed_text', 'word_embeddings'])

# # Iterate through each row in the original DataFrame
# for _, row in brown_data.iterrows():
#     preprocessed_text = row['preprocessed_text']
#     word_embeddings = row['word_embeddings']

#     # Create a temporary DataFrame for the current row
#     temp_df = pd.DataFrame({'preprocessed_text': preprocessed_text, 'word_embeddings': word_embeddings})

#     # Concatenate the temporary DataFrame with the rearranged_data DataFrame
#     rearranged_data = pd.concat([rearranged_data, temp_df], ignore_index=True)

# rearranged_data.to_csv('/content/drive/MyDrive/Colab Notebooks/rearranged_data.csv', index=False)

# # Drop duplicates based on 'preprocessed_text'
# rearranged_data = rearranged_data.drop_duplicates(subset=['preprocessed_text'])

# duplicates = rearranged_data['preprocessed_text'].duplicated()
# print("Duplicates in 'preprocessed_text':", duplicates.any())

In [None]:
rrearranged_data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/token.csv')

# Drop duplicates based on 'preprocessed_text'
rrearranged_data = rrearranged_data.drop_duplicates(subset=['preprocessed_text'])

duplicates = rrearranged_data['preprocessed_text'].duplicated()
print("Duplicates in 'rrrrrpreprocessed_text':", duplicates.any())

# Assuming your new dataset is named 'new_dataset' and has a column 'preprocessed_text'
rrearranged_data['word_embeddings'] = rrearranged_data['preprocessed_text'].apply(lambda text: [get_embedding(word) for word in text.split()] if isinstance(text, str) else [])


# Assuming 'word_embeddings' column is already populated in your DataFrame
rrearranged_data['word_embeddings'] = rrearranged_data['word_embeddings'].apply(lambda embeddings_list: np.concatenate(embeddings_list) if embeddings_list else [])

# Assuming 'rearranged_data' is your DataFrame
rrearranged_data = rrearranged_data.drop(columns=['Unnamed: 1'])

rrearranged_data.head()

Duplicates in 'rrrrrpreprocessed_text': False


Unnamed: 0,preprocessed_text,word_embeddings
0,furthermore,"[-0.158, 0.15186, -0.098325, 0.31868, -0.09317..."
1,encouragement,"[0.72701, 0.58038, -0.12419, -0.027612, 0.4040..."
2,revisionist,"[0.12806, 0.86505, 0.16681, 0.21417, -0.25038,..."
3,thinking,"[0.56354, 0.28124, 0.62225, 0.2077, -0.47417, ..."
4,manifestly,"[0.38598, -0.14472, -0.15539, 0.12179, -0.1323..."


In [None]:
print(np.vstack(rrearranged_data['word_embeddings'][19225]).shape)

print((rrearranged_data['preprocessed_text'][19225]))
print((rrearranged_data['preprocessed_text'][19225:19226]))

(100, 1)
identified
116407    NaN
Name: preprocessed_text, dtype: object


**Removing the NULL values if any**

In [None]:
# Check for NaN values in the DataFrame
nan_rows = rrearranged_data[rrearranged_data.isnull().any(axis=1)]

# Drop rows with NaN values
rrearranged_data = rrearranged_data.dropna()

# Display the rows with NaN values (if any)
print("Rows with NaN values:")
print(nan_rows)

# Display the DataFrame after dropping NaN values
print("DataFrame after dropping NaN values:")
print(rrearranged_data.head())


Rows with NaN values:
       preprocessed_text word_embeddings
116407               NaN              []
DataFrame after dropping NaN values:
  preprocessed_text                                    word_embeddings
0       furthermore  [-0.158, 0.15186, -0.098325, 0.31868, -0.09317...
1     encouragement  [0.72701, 0.58038, -0.12419, -0.027612, 0.4040...
2       revisionist  [0.12806, 0.86505, 0.16681, 0.21417, -0.25038,...
3          thinking  [0.56354, 0.28124, 0.62225, 0.2077, -0.47417, ...
4        manifestly  [0.38598, -0.14472, -0.15539, 0.12179, -0.1323...


In [None]:
print((rrearranged_data.loc[:, 'preprocessed_text']))

0           furthermore
1         encouragement
2           revisionist
3              thinking
4            manifestly
              ...      
509226    pharmacopoeia
509229     redefinition
509231              usp
509279          mussett
509291       equipotent
Name: preprocessed_text, Length: 40099, dtype: object


**Resetting the index values as duplicates have been removed**

In [None]:
rrearranged_data = rrearranged_data.reset_index(drop=True)

# Print the modified DataFrame
print(rrearranged_data.head())
print((rrearranged_data.loc[:, 'preprocessed_text']))

  preprocessed_text                                    word_embeddings
0       furthermore  [-0.158, 0.15186, -0.098325, 0.31868, -0.09317...
1     encouragement  [0.72701, 0.58038, -0.12419, -0.027612, 0.4040...
2       revisionist  [0.12806, 0.86505, 0.16681, 0.21417, -0.25038,...
3          thinking  [0.56354, 0.28124, 0.62225, 0.2077, -0.47417, ...
4        manifestly  [0.38598, -0.14472, -0.15539, 0.12179, -0.1323...
0          furthermore
1        encouragement
2          revisionist
3             thinking
4           manifestly
             ...      
40094    pharmacopoeia
40095     redefinition
40096              usp
40097          mussett
40098       equipotent
Name: preprocessed_text, Length: 40099, dtype: object


**Processing Cosine Similarity in Chunks**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

chunk_size = 5000
upto = 40099

# Calculate cosine similarity matrix in chunks
num_rows = min(upto, len(rrearranged_data))
cosine_sim_matrix = np.zeros((num_rows, num_rows))

for i in range(0, num_rows, chunk_size):
    subset_embeddings = rrearranged_data['word_embeddings'][i:i + chunk_size]
    subset_embeddings_array = np.vstack(subset_embeddings)
    chunk_cosine_sim_matrix = cosine_similarity(subset_embeddings_array)
    cosine_sim_matrix[i:i + chunk_size, i:i + chunk_size] = chunk_cosine_sim_matrix

# Optionally, you can convert the similarity matrix to a DataFrame for better visualization
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=range(num_rows), columns=range(num_rows))

# Display the cosine similarity DataFrame for the subset
print(rrearranged_data['preprocessed_text'][:num_rows])
print(cosine_sim_df)



0          furthermore
1        encouragement
2          revisionist
3             thinking
4           manifestly
             ...      
40094    pharmacopoeia
40095     redefinition
40096              usp
40097          mussett
40098       equipotent
Name: preprocessed_text, Length: 40099, dtype: object
          0         1         2         3         4         5         6      \
0      1.000000  0.247114 -0.015581  0.417657  0.161974  0.386030  0.373484   
1      0.247114  1.000000  0.131388  0.348723  0.068426  0.230121  0.272027   
2     -0.015581  0.131388  1.000000  0.225604  0.139131  0.079546  0.134561   
3      0.417657  0.348723  0.225604  1.000000  0.036609  0.373480  0.540338   
4      0.161974  0.068426  0.139131  0.036609  1.000000  0.184054  0.143845   
...         ...       ...       ...       ...       ...       ...       ...   
40094  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
40095  0.000000  0.000000  0.000000  0.000000  0.000000  0.00

**Performed KMeans clustering for K=20**

In [None]:
from sklearn.cluster import KMeans

# Assuming n_clusters is the number of clusters you want
n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Process the clustering in chunks
chunk_size = 15000
num_rows = len(rrearranged_data)

# Initialize an empty array to store cluster labels
all_cluster_labels = np.zeros(num_rows)

# Iterate through the data in chunks
for i in range(0, num_rows, chunk_size):
    subset_cosine_sim_matrix = cosine_sim_matrix[i:i + chunk_size, :]

    # Fit the KMeans model on the subset of the cosine similarity matrix
    subset_clusters = kmeans.fit_predict(subset_cosine_sim_matrix)

    # Assign cluster labels to the corresponding rows in the overall array
    all_cluster_labels[i:i + chunk_size] = subset_clusters

# Add the overall cluster labels to the DataFrame
rrearranged_data['cluster'] = all_cluster_labels.astype(int)


# Display the clusters and their representative words
for cluster_label in range(n_clusters):
    cluster_words = rrearranged_data.loc[rrearranged_data['cluster'] == cluster_label, 'preprocessed_text']
    print(f"\nCluster {cluster_label + 1}: {', '.join(cluster_words)}")






Cluster 1: sanhedrin, diffusing, excoriate, unitarianism, frothingham, trinitarians, unitarians, heaves, escutcheon, insurgence, bartol, stratagems, liberality, apostates, loins, shabbily, cherishing, incontestable, iconoclasm, commonplaces, rephrased, polities, libertines, cherubim, seraphim, archangels, recollect, remorseless, copiously, historicity, unreality, separateness, comprehended, shintoism, koinonia, chumminess, indwelling, agreeableness, bifocals, bifocal, conceives, nondiscriminatory, superimpose, impelled, carload, reapportioned, imputed, volumetric, imputation, steichen, calderone, heedless, bestial, wifely, demeans, saps, robs, maleness, embezzle, schillinger, henpecked, shrewish, threshhold, reik, primly, flapper, banister, knifelike, feint, heaved, feebly, newel, panting, wister, lamming, wastrel, ached, allegoric, fruitfully, individualizing, dramatizing, jaggers, fascinates, terrifies, onslaughts, apologetically, leitmotif, interrelationships, magwitch, pip, bogy, 

In [None]:
# # print(rrearranged_data.head())
rrearranged_data[rrearranged_data['preprocessed_text'].str.contains('good', case=False, na=False)]

Unnamed: 0,preprocessed_text,word_embeddings,cluster
3168,good,"[-0.030769, 0.11993, 0.53909, -0.43696, -0.739...",16
4014,goods,"[0.33874, 0.14341, 0.024038, -0.23155, 0.54294...",17
9196,goodwill,"[0.32148, 0.093939, 0.08593, 0.094394, 0.50016...",4
9718,goodwin,"[0.050618, -0.12776, -0.13475, -0.32542, 0.038...",12
9760,hapgood,"[0.24582, -0.44046, -0.14635, -0.38284, -0.723...",13
12337,goodbye,"[0.49707, 0.23149, 0.40713, -0.45075, -0.19791...",10
13717,lovingood,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
15984,goodness,"[0.17267, 0.481, 1.317, -0.19581, -0.1213, 0.5...",12
19992,goodis,"[0.0035193, -0.58247, -0.41975, 0.36637, 0.426...",8
27302,goodnight,"[0.11086, -0.036782, 0.57787, 0.82565, -0.6038...",19


In [None]:
rrearranged_data[rrearranged_data['preprocessed_text'].str.contains('excellent', case=False, na=False)]

Unnamed: 0,preprocessed_text,word_embeddings,cluster
2422,excellent,"[-0.2816, 0.18427, -0.06755, 0.27694, -0.06677...",11
27774,excellently,"[-0.81937, 0.187, -0.34208, -0.26881, -0.01664...",15


In [None]:
rrearranged_data[rrearranged_data['preprocessed_text'].str.contains('forest', case=False, na=False)]

Unnamed: 0,preprocessed_text,word_embeddings,cluster
3166,forests,"[-1.3829, 0.7772, 0.86109, 0.62111, 0.5058, 0....",6
4402,forestall,"[-0.19051, -0.30195, -0.29571, 0.35827, -0.385...",7
16952,forest,"[-0.51682, 0.49154, 0.66964, 0.40753, 0.034442...",4
21869,forestry,"[-0.38465, -0.16046, -0.28902, 0.27635, -0.118...",10
31317,deforest,"[0.066769, -0.33179, -0.3891, -0.31418, -0.310...",16


In [None]:
rrearranged_data[rrearranged_data['preprocessed_text'].str.contains('bad', case=False, na=False)]

Unnamed: 0,preprocessed_text,word_embeddings,cluster
648,bad,"[0.39456, -0.24717, 1.0319, -0.61444, -1.2376,...",16
5298,badly,"[-0.45475, 0.70261, 0.26121, -0.38866, -0.4388...",3
10553,badrawi,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
10807,badges,"[0.16794, -0.22451, -0.57482, -0.80525, 0.3255...",15
16732,badness,"[-0.18008, -0.241, 0.51216, 0.093723, -0.39503...",5
16975,badlands,"[-0.069283, 0.03581, 1.2034, -0.55422, 0.13133...",14
21697,bads,"[0.288, 0.033761, -0.30746, -0.15146, -0.25293...",2
24250,forbad,"[0.043189, -0.4039, -0.5416, 0.29391, -0.03824...",2
25749,badmen,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",19
25873,badinage,"[0.1731, -0.23663, -0.12306, 0.0049184, -0.323...",6


**Calculating the similarity scores based on cluster average similarity**

In [None]:
# Calculate average similarity within each cluster
cluster_avg_similarity = pd.DataFrame(index=range(n_clusters), columns=range(n_clusters))

for i in range(n_clusters):
    for j in range(n_clusters):
        cluster_rows = rrearranged_data[rrearranged_data['cluster'] == i].index
        cluster_cols = rrearranged_data[rrearranged_data['cluster'] == j].index
        cluster_avg_similarity.loc[i, j] = cosine_sim_matrix[np.ix_(cluster_rows, cluster_cols)].mean()

# Display average similarity within each cluster
print("Average Similarity within Clusters:")
print(cluster_avg_similarity)

# Function to get average similarity for a word pair
def get_average_similarity(word1, word2):
    try:
        cluster_label1 = rrearranged_data.loc[rrearranged_data['preprocessed_text'] == word1, 'cluster'].values[0]
        cluster_label2 = rrearranged_data.loc[rrearranged_data['preprocessed_text'] == word2, 'cluster'].values[0]
    except IndexError:
        # Handle the case where one or both words are not found in rrearranged_data
        # You can choose a default value or strategy, for example, returning the average similarity of the entire dataset
        return cosine_sim_matrix.mean().mean()

    # If the words are in the same cluster, return the average similarity within that cluster
    if cluster_label1 == cluster_label2:
        return cluster_avg_similarity.loc[cluster_label1, cluster_label1]

    # If the words are in different clusters, handle it as appropriate
    return cosine_sim_matrix.mean().mean()

# Same cluster values have given the same values
word1 = 'good'
word2 = 'excellent'
similarity_score = get_average_similarity(word1, word2)
print(f"\nSimilarity score between '{word1}' and '{word2}': {similarity_score}")

word1 = 'good'
word2 = 'good'
similarity_score = get_average_similarity(word1, word2)
print(f"\nSimilarity score between '{word1}' and '{word2}': {similarity_score}")

word1 = 'good'
word2 = 'bad'
similarity_score = get_average_similarity(word1, word2)
print(f"\nSimilarity score between '{word1}' and '{word2}': {similarity_score}")

word1 = 'good'
word2 = 'forest'
similarity_score = get_average_similarity(word1, word2)
print(f"\nSimilarity score between '{word1}' and '{word2}': {similarity_score}")


Average Similarity within Clusters:
          0         1         2         3         4         5         6   \
0   0.044457  -0.01586  0.002957       0.0  0.000199 -0.007501  -0.00274   
1   -0.01586  0.084126 -0.001511  0.004714  0.000115       0.0  0.008116   
2   0.002957 -0.001511  0.018176 -0.001064 -0.000147  0.000189  0.002728   
3        0.0  0.004714 -0.001064  0.087615  0.021042       0.0  0.005637   
4   0.000199  0.000115 -0.000147  0.021042  0.016824 -0.003663 -0.000194   
5  -0.007501       0.0  0.000189       0.0 -0.003663  0.067619       0.0   
6   -0.00274  0.008116  0.002728  0.005637 -0.000194       0.0  0.064889   
7    0.00748  0.001903  0.009131  0.007176 -0.000118       0.0  0.026958   
8  -0.000025       0.0   0.00159  0.001943  0.002997  0.007981       0.0   
9   0.006488  -0.00188  0.009855  0.000239  0.000115   0.00912  0.003991   
10   0.00839  0.001928  0.001773 -0.000025  0.000225   -0.0138 -0.002236   
11 -0.010836  0.048439 -0.000192       0.0  0.001871

**Calculating the Spearman Correlation**

In [None]:
from scipy.stats import spearmanr

# Assuming simlex_data is the DataFrame containing word pairs from SimLex999
simlex_data['predicted_similarity'] = simlex_data.apply(lambda row: get_average_similarity(row['word1'], row['word2']), axis=1)



In [None]:
# Evaluate using Spearman correlation
spearman_corr, _ = spearmanr(simlex_data['SimLex999'], simlex_data['predicted_similarity'])
print(f"Spearman Correlation: {spearman_corr}")

Spearman Correlation: -0.017627329096855795


In [None]:
# from sklearn.cluster import KMeans

# chunk_size = 5000
# num_rows = min(upto, len(rrearranged_data))
# # Assuming a reduced number of clusters
# n_clusters = 5

# # Initialize KMeans model
# kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# # Process in chunks
# for i in range(0, num_rows, chunk_size):
#     subset_embeddings = rrearranged_data['word_embeddings'][i:i + chunk_size]
#     subset_embeddings_array = np.vstack(subset_embeddings)

#     # Fit KMeans model on the current chunk
#     chunk_clusters = kmeans.fit_predict(subset_embeddings_array)

#     # Assign cluster labels to the original DataFrame (add 1 to the cluster labels)
#     rrearranged_data.loc[i:i + chunk_size - 1, 'cluster'] = chunk_clusters + 1

# # Display the clusters and their representative words
# for cluster_label in range(1, n_clusters + 1):
#     cluster_words = rrearranged_data.loc[rrearranged_data['cluster'] == cluster_label, 'preprocessed_text']
#     print(f"\nCluster {cluster_label}: {', '.join(cluster_words)}")





Cluster 1: fraternity, clergy, club, parker, gentleman, theology, representative, association, assisted, entitled, congregations, lecture, church, son, biography, father, title, christians, liberty, christianity, religion, grant, modern, harvard, divinity, school, invited, england, chandler, robbins, liberal, honor, boston, denomination, fellowship, hall, theodore, comedy, story, triumph, christian, university, ralph, naming, philosophy, birth, colleagues, frank, conservative, emerson, communion, lifetime, afterwards, society, ecclesiastical, foundations, garrison, charles, proclaimed, champion, theological, era, revered, orthodox, wrote, friend, louisville, governed, genius, heart, book, mercy, letter, sermons, congregational, descended, founders, churches, spoke, institution, lawyer, doctor, science, letters, throne, merchant, holy, old, catholics, bishop, tells, great, bishops, pope, rome, editors, parents, love, perform, age, fifty, psychology, massacre, living, worked, christ, tr



In [None]:
# # # Aggregate word embeddings to obtain sentence embeddings
# # brown_data['sentence_embeddings'] = brown_data['word_embeddings'].apply(lambda embeddings: np.mean(embeddings, axis=0) if embeddings else np.zeros(embedding_dim))
# # Display a few entries to verify the sentence embeddings
# # print(brown_data[['preprocessed_text', 'sentence_embeddings']].head())


# # Pairwise Word Embeddings
# brown_data['pairwise_embeddings'] = brown_data['word_embeddings'].apply(lambda embeddings: np.concatenate(embeddings, axis=0) if embeddings else np.zeros(embedding_dim * 2))