In [1]:
# imports
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
from math import dist

In [2]:
# Load the eight RobBERT embedding CSV parts and combine them into one frame.
(
    robbert_1, robbert_2, robbert_3, robbert_4,
    robbert_5, robbert_6, robbert_7, robbert_8,
) = map(pd.read_csv, (
    'D:\\robbert_embeddings_1.csv',
    'D:\\robbert_embeddings_2.csv',
    'D:\\robbert_embeddings_3.csv',
    'D:\\robbert_embeddings_4.csv',
    'D:\\robbert_embeddings_5.csv',
    'D:\\robbert_embeddings_6.csv',
    'D:\\robbert_embeddings_7.csv',
    'D:\\robbert_embeddings_8.csv',
))
# reset_index() is called without drop=True, so the row labels of the
# concatenated parts are kept as an extra column next to the new 0..n-1 index.
embeddings_robbert = pd.concat([
    robbert_1, robbert_2, robbert_3, robbert_4,
    robbert_5, robbert_6, robbert_7, robbert_8,
]).reset_index()

In [3]:
# Load the eight BERTje embedding CSV parts and combine them into one frame.
(
    bertje_1, bertje_2, bertje_3, bertje_4,
    bertje_5, bertje_6, bertje_7, bertje_8,
) = map(pd.read_csv, (
    'D:\\bertje_embeddings_1.csv',
    'D:\\bertje_embeddings_2.csv',
    'D:\\bertje_embeddings_3.csv',
    'D:\\bertje_embeddings_4.csv',
    'D:\\bertje_embeddings_5.csv',
    'D:\\bertje_embeddings_6.csv',
    'D:\\bertje_embeddings_7.csv',
    'D:\\bertje_embeddings_8.csv',
))
# reset_index() is called without drop=True, so the row labels of the
# concatenated parts are kept as an extra column next to the new 0..n-1 index.
embeddings_bertje = pd.concat([
    bertje_1, bertje_2, bertje_3, bertje_4,
    bertje_5, bertje_6, bertje_7, bertje_8,
]).reset_index()

In [4]:
# Load the eight EuroBERT embedding CSV parts and combine them into one frame.
(
    eurobert_1, eurobert_2, eurobert_3, eurobert_4,
    eurobert_5, eurobert_6, eurobert_7, eurobert_8,
) = map(pd.read_csv, (
    'D:\\eurobert_embeddings_1.csv',
    'D:\\eurobert_embeddings_2.csv',
    'D:\\eurobert_embeddings_3.csv',
    'D:\\eurobert_embeddings_4.csv',
    'D:\\eurobert_embeddings_5.csv',
    'D:\\eurobert_embeddings_6.csv',
    'D:\\eurobert_embeddings_7.csv',
    'D:\\eurobert_embeddings_8.csv',
))
# reset_index() is called without drop=True, so the row labels of the
# concatenated parts are kept as an extra column next to the new 0..n-1 index.
embeddings_eurobert = pd.concat([
    eurobert_1, eurobert_2, eurobert_3, eurobert_4,
    eurobert_5, eurobert_6, eurobert_7, eurobert_8,
]).reset_index()

In [5]:
# Load the eight mBERT embedding CSV parts and combine them into one frame.
(
    mbert_1, mbert_2, mbert_3, mbert_4,
    mbert_5, mbert_6, mbert_7, mbert_8,
) = map(pd.read_csv, (
    'D:\\mbert_embeddings_1.csv',
    'D:\\mbert_embeddings_2.csv',
    'D:\\mbert_embeddings_3.csv',
    'D:\\mbert_embeddings_4.csv',
    'D:\\mbert_embeddings_5.csv',
    'D:\\mbert_embeddings_6.csv',
    'D:\\mbert_embeddings_7.csv',
    'D:\\mbert_embeddings_8.csv',
))
# reset_index() is called without drop=True, so the row labels of the
# concatenated parts are kept as an extra column next to the new 0..n-1 index.
embeddings_mbert = pd.concat([
    mbert_1, mbert_2, mbert_3, mbert_4,
    mbert_5, mbert_6, mbert_7, mbert_8,
]).reset_index()

In [6]:
# data preprocessing
def _parse_embedding(value):
    """Return ``value`` as a NumPy array, parsing it with ``literal_eval`` first if needed.

    Values that are already ``numpy.ndarray`` objects are returned unchanged.
    That makes this cell safe to run more than once: without the check, a
    second run would hand an array to ``literal_eval``, which raises.
    Anything else is passed to ``literal_eval`` exactly as before
    (presumably the CSV text of a Python list of floats - confirm against the
    files) and the parsed value is converted with ``np.asarray``.
    """
    if isinstance(value, np.ndarray):
        return value
    return np.asarray(literal_eval(value))


# Convert the 'Sentence Embedding' column of every model frame in place.
for df in [embeddings_robbert, embeddings_bertje, embeddings_eurobert, embeddings_mbert]:
    df['Sentence Embedding'] = df['Sentence Embedding'].apply(_parse_embedding)

In [7]:
# Split each model's frame into the rows labelled 'short' and the rows
# labelled 'long' in the 'Length Label' column.
embeddings_robbert_short = embeddings_robbert.loc[embeddings_robbert['Length Label'].eq('short')]
embeddings_robbert_long = embeddings_robbert.loc[embeddings_robbert['Length Label'].eq('long')]

embeddings_bertje_short = embeddings_bertje.loc[embeddings_bertje['Length Label'].eq('short')]
embeddings_bertje_long = embeddings_bertje.loc[embeddings_bertje['Length Label'].eq('long')]

embeddings_eurobert_short = embeddings_eurobert.loc[embeddings_eurobert['Length Label'].eq('short')]
embeddings_eurobert_long = embeddings_eurobert.loc[embeddings_eurobert['Length Label'].eq('long')]

embeddings_mbert_short = embeddings_mbert.loc[embeddings_mbert['Length Label'].eq('short')]
embeddings_mbert_long = embeddings_mbert.loc[embeddings_mbert['Length Label'].eq('long')]

# Silhouette Score

In [9]:
def silhouette_score_function(df):
    """Silhouette scores for the 'oti' vs. 'ti' and 'ti' vs. 'niet + ti' category pairs.

    For each pair, the rows of ``df`` whose 'Category' is one of the two
    labels are selected, their 'Sentence Embedding' arrays are stacked into
    one matrix with ``np.stack``, and ``silhouette_score`` is called on that
    matrix with the rows' 'Category' values as cluster labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Category' and 'Sentence Embedding'
        (one array per row; assumed to be of equal length so they stack).

    Returns
    -------
    tuple
        ``(score for oti / ti, score for ti / niet + ti)``.
    """

    def score_pair(first_label, second_label):
        # Keep only the rows belonging to one of the two categories.
        pair_rows = df[df['Category'].isin([first_label, second_label])]
        matrix = np.stack(pair_rows['Sentence Embedding'].to_numpy())
        return silhouette_score(matrix, pair_rows['Category'].to_numpy())

    return score_pair('oti', 'ti'), score_pair('ti', 'niet + ti')

In [10]:
# RobBERT, all sentences: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_robbert_oti_ti, ss_robbert_ti_niet = silhouette_score_function(
    df=embeddings_robbert,
)

In [11]:
# BERTje, all sentences: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_bertje_oti_ti, ss_bertje_ti_niet = silhouette_score_function(
    df=embeddings_bertje,
)

In [12]:
# EuroBERT, all sentences: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_eurobert_oti_ti, ss_eurobert_ti_niet = silhouette_score_function(
    df=embeddings_eurobert,
)

In [13]:
# mBERT, all sentences: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_mbert_oti_ti, ss_mbert_ti_niet = silhouette_score_function(
    df=embeddings_mbert,
)

### Silhouette Score by sentence length (short vs. long)

In [15]:
# RobBERT, short sentences only: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_robbert_oti_ti_short, ss_robbert_ti_niet_short = silhouette_score_function(
    df=embeddings_robbert_short,
)

In [16]:
# RobBERT, long sentences only: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_robbert_oti_ti_long, ss_robbert_ti_niet_long = silhouette_score_function(
    df=embeddings_robbert_long,
)

In [17]:
# BERTje, short sentences only: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_bertje_oti_ti_short, ss_bertje_ti_niet_short = silhouette_score_function(
    df=embeddings_bertje_short,
)

In [18]:
# BERTje, long sentences only: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_bertje_oti_ti_long, ss_bertje_ti_niet_long = silhouette_score_function(
    df=embeddings_bertje_long,
)

In [19]:
# EuroBERT, short sentences only: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_eurobert_oti_ti_short, ss_eurobert_ti_niet_short = silhouette_score_function(
    df=embeddings_eurobert_short,
)

In [20]:
# EuroBERT, long sentences only: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_eurobert_oti_ti_long, ss_eurobert_ti_niet_long = silhouette_score_function(
    df=embeddings_eurobert_long,
)

In [21]:
# mBERT, short sentences only: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_mbert_oti_ti_short, ss_mbert_ti_niet_short = silhouette_score_function(
    df=embeddings_mbert_short,
)

In [22]:
# mBERT, long sentences only: silhouette scores for (oti, ti) and (ti, niet + ti)
ss_mbert_oti_ti_long, ss_mbert_ti_niet_long = silhouette_score_function(
    df=embeddings_mbert_long,
)

# Davies-Bouldin

In [24]:
def davies_bouldin_function(df):
    """Davies-Bouldin scores for the 'oti' vs. 'ti' and 'ti' vs. 'niet + ti' category pairs.

    For each pair, the rows of ``df`` whose 'Category' is one of the two
    labels are selected, their 'Sentence Embedding' arrays are stacked into
    one matrix with ``np.stack``, and ``davies_bouldin_score`` is called on
    that matrix with the rows' 'Category' values as cluster labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Category' and 'Sentence Embedding'
        (one array per row; assumed to be of equal length so they stack).

    Returns
    -------
    tuple
        ``(score for oti / ti, score for ti / niet + ti)``.
    """

    def score_pair(first_label, second_label):
        # Keep only the rows belonging to one of the two categories.
        pair_rows = df[df['Category'].isin([first_label, second_label])]
        matrix = np.stack(pair_rows['Sentence Embedding'].to_numpy())
        return davies_bouldin_score(matrix, pair_rows['Category'].to_numpy())

    return score_pair('oti', 'ti'), score_pair('ti', 'niet + ti')

In [25]:
# RobBERT, all sentences: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_robbert_oti_ti, db_robbert_ti_niet = davies_bouldin_function(
    df=embeddings_robbert,
)

In [26]:
# BERTje, all sentences: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_bertje_oti_ti, db_bertje_ti_niet = davies_bouldin_function(
    df=embeddings_bertje,
)

In [27]:
# EuroBERT, all sentences: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_eurobert_oti_ti, db_eurobert_ti_niet = davies_bouldin_function(
    df=embeddings_eurobert,
)

In [28]:
# mBERT, all sentences: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_mbert_oti_ti, db_mbert_ti_niet = davies_bouldin_function(
    df=embeddings_mbert,
)

### Davies-Bouldin by sentence length (short vs. long)

In [30]:
# RobBERT, short sentences only: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_robbert_oti_ti_short, db_robbert_ti_niet_short = davies_bouldin_function(
    df=embeddings_robbert_short,
)

In [31]:
# RobBERT, long sentences only: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_robbert_oti_ti_long, db_robbert_ti_niet_long = davies_bouldin_function(
    df=embeddings_robbert_long,
)

In [32]:
# BERTje, short sentences only: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_bertje_oti_ti_short, db_bertje_ti_niet_short = davies_bouldin_function(
    df=embeddings_bertje_short,
)

In [33]:
# BERTje, long sentences only: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_bertje_oti_ti_long, db_bertje_ti_niet_long = davies_bouldin_function(
    df=embeddings_bertje_long,
)

In [34]:
# EuroBERT, short sentences only: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_eurobert_oti_ti_short, db_eurobert_ti_niet_short = davies_bouldin_function(
    df=embeddings_eurobert_short,
)

In [35]:
# EuroBERT, long sentences only: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_eurobert_oti_ti_long, db_eurobert_ti_niet_long = davies_bouldin_function(
    df=embeddings_eurobert_long,
)

In [36]:
# mBERT, short sentences only: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_mbert_oti_ti_short, db_mbert_ti_niet_short = davies_bouldin_function(
    df=embeddings_mbert_short,
)

In [37]:
# mBERT, long sentences only: Davies-Bouldin scores for (oti, ti) and (ti, niet + ti)
db_mbert_oti_ti_long, db_mbert_ti_niet_long = davies_bouldin_function(
    df=embeddings_mbert_long,
)

# Results

In [39]:
# print all results
# The report is data-driven: every section lists its heading line(s) and, for
# each model in `model_names` (same order), the pair
# (oti vs. ti score, ti vs. niet + ti score). One loop prints everything, so
# labels and variables cannot drift apart the way copy-pasted prints can.
model_names = ['RobBERT', 'BERTje', 'EuroBERT', 'mBERT']
result_sections = [
    (['Results', 'Silhouette Score'], [
        (ss_robbert_oti_ti, ss_robbert_ti_niet),
        (ss_bertje_oti_ti, ss_bertje_ti_niet),
        (ss_eurobert_oti_ti, ss_eurobert_ti_niet),
        (ss_mbert_oti_ti, ss_mbert_ti_niet),
    ]),
    (['Davies-Bouldin Index'], [
        (db_robbert_oti_ti, db_robbert_ti_niet),
        (db_bertje_oti_ti, db_bertje_ti_niet),
        (db_eurobert_oti_ti, db_eurobert_ti_niet),
        (db_mbert_oti_ti, db_mbert_ti_niet),
    ]),
    (['Results sentence length', 'Silhouette Score for short sentences'], [
        (ss_robbert_oti_ti_short, ss_robbert_ti_niet_short),
        (ss_bertje_oti_ti_short, ss_bertje_ti_niet_short),
        (ss_eurobert_oti_ti_short, ss_eurobert_ti_niet_short),
        (ss_mbert_oti_ti_short, ss_mbert_ti_niet_short),
    ]),
    (['Silhouette Score for long sentences'], [
        (ss_robbert_oti_ti_long, ss_robbert_ti_niet_long),
        (ss_bertje_oti_ti_long, ss_bertje_ti_niet_long),
        (ss_eurobert_oti_ti_long, ss_eurobert_ti_niet_long),
        (ss_mbert_oti_ti_long, ss_mbert_ti_niet_long),
    ]),
    (['Davies-Bouldin Index for short sentences'], [
        (db_robbert_oti_ti_short, db_robbert_ti_niet_short),
        (db_bertje_oti_ti_short, db_bertje_ti_niet_short),
        (db_eurobert_oti_ti_short, db_eurobert_ti_niet_short),
        (db_mbert_oti_ti_short, db_mbert_ti_niet_short),
    ]),
    (['Davies-Bouldin Index for long sentences'], [
        (db_robbert_oti_ti_long, db_robbert_ti_niet_long),
        (db_bertje_oti_ti_long, db_bertje_ti_niet_long),
        (db_eurobert_oti_ti_long, db_eurobert_ti_niet_long),
        (db_mbert_oti_ti_long, db_mbert_ti_niet_long),
    ]),
]
# Every label is left-justified to this width so the values line up in one
# column (27 is the width the hand-padded labels had before).
label_width = 27

for section_number, (headings, score_pairs) in enumerate(result_sections):
    if section_number > 0:
        print()  # blank line between sections, none after the last one
    for heading in headings:
        print(heading)
    for model_name, (score_oti_ti, score_ti_niet) in zip(model_names, score_pairs):
        print(f'{model_name} oti and ti:'.ljust(label_width), score_oti_ti)
        print(f'{model_name} ti and niet + ti:'.ljust(label_width), score_ti_niet)
        print('Difference:'.ljust(label_width), score_oti_ti - score_ti_niet)

Results
Silhouette Score
RobBERT oti and ti:         0.0157265887932287
RobBERT ti and niet + ti:   0.031882406504231534
Difference:                 -0.016155817711002832
BERTje oti and ti:          0.013608322851695921
BERTje ti and niet + ti:    0.030755928530612233
Difference:                 -0.017147605678916312
EuroBERT oti and ti:        0.011898994182254582
EuroBERT ti and niet + ti:  0.025850122165494392
Difference:                 -0.01395112798323981
mBERT oti and ti:           0.011650065902473526
mBERT ti and niet + ti:     0.02821737642342569
Difference:                 -0.016567310520952165

Davies-Bouldin Index
RobBERT oti and ti:         7.875617687245911
RobBERT ti and niet + ti:   5.467978177515712
Difference:                 2.407639509730199
BERTje oti and ti:          8.427271304654342
BERTje ti and niet + ti:    5.517885626939613
Difference:                 2.909385677714729
EuroBERT oti and ti:        8.856396914723716
EuroBERT ti and niet + ti:  6.0677586390104

# Closest sentences to the average oti embedding

In [41]:
# Read the preprocessed sentences file and keep only its 'Sentence' column.
sentences = pd.read_csv('D:\\preprocessed_sentences.csv').loc[:, 'Sentence']

In [42]:
def calculate_percentage_oti(embeddings, sentences, top_ks=(10, 100)):
    """Fraction of 'oti' rows among the embeddings closest to the mean 'oti' embedding.

    The element-wise mean of all 'oti' embeddings is computed. Every 'oti'
    and 'ti' embedding is then ranked by Euclidean distance to that mean
    (ties keep 'oti' rows before 'ti' rows and otherwise the frame order),
    and for each ``k`` in ``top_ks`` the share of 'oti' rows among the ``k``
    closest is returned. If fewer than ``k`` rows exist, the share is taken
    over the rows that are available.

    Parameters
    ----------
    embeddings : pandas.DataFrame
        Needs a 'Category' column and a 'Sentence Embedding' column holding
        one 1-D numeric array per row, all of the same length.
    sentences : pandas.Series
        Not needed for the result; kept so existing calls keep working.
        Earlier code looked sentences up through hard-coded row offsets
        (0 for 'oti', 2645 for 'ti'), which raised ``KeyError`` or picked the
        wrong sentence whenever the data was laid out differently, and the
        looked-up text was never returned.
    top_ks : sequence of int, default (10, 100)
        Neighbourhood sizes to evaluate.

    Returns
    -------
    tuple of float
        One fraction in [0, 1] per entry of ``top_ks``; with the default
        this is ``(share in top 10, share in top 100)``.

    Raises
    ------
    ValueError
        If ``embeddings`` has no 'oti' rows or ``top_ks`` contains a value
        smaller than 1.
    """
    if any(k < 1 for k in top_ks):
        raise ValueError('every entry of top_ks must be a positive integer')

    oti_embeddings = list(embeddings.loc[embeddings['Category'] == 'oti', 'Sentence Embedding'])
    ti_embeddings = list(embeddings.loc[embeddings['Category'] == 'ti', 'Sentence Embedding'])
    if not oti_embeddings:
        raise ValueError("embeddings contains no rows with Category == 'oti'")

    centroid_oti = np.stack(oti_embeddings).mean(axis=0)

    # 'oti' rows first, then 'ti' rows, so the stable sort below breaks
    # distance ties in favour of 'oti'.
    candidates = np.stack(oti_embeddings + ti_embeddings)
    is_oti = np.array([True] * len(oti_embeddings) + [False] * len(ti_embeddings))

    distances = np.linalg.norm(candidates - centroid_oti, axis=1)
    closest_first = np.argsort(distances, kind='stable')
    ranked_is_oti = is_oti[closest_first]

    return tuple(float(ranked_is_oti[:k].mean()) for k in top_ks)

In [43]:
# RobBERT: share of 'oti' among the 10 and 100 embeddings closest to the mean 'oti' embedding
calculate_percentage_oti(embeddings=embeddings_robbert, sentences=sentences)

(0.9, 0.72)

In [44]:
# BERTje: share of 'oti' among the 10 and 100 embeddings closest to the mean 'oti' embedding
calculate_percentage_oti(embeddings=embeddings_bertje, sentences=sentences)

(0.8, 0.66)

In [45]:
# EuroBERT: share of 'oti' among the 10 and 100 embeddings closest to the mean 'oti' embedding
calculate_percentage_oti(embeddings=embeddings_eurobert, sentences=sentences)

(0.9, 0.67)

In [46]:
# mBERT: share of 'oti' among the 10 and 100 embeddings closest to the mean 'oti' embedding
calculate_percentage_oti(embeddings=embeddings_mbert, sentences=sentences)

(0.3, 0.61)