<a href="https://colab.research.google.com/github/Heity94/TWSM_Lab/blob/main/Project/Notebooks/MA_PerformanceEvaluation_SynDet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Data

In [1]:
# Mount Google Drive so the project data under /content/drive/MyDrive is readable (Colab only)
from google.colab import drive
drive.mount('/content/drive')


# Install sentence-transformers, which provides SentenceTransformer and util used further down
# NOTE(review): no version is pinned, so a re-run may install a different release
!pip install -U sentence-transformers -q

Mounted at /content/drive
[K     |████████████████████████████████| 85 kB 5.0 MB/s 
[K     |████████████████████████████████| 4.4 MB 49.9 MB/s 
[K     |████████████████████████████████| 1.2 MB 62.7 MB/s 
[K     |████████████████████████████████| 101 kB 11.5 MB/s 
[K     |████████████████████████████████| 596 kB 71.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 49.4 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Root folder of the team's data on the mounted Drive; note that it already ends with "/"
data_path = "/content/drive/MyDrive/2022_Analytics Lab Student Projects/Data/Topic 1/Data_Team1/"

## Noun Phrases

In [4]:
# Load the SynDet entities table of the *test* split (first CSV column becomes the index).
# The variable keeps its "_train" name because later cells refer to it.
# data_path already ends with "/", so the sub-path is appended without a leading slash
# (consistent with the other paths built from data_path in this notebook).
entities_train = pd.read_csv(data_path+"SynDet/SynDet_entities_test.csv", index_col=0)

In [5]:
# Keep only the rows whose article_id lies in the half-open range [188, 198)
entities_sample = entities_train[entities_train["article_id"].ge(188) & entities_train["article_id"].lt(198)]

## Entities and Synonyms

In [6]:
# Load the SynDet ontology table of the *test* split (first CSV column becomes the index).
# The variable keeps its "_train" name because later cells refer to it.
# data_path already ends with "/", so the sub-path is appended without a leading slash
# (consistent with the other paths built from data_path in this notebook).
ontology_train = pd.read_csv(data_path+"SynDet/SynDet_ontology_test.csv", index_col=0)

In [7]:
# Display every ontology row whose synonym string occurs more than once, ordered by synonym
ontology_train.loc[ontology_train["synonym"].duplicated(keep=False)].sort_values("synonym")

Unnamed: 0,entity_id,category,label,synonym
10111,information technology operation,domain specific entity,TOPIC,IS/IT operations
11730,systems operations,domain specific entity,TOPIC,IS/IT operations
4861,computer supported cooperative work,domain specific entity,TECHNOLOGY,internet aided collaboration
6382,IT supported collaboration,domain specific entity,TECHNOLOGY,internet aided collaboration
3109,use case diagram,research method,CONCEPTUAL_METHOD,use case diagram
11465,use case modeling,domain specific entity,TOPIC,use case diagram


In [8]:
# One row per distinct synonym string (no downsampling applied). When a synonym is listed
# more than once, only its first occurrence in file order is kept.
ontology_train_ds_unique = ontology_train.drop_duplicates(subset=["synonym"], keep="first")

In [9]:
from sentence_transformers import SentenceTransformer, util

# Cosine Similarity score with ST model (without fine-tuning)

## Load Model

In [10]:
# Load the published all-MiniLM-L6-v2 checkpoint as-is; it is the non-fine-tuned baseline of this section
model_norm = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Computing Cosine Similarity

In [11]:
# Semantic search of noun phrases against ontology synonyms (built on util.semantic_search)
def sem_search_sent_ent(model, sentences, entities, top_k = 1):
  """Match every noun phrase against all ontology synonyms by cosine similarity.

  Args:
    model: object with a SentenceTransformer-style ``encode`` method.
    sentences: DataFrame with a ``noun_phrases`` column. The sentence id is read
      from the column at position 1 and the noun phrase text from the column at
      position 3.
    entities: DataFrame with a ``synonym`` column. The main entity name is read
      from the column at position 0 and the synonym text from the column at
      position 3.
    top_k: number of best-matching synonyms kept per noun phrase.

  Returns:
    DataFrame with the columns ``score``, ``sentence_id``, ``synonym``,
    ``noun_phrase_txt`` and ``entity``, sorted by ``score`` in descending order
    with a fresh 0..n-1 index. When either input frame has no rows, an empty
    frame with those columns is returned.
  """
  result_columns = ["score", "sentence_id", "synonym", "noun_phrase_txt", "entity"]

  # Nothing to compare: return an empty, correctly shaped frame instead of failing later on
  if len(sentences) == 0 or len(entities) == 0:
    return pd.DataFrame(columns=result_columns)

  #Compute embeddings
  embeddings_sentences = model.encode(sentences.noun_phrases.to_list(), convert_to_tensor=True, show_progress_bar=True)
  embeddings_entities = model.encode(entities.synonym.to_list(), convert_to_tensor=True, show_progress_bar=True)

  #Top k synonyms per noun phrase, e.g. [[{'corpus_id': 483, 'score': 0.483295202255249}],...]
  hits_per_phrase = util.semantic_search(embeddings_sentences, embeddings_entities, top_k=top_k)

  # Pull the lookup columns once (by position, as before) instead of indexing the frames per hit
  sentence_ids = sentences.iloc[:, 1].to_list()
  noun_phrase_texts = sentences.iloc[:, 3].to_list()
  entity_names = entities.iloc[:, 0].to_list()
  synonym_texts = entities.iloc[:, 3].to_list()

  # One flat record per (noun phrase, hit); corpus_id is the row position inside `entities`
  records = [
    {
      "score": hit["score"],
      "sentence_id": sentence_ids[idx],
      "synonym": synonym_texts[hit["corpus_id"]],
      "noun_phrase_txt": noun_phrase_texts[idx],
      "entity": entity_names[hit["corpus_id"]],
    }
    for idx, hits in enumerate(hits_per_phrase)
    for hit in hits
  ]

  # Build the frame in a single step, then sort by score and reset the index
  return pd.DataFrame(records, columns=result_columns).sort_values(by="score", ascending=False, ignore_index=True)

In [12]:
# Baseline run: the single best-matching synonym (top_k=1) for every noun phrase of the sample
sim_scores_sample_norm = sem_search_sent_ent(model_norm, entities_sample, ontology_train_ds_unique, top_k=1)

Batches:   0%|          | 0/607 [00:00<?, ?it/s]

Batches:   0%|          | 0/699 [00:00<?, ?it/s]

In [13]:
# Look up the "true" entity id of each scored noun phrase by joining the scores
# back onto the sample on sentence id + noun phrase text
sim_scores_sample_mapped_norm = pd.merge(
    sim_scores_sample_norm,
    entities_sample[["sentence_id", "noun_phrases", "true_ent_id"]],
    how="left",
    left_on=["sentence_id", "noun_phrase_txt"],
    right_on=["sentence_id", "noun_phrases"],
).drop(["noun_phrases"], axis=1)

In [14]:
# Duplicates are judged on every column except the first one
col_dupl = list(sim_scores_sample_mapped_norm.columns[1:])

# Keep only the first row of each group of duplicates
sim_scores_sample_mapped_norm = sim_scores_sample_mapped_norm.loc[~sim_scores_sample_mapped_norm.duplicated(subset=col_dupl, keep="first")]

In [15]:
# Second lookup of the "true" entity id, this time against ent_id (the entity id found by Roland):
# a hit means the predicted entity is recorded for the same sentence
sim_scores_sample_mapped_f_norm = (
    pd.merge(
        sim_scores_sample_mapped_norm,
        entities_sample[["sentence_id", "ent_id"]],
        how="left",
        left_on=["sentence_id", "entity"],
        right_on=["sentence_id", "ent_id"],
    )
    .rename(columns={"ent_id": "true_ent_id_cmpl_sen"})
    .drop_duplicates()
)

In [16]:
# Combine the columns "true_ent_id" & "true_ent_id_cmpl_sen" into a single label
def true_ent_label(x):
  """Return the "true" entity label of one result row.

  Args:
    x: row-like object exposing ``true_ent_id`` and ``true_ent_id_cmpl_sen``
      as attributes (e.g. a row passed by ``DataFrame.apply(..., axis=1)``).

  Returns:
    ``x.true_ent_id`` when it carries a label, otherwise
    ``x.true_ent_id_cmpl_sen`` (which may itself be missing).
  """
  label = x.true_ent_id
  # "0" marks "no entity". A missing value or a numeric 0 (e.g. when the column was
  # parsed as numbers) carries no label either, so those fall back as well.
  if pd.isna(label) or label == 0 or label == "0":
    return x.true_ent_id_cmpl_sen
  return label

In [17]:
# Resolve one label per row from "true_ent_id" / "true_ent_id_cmpl_sen";
# rows for which no label is found get 0 (no match)
sim_scores_sample_mapped_f_norm["true_entity_id"] = (
    sim_scores_sample_mapped_f_norm
    .apply(true_ent_label, axis=1, result_type="expand")
    .fillna(0)
)

# The two source columns are not needed any more
sim_scores_sample_mapped_f_norm = sim_scores_sample_mapped_f_norm.drop(["true_ent_id", "true_ent_id_cmpl_sen"], axis=1)

In [18]:
# Flag the rows where the entity picked by cosine similarity equals the "true" entity label
sim_scores_sample_mapped_f_norm["correct_match"] = sim_scores_sample_mapped_f_norm["entity"].eq(sim_scores_sample_mapped_f_norm["true_entity_id"])

In [19]:
# Bin the similarity scores into ten 0.1-wide intervals and count, per bin, how many
# predicted entities agree with the labels of the database.
# Cosine scores can come out marginally above 1.0 through float rounding; they are capped
# at 1.0 so that they land in the top bin instead of being dropped by pd.cut.
# observed=False is passed explicitly so that empty bins stay in the table regardless of
# the pandas version's default.
grouped_df = sim_scores_sample_mapped_f_norm.groupby(
    pd.cut(sim_scores_sample_mapped_f_norm.score.clip(upper=1.0), np.arange(0.,1.1,0.1)),
    observed=False,
)[["correct_match"]]
# Share of correct matches per bin (NaN for bins without any row)
corr_match_bins_norm = np.round(grouped_df.sum()/grouped_df.count(),2).rename(columns={"correct_match":"pct_correct_in_bin"})
corr_match_bins_norm["no_correct"]= grouped_df.sum()
corr_match_bins_norm["total"]= grouped_df.count()
corr_match_bins_norm = corr_match_bins_norm[["total", "no_correct", "pct_correct_in_bin"]]
corr_match_bins_norm = corr_match_bins_norm.iloc[::-1] #reverse order: highest score bin first

In [20]:
# Show the per-bin table together with an overall "Total" row.
# DataFrame.append was removed in pandas 2.0, so the row is attached with pd.concat.
# The overall rate is recomputed from the summed counts; adding up the per-bin rates
# would not give a meaningful number.
overall = corr_match_bins_norm[["total", "no_correct"]].sum()
total_row = pd.DataFrame(
    {
        "total": overall["total"],
        "no_correct": overall["no_correct"],
        "pct_correct_in_bin": np.round(overall["no_correct"]/overall["total"], 2),
    },
    index=["Total"],
)
pd.concat([corr_match_bins_norm, total_row])

Unnamed: 0_level_0,total,no_correct,pct_correct_in_bin
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(0.9, 1.0]",409.0,318.0,0.78
"(0.8, 0.9]",934.0,318.0,0.34
"(0.7, 0.8]",1869.0,372.0,0.2
"(0.6, 0.7]",2448.0,291.0,0.12
"(0.5, 0.6]",2904.0,122.0,0.04
"(0.4, 0.5]",2824.0,44.0,0.02
"(0.3, 0.4]",492.0,7.0,0.01
"(0.2, 0.3]",6.0,0.0,0.0
"(0.1, 0.2]",0.0,0.0,
"(0.0, 0.1]",0.0,0.0,


In [21]:
# Persist the baseline per-bin table under the SynDet data folder
# NOTE(review): the target file name carries no ".csv" extension
corr_match_bins_norm.to_csv(data_path+"SynDet/SynDet_results_vanilla")

# Cosine Similarity score with a fine-tuned ST model (Synonym Dataset)

## Load the fine-tuned Sentence Transformer

In [22]:
# Load the Sentence Transformer that was fine-tuned on the synonym dataset.
# data_path already ends with "/", so the sub-path is appended without a leading slash
# (consistent with how the triplet model path is built).
model = SentenceTransformer(data_path+"SynDet/Synonym_Dataset/model_sentence_transformer_result_2")

## Computing the cosine similarity 

In [23]:
# Synonym-dataset model run: the single best-matching synonym (top_k=1) for every noun phrase of the sample
sim_scores_sample = sem_search_sent_ent(model, entities_sample, ontology_train_ds_unique, top_k=1)

Batches:   0%|          | 0/607 [00:00<?, ?it/s]

Batches:   0%|          | 0/699 [00:00<?, ?it/s]

## Evaluate the performance in detecting synonyms

In [24]:
# Look up the "true" entity id of each scored noun phrase by joining the scores
# back onto the sample on sentence id + noun phrase text
sim_scores_sample_mapped = pd.merge(
    sim_scores_sample,
    entities_sample[["sentence_id", "noun_phrases", "true_ent_id"]],
    how="left",
    left_on=["sentence_id", "noun_phrase_txt"],
    right_on=["sentence_id", "noun_phrases"],
).drop(["noun_phrases"], axis=1)

In [25]:
# Duplicates are judged on every column except the first one
col_dupl = list(sim_scores_sample_mapped.columns[1:])

# Keep only the first row of each group of duplicates
sim_scores_sample_mapped = sim_scores_sample_mapped.loc[~sim_scores_sample_mapped.duplicated(subset=col_dupl, keep="first")]

In [26]:
# Second lookup of the "true" entity id, this time against ent_id (the entity id found by Roland):
# a hit means the predicted entity is recorded for the same sentence
sim_scores_sample_mapped_f = (
    pd.merge(
        sim_scores_sample_mapped,
        entities_sample[["sentence_id", "ent_id"]],
        how="left",
        left_on=["sentence_id", "entity"],
        right_on=["sentence_id", "ent_id"],
    )
    .rename(columns={"ent_id": "true_ent_id_cmpl_sen"})
    .drop_duplicates()
)

In [27]:
# Resolve one label per row from "true_ent_id" / "true_ent_id_cmpl_sen";
# rows for which no label is found get 0 (no match)
sim_scores_sample_mapped_f["true_entity_id"] = (
    sim_scores_sample_mapped_f
    .apply(true_ent_label, axis=1, result_type="expand")
    .fillna(0)
)

# The two source columns are not needed any more
sim_scores_sample_mapped_f = sim_scores_sample_mapped_f.drop(["true_ent_id", "true_ent_id_cmpl_sen"], axis=1)

In [28]:
# Flag the rows where the entity picked by cosine similarity equals the "true" entity label
sim_scores_sample_mapped_f["correct_match"] = sim_scores_sample_mapped_f["entity"].eq(sim_scores_sample_mapped_f["true_entity_id"])

In [29]:
# Inspect all matches found for one example sentence, highest score first
sim_scores_sample_mapped_f.loc[sim_scores_sample_mapped_f["sentence_id"].eq("188_5984_6028")].sort_values("score", ascending=False)

Unnamed: 0,score,sentence_id,synonym,noun_phrase_txt,entity,true_entity_id,correct_match
3193,0.995111,188_5984_6028,information technology infrastructure,information infrastructure,information technology infrastructure,information technology infrastructure,True
6681,0.969299,188_5984_6028,information superhighway,an information superhighway,information superhighway,information superhighway,True
7124,0.963645,188_5984_6028,public sector industries,the public institutions,government,0,False
7157,0.963625,188_5984_6028,information and communications technology deve...,the development,system development method,0,False
12927,0.847716,188_5984_6028,information services development,service development,information services development,0,False
13887,0.798647,188_5984_6028,private sector,the private sector,private sector,private sector,True
14845,0.760352,188_5984_6028,IS technology,application,IS technology,0,False
18506,0.607873,188_5984_6028,hardware,hardware receivers,hardware,hardware,True
19804,0.529411,188_5984_6028,telecommunications,innovative communications,telecommunications industry,0,False
20440,0.489981,188_5984_6028,cocreations,a 'host,participatory design,0,False


In [30]:
# Bin the similarity scores into ten 0.1-wide intervals and count, per bin, how many
# predicted entities agree with the labels of the database.
# Cosine scores can come out marginally above 1.0 through float rounding; they are capped
# at 1.0 so that they land in the top bin instead of being dropped by pd.cut.
# observed=False is passed explicitly so that empty bins stay in the table regardless of
# the pandas version's default.
grouped_df = sim_scores_sample_mapped_f.groupby(
    pd.cut(sim_scores_sample_mapped_f.score.clip(upper=1.0), np.arange(0.,1.1,0.1)),
    observed=False,
)[["correct_match"]]
# Share of correct matches per bin (NaN for bins without any row)
corr_match_bins = np.round(grouped_df.sum()/grouped_df.count(),2).rename(columns={"correct_match":"pct_correct_in_bin"})
corr_match_bins["no_correct"]= grouped_df.sum()
corr_match_bins["total"]= grouped_df.count()
corr_match_bins = corr_match_bins[["total", "no_correct", "pct_correct_in_bin"]]
corr_match_bins = corr_match_bins.iloc[::-1] #reverse order: highest score bin first

In [31]:
# Show the per-bin table together with an overall "Total" row.
# DataFrame.append was removed in pandas 2.0, so the row is attached with pd.concat.
# The overall rate is recomputed from the summed counts; adding up the per-bin rates
# would not give a meaningful number.
overall = corr_match_bins[["total", "no_correct"]].sum()
total_row = pd.DataFrame(
    {
        "total": overall["total"],
        "no_correct": overall["no_correct"],
        "pct_correct_in_bin": np.round(overall["no_correct"]/overall["total"], 2),
    },
    index=["Total"],
)
pd.concat([corr_match_bins, total_row])

Unnamed: 0_level_0,total,no_correct,pct_correct_in_bin
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(0.9, 1.0]",3418.0,728.0,0.21
"(0.8, 0.9]",1826.0,221.0,0.12
"(0.7, 0.8]",1593.0,88.0,0.06
"(0.6, 0.7]",1787.0,71.0,0.04
"(0.5, 0.6]",1243.0,28.0,0.02
"(0.4, 0.5]",1244.0,21.0,0.02
"(0.3, 0.4]",622.0,0.0,0.0
"(0.2, 0.3]",141.0,0.0,0.0
"(0.1, 0.2]",0.0,0.0,
"(0.0, 0.1]",0.0,0.0,


In [32]:
# Persist the per-bin table of the synonym-dataset model under the SynDet data folder
# NOTE(review): the target file name carries no ".csv" extension
corr_match_bins.to_csv(data_path+"SynDet/SynDet_results_syndataset")

# Cosine Similarity with a fine-tuned ST (Triplet Dataset)

## Load the fine-tuned Sentence Transformer

In [33]:
# Load the Sentence Transformer that was fine-tuned on the triplet dataset from the project data folder
model_tripl = SentenceTransformer(data_path+"SynDet/Triplets_Dataset")

## Computing the Cosine Similarity

In [34]:
# Triplet-dataset model run: the single best-matching synonym (top_k=1) for every noun phrase of the sample
sim_scores_sample_tripl = sem_search_sent_ent(model_tripl, entities_sample, ontology_train_ds_unique, top_k=1)

Batches:   0%|          | 0/607 [00:00<?, ?it/s]

Batches:   0%|          | 0/699 [00:00<?, ?it/s]

## Evaluate the performance in detecting synonyms

In [35]:
# Look up the "true" entity id of each scored noun phrase by joining the scores
# back onto the sample on sentence id + noun phrase text
sim_scores_sample_mapped_tripl = pd.merge(
    sim_scores_sample_tripl,
    entities_sample[["sentence_id", "noun_phrases", "true_ent_id"]],
    how="left",
    left_on=["sentence_id", "noun_phrase_txt"],
    right_on=["sentence_id", "noun_phrases"],
).drop(["noun_phrases"], axis=1)

In [36]:
# Duplicates are judged on every column except the first one
col_dupl = list(sim_scores_sample_mapped_tripl.columns[1:])

# Keep only the first row of each group of duplicates
sim_scores_sample_mapped_tripl = sim_scores_sample_mapped_tripl.loc[~sim_scores_sample_mapped_tripl.duplicated(subset=col_dupl, keep="first")]

In [37]:
# Second lookup of the "true" entity id, this time against ent_id (the entity id found by Roland):
# a hit means the predicted entity is recorded for the same sentence
sim_scores_sample_mapped_f_tripl = (
    pd.merge(
        sim_scores_sample_mapped_tripl,
        entities_sample[["sentence_id", "ent_id"]],
        how="left",
        left_on=["sentence_id", "entity"],
        right_on=["sentence_id", "ent_id"],
    )
    .rename(columns={"ent_id": "true_ent_id_cmpl_sen"})
    .drop_duplicates()
)

In [38]:
# Resolve one label per row from "true_ent_id" / "true_ent_id_cmpl_sen";
# rows for which no label is found get 0 (no match)
sim_scores_sample_mapped_f_tripl["true_entity_id"] = (
    sim_scores_sample_mapped_f_tripl
    .apply(true_ent_label, axis=1, result_type="expand")
    .fillna(0)
)

# The two source columns are not needed any more
sim_scores_sample_mapped_f_tripl = sim_scores_sample_mapped_f_tripl.drop(["true_ent_id", "true_ent_id_cmpl_sen"], axis=1)

In [39]:
# Flag the rows where the entity picked by cosine similarity equals the "true" entity label
sim_scores_sample_mapped_f_tripl["correct_match"] = sim_scores_sample_mapped_f_tripl["entity"].eq(sim_scores_sample_mapped_f_tripl["true_entity_id"])

In [40]:
# Bin the similarity scores into ten 0.1-wide intervals and count, per bin, how many
# predicted entities agree with the labels of the database.
# Cosine scores can come out marginally above 1.0 through float rounding; they are capped
# at 1.0 so that they land in the top bin instead of being dropped by pd.cut.
# observed=False is passed explicitly so that empty bins stay in the table regardless of
# the pandas version's default.
grouped_df = sim_scores_sample_mapped_f_tripl.groupby(
    pd.cut(sim_scores_sample_mapped_f_tripl.score.clip(upper=1.0), np.arange(0.,1.1,0.1)),
    observed=False,
)[["correct_match"]]
# Share of correct matches per bin (NaN for bins without any row)
corr_match_bins_tripl = np.round(grouped_df.sum()/grouped_df.count(),2).rename(columns={"correct_match":"pct_correct_in_bin"})
corr_match_bins_tripl["no_correct"]= grouped_df.sum()
corr_match_bins_tripl["total"]= grouped_df.count()
corr_match_bins_tripl = corr_match_bins_tripl[["total", "no_correct", "pct_correct_in_bin"]]
corr_match_bins_tripl = corr_match_bins_tripl.iloc[::-1] #reverse order: highest score bin first

In [41]:
# Show the per-bin table together with an overall "Total" row.
# DataFrame.append was removed in pandas 2.0, so the row is attached with pd.concat.
# The overall rate is recomputed from the summed counts; adding up the per-bin rates
# would not give a meaningful number.
overall = corr_match_bins_tripl[["total", "no_correct"]].sum()
total_row = pd.DataFrame(
    {
        "total": overall["total"],
        "no_correct": overall["no_correct"],
        "pct_correct_in_bin": np.round(overall["no_correct"]/overall["total"], 2),
    },
    index=["Total"],
)
pd.concat([corr_match_bins_tripl, total_row])

Unnamed: 0_level_0,total,no_correct,pct_correct_in_bin
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(0.9, 1.0]",395.0,300.0,0.76
"(0.8, 0.9]",951.0,330.0,0.35
"(0.7, 0.8]",1858.0,364.0,0.2
"(0.6, 0.7]",2448.0,287.0,0.12
"(0.5, 0.6]",2904.0,123.0,0.04
"(0.4, 0.5]",2823.0,44.0,0.02
"(0.3, 0.4]",488.0,7.0,0.01
"(0.2, 0.3]",5.0,0.0,0.0
"(0.1, 0.2]",0.0,0.0,
"(0.0, 0.1]",0.0,0.0,


In [42]:
# Persist the per-bin table of the triplet-dataset model under the SynDet data folder
# NOTE(review): the target file name carries no ".csv" extension
corr_match_bins_tripl.to_csv(data_path+"SynDet/SynDet_results_tripldataset")