<a href="https://colab.research.google.com/github/Heity94/TWSM_Lab/blob/main/Project/Notebooks/PH_TE_WMD_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation of the WMD (Word2Vec) model on a sample article

In [None]:
# Mount Google Drive at /content/drive; the data paths used below all live under this mount point
from google.colab import drive
drive.mount('/content/drive')

### Load data

In [73]:
import pandas as pd
import pickle
import numpy as np

In [10]:
# Shared data folder of the project on the mounted drive
data_path = "/content/drive/MyDrive/2022_Analytics Lab Student Projects/Data/All Topics"
# Team 1's folder is a sibling of "All Topics": take the parent directory explicitly instead of
# slicing off a hard-coded number of characters, so renaming the last path component cannot break it
data_path_group = data_path.rsplit("/", 1)[0] + "/Topic 1/Data_Team1/"

In [57]:
# Ontology synonym table; built from data_path_group so the Team 1 folder is spelled out in one place only
# (resolves to the same file as the previous hard-coded absolute path)
ontology_syn = pd.read_csv(data_path_group + "SynDet/SynDet_ontology_train.csv", index_col=0)

In [72]:
# Labelled noun phrases; built from data_path_group so the Team 1 folder is spelled out in one place only
# (resolves to the same file as the previous hard-coded absolute path)
noun_phrases = pd.read_csv(data_path_group + "SynDet/SynDet_entities_train.csv", index_col=0)

In [102]:
# Restrict the labelled noun phrases to the single sample article (article_id 188)
noun_phrases_sample = noun_phrases[noun_phrases["article_id"].eq(188)]

In [103]:
# Load the WMD similarity results stored in Team 1's folder
noun_phrases_clean = pd.read_csv(f"{data_path_group}wmd_syn_clean.csv", index_col=0)

### Prepare `noun_phrases_clean` df

In [104]:
# Remove ent_id from the results; errors="ignore" keeps the cell re-runnable once the column is already gone
noun_phrases_clean = noun_phrases_clean.drop(columns="ent_id", errors="ignore")

In [105]:
# Preview the first three rows after dropping ent_id
noun_phrases_clean.head(3)

Unnamed: 0,sentence_id,noun_phrases,similarity_score,ontology_txt
0,188_3753_3772,which,0.4732733882113142,"['IS/IT', 'in', 'healthcares']"
1,188_3753_3772,we,0.5631700901822527,"['interorganizational', 'I']"
2,188_3753_3772,an information,0.6066101391194407,"['medical', 'information']"


In [106]:
# Clean column ontology txt: strip the list punctuation ([ ] , ') from the stringified token list so that
# e.g. "['medical', 'information']" becomes "medical information".
# The pattern is a raw string: in a normal string literal "\[" is an invalid escape sequence.
noun_phrases_clean["ontology_txt"] = noun_phrases_clean.ontology_txt.str.replace(r"[\[\],']", "", regex=True).str.strip()

In [107]:
# Preview the first three rows to check the cleaned ontology_txt values
noun_phrases_clean.head(3)

Unnamed: 0,sentence_id,noun_phrases,similarity_score,ontology_txt
0,188_3753_3772,which,0.4732733882113142,IS/IT in healthcares
1,188_3753_3772,we,0.5631700901822527,interorganizational I
2,188_3753_3772,an information,0.6066101391194407,medical information


In [108]:
# The cleaned ontology text is the closest ontology entry found for the phrase, so name the column accordingly
noun_phrases_clean = noun_phrases_clean.rename({"ontology_txt": "closest_entity"}, axis="columns")

In [109]:
# Look up each closest_entity among the ontology synonyms to obtain its main entity (entity_id),
# then keep only that id, renamed to main_entity_id.
# Left join: phrases without a matching synonym are kept with a missing main_entity_id.
noun_phrases_mapped = (
    noun_phrases_clean
    .merge(ontology_syn, how="left", left_on="closest_entity", right_on="synonym")
    .drop(columns=["category", "label", "synonym"])
    .rename(columns={"entity_id": "main_entity_id"})
)

In [110]:
# Display the mapped frame to inspect the new main_entity_id column
noun_phrases_mapped

Unnamed: 0,sentence_id,noun_phrases,similarity_score,closest_entity,main_entity_id
0,188_3753_3772,which,0.4732733882113142,IS/IT in healthcares,health information system
1,188_3753_3772,we,0.5631700901822527,interorganizational I,interorganizational system
2,188_3753_3772,an information,0.6066101391194407,medical information,healthcare data
3,188_3753_3772,communication resource,0.6281378354394097,IS/IT resource management,IT resource management
4,188_3753_3772,the Minitel system,0.5728775449540222,system in governments,government system
...,...,...,...,...,...
3247,188_3675_3704,mind,0.49290139466647426,thinking alouds,cognitive walkthrough
3248,188_3704_3727,the French people,0.5347515368856867,statistic the hypothesis,statistical hypothesis test
3249,188_3704_3727,the government,0.6226555134885214,government I,government system
3250,188_3704_3727,the French superhighway,0.529844106122733,theory of the commons,theory of the commons


In [111]:
# Preview the labelled sample rows (ent_id and true_ent_id are merged onto the results below)
noun_phrases_sample.head(3)

Unnamed: 0,article_id,sentence_id,ent_id,noun_phrases,true_ent_id
0,188,188_3753_3772,videotex,which,0
1,188,188_3753_3772,videotex,we,0
2,188,188_3753_3772,videotex,an information,0


### Map main entity and true entity onto the results

In [112]:
# Attach the phrase-level label (true_ent_id) from the labelled sample, matching on sentence and phrase.
# Left join: results without a labelled counterpart are kept with a missing true_ent_id.
noun_phrases_mapped_1 = pd.merge(
    noun_phrases_mapped,
    noun_phrases_sample[["sentence_id", "noun_phrases", "true_ent_id"]],
    how="left",
    on=["sentence_id", "noun_phrases"],
)

In [115]:
# Columns that decide whether two rows count as duplicates (similarity_score is deliberately not part of the key)
col_dupl = ["sentence_id", "noun_phrases", "closest_entity", "main_entity_id", "true_ent_id"]

# Keep the first row of every duplicate group and discard the rest
noun_phrases_mapped_1 = noun_phrases_mapped_1.drop_duplicates(subset=col_dupl, keep="first")

In [116]:
# Preview the de-duplicated results with the phrase-level true_ent_id attached
noun_phrases_mapped_1.head(3)

Unnamed: 0,sentence_id,noun_phrases,similarity_score,closest_entity,main_entity_id,true_ent_id
0,188_3753_3772,which,0.4732733882113142,IS/IT in healthcares,health information system,0
1,188_3753_3772,we,0.5631700901822527,interorganizational I,interorganizational system,0
2,188_3753_3772,an information,0.6066101391194407,medical information,healthcare data,0


In [118]:
# Second lookup against the labelled sample, this time at sentence level (labels found by Roland):
# true_ent_id_cmpl_sen is filled when the predicted main_entity_id appears as ent_id for the same sentence,
# and stays missing otherwise. Exact duplicate rows created by the join are removed afterwards.
noun_phrases_mapped_f = (
    noun_phrases_mapped_1
    .merge(
        noun_phrases_sample[["sentence_id", "ent_id"]],
        how="left",
        left_on=["sentence_id", "main_entity_id"],
        right_on=["sentence_id", "ent_id"],
    )
    .rename(columns={"ent_id": "true_ent_id_cmpl_sen"})
    .drop_duplicates()
)

In [122]:
# Preview the results with both label columns (true_ent_id and true_ent_id_cmpl_sen)
noun_phrases_mapped_f.head()

Unnamed: 0,sentence_id,noun_phrases,similarity_score,closest_entity,main_entity_id,true_ent_id,true_ent_id_cmpl_sen
0,188_3753_3772,which,0.4732733882113142,IS/IT in healthcares,health information system,0,
1,188_3753_3772,we,0.5631700901822527,interorganizational I,interorganizational system,0,
2,188_3753_3772,an information,0.6066101391194407,medical information,healthcare data,0,
3,188_3753_3772,communication resource,0.6281378354394097,IS/IT resource management,IT resource management,0,
4,188_3753_3772,the Minitel system,0.5728775449540222,system in governments,government system,0,


In [123]:
# Define function to combine both columns "true_ent_id" & "true_ent_id_cmpl_sen"
def true_ent_label(x):
  """Pick the label to evaluate one result row against.

  Args:
    x: a row exposing the attributes ``true_ent_id`` (phrase-level label) and
      ``true_ent_id_cmpl_sen`` (sentence-level label).

  Returns:
    ``x.true_ent_id`` when it holds a real label, otherwise ``x.true_ent_id_cmpl_sen``
    (which may itself be missing).
  """
  label = x.true_ent_id

  # "No label" can show up as the string "0", as the number 0 (mixed-dtype CSV column)
  # or as NaN/None (row had no partner in the left merge); all of them must fall back
  # to the sentence-level label instead of being returned as if they were a label.
  if pd.isna(label) or label in (0, "0"):
    return x.true_ent_id_cmpl_sen

  return label

In [124]:
# Resolve one label per row from the two label columns; rows where neither column
# yields a label end up as NaN and are then set to 0 (no match)
noun_phrases_mapped_f["true_entity_id"] = (
    noun_phrases_mapped_f
    .apply(true_ent_label, axis=1, result_type="expand")
    .fillna(0)
)

# The two source columns are superseded by true_entity_id
noun_phrases_mapped_f = noun_phrases_mapped_f.drop(["true_ent_id", "true_ent_id_cmpl_sen"], axis="columns")

In [129]:
# Inspect the last ten rows to check the combined true_entity_id column
noun_phrases_mapped_f.tail(10)


Unnamed: 0,sentence_id,noun_phrases,similarity_score,closest_entity,main_entity_id,true_entity_id
4159,188_3675_3704,Minitel,1.0,Minitel,videotex,videotex
4167,188_3675_3704,the French Government,0.5347083801639579,theory of the firms,transaction cost economics,0
4168,188_3675_3704,it,0.5129635599683585,not significants,no statistical significance,0
4169,188_3675_3704,the interests,0.5994099546083067,of the articles,literature filtering,0
4170,188_3675_3704,the French people,0.5347515368856867,statistic the hypothesis,statistical hypothesis test,0
4171,188_3675_3704,mind,0.4929013946664742,thinking alouds,cognitive walkthrough,0
4172,188_3704_3727,the French people,0.5347515368856867,statistic the hypothesis,statistical hypothesis test,0
4173,188_3704_3727,the government,0.6226555134885214,government I,government system,0
4174,188_3704_3727,the French superhighway,0.529844106122733,theory of the commons,theory of the commons,0
4175,188_3727_3737,The Minitel terminal,0.5364154021729209,operator interface terminal,user interface,0


In [130]:
# Flag rows where the predicted main entity equals the resolved "true" entity label
noun_phrases_mapped_f["correct_match"] = noun_phrases_mapped_f["main_entity_id"].eq(noun_phrases_mapped_f["true_entity_id"])

In [132]:
# Check results for one sample sentence, best score first.
# similarity_score is still text at this point (it is only cast to float further down), so sort on its
# numeric value rather than lexicographically; non-numeric entries become NaN and are listed last.
noun_phrases_mapped_f[noun_phrases_mapped_f.sentence_id=="188_5984_6028"].sort_values(
    by="similarity_score",
    ascending=False,
    key=lambda scores: pd.to_numeric(scores, errors="coerce"),
)

Unnamed: 0,sentence_id,noun_phrases,similarity_score,closest_entity,main_entity_id,true_entity_id,correct_match
788,188_5984_6028,information infrastructure,1.0,information infrastructure,information technology infrastructure,information technology infrastructure,True
799,188_5984_6028,application,1.0,application staffings,IT workforce,0,False
800,188_5984_6028,application,1.0,application staffings,IT recruiting,0,False
801,188_5984_6028,service development,0.6956238842940197,information service development,information services development,0,False
764,188_5984_6028,the private sector,0.6950479174448173,private sector,private sector,private sector,True
750,188_5984_6028,the public institutions,0.6888989628484203,public sector institutions,government,0,False
753,188_5984_6028,an information superhighway,0.6883462591864447,information superhighway,information superhighway,information superhighway,True
752,188_5984_6028,the development,0.6142808188071551,IS/IT development efforts,systems development effort,0,False
775,188_5984_6028,innovative communications,0.6058404258457725,nearsourcing of information and communications...,nearshoring,0,False
776,188_5984_6028,hardware receivers,0.6040103002645342,hardware,hardware,hardware,True


In [134]:
#remove the values where loop did not get an index ("not in index range")
# Compare on the text form so the cell also works when the column holds NaN or has already been
# cast to float (re-run); the marker is a literal, not a regular expression.
is_failed_lookup = noun_phrases_mapped_f.similarity_score.astype(str).str.contains("not in index range", regex=False)
# .copy() gives an independent frame, so later column assignments do not raise SettingWithCopyWarning
noun_phrases_mapped_f = noun_phrases_mapped_f[~is_failed_lookup].copy()

In [136]:
#convert str to float
# assign() returns a new frame instead of writing into the filtered slice, which avoids SettingWithCopyWarning
noun_phrases_mapped_f = noun_phrases_mapped_f.assign(
    similarity_score=noun_phrases_mapped_f["similarity_score"].astype(float)
)

### Calculate results within score bins

In [140]:
# Create bins of similarity score to check the number of correct matches compared to the labels of the database
# Ten equal-width bins over [0, 1]. include_lowest=True puts a score of exactly 0 into the first bin
# (otherwise it would belong to no bin and silently drop out of the totals); that bin is shown as (-0.001, 0.1].
score_bins = pd.cut(noun_phrases_mapped_f.similarity_score, np.linspace(0., 1., 11), include_lowest=True)

# observed=False keeps empty bins in the table and states the behaviour explicitly
grouped_df = noun_phrases_mapped_f.groupby(score_bins, observed=False)[["correct_match"]]

# Aggregate once and reuse: rows per bin and correct matches per bin
bin_totals = grouped_df.count()["correct_match"]
bin_hits = grouped_df.sum()["correct_match"]

corr_match_bins = pd.DataFrame({
    "total": bin_totals,
    "no_correct": bin_hits,
    "pct_correct_in_bin": np.round(bin_hits / bin_totals, 2),  # NaN for empty bins (0/0)
})
corr_match_bins = corr_match_bins[["total", "no_correct", "pct_correct_in_bin"]]
corr_match_bins = corr_match_bins.iloc[::-1] #reverse order

In [141]:
# Show the per-bin totals, correct matches and share of correct matches (highest score bin first)
corr_match_bins

Unnamed: 0_level_0,total,no_correct,pct_correct_in_bin
similarity_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(0.9, 1.0]",131,24,0.18
"(0.8, 0.9]",0,0,
"(0.7, 0.8]",70,22,0.31
"(0.6, 0.7]",497,110,0.22
"(0.5, 0.6]",524,33,0.06
"(0.4, 0.5]",303,5,0.02
"(0.3, 0.4]",0,0,
"(0.2, 0.3]",0,0,
"(0.1, 0.2]",0,0,
"(0.0, 0.1]",0,0,
