In [96]:
import requests
import json
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from time import sleep
from tqdm.auto import tqdm
import numpy as np
import ast
from sentence_transformers import SentenceTransformer

In [97]:
model = SentenceTransformer('intfloat/e5-large-v2')

In [98]:
df_MAF=pd.read_csv('data/MAF_Evaluation.csv')

In [99]:
df_MAF.columns

Index(['item', 'node_id', 'IAB_Candidates', 'H_Scores'], dtype='object')

In [100]:
df_IAB=pd.read_csv('data/IAB_Evaluation.csv')

In [101]:
df_IAB.columns

Index(['IAB_Label', 'node_ids'], dtype='object')

In [102]:
iab_labels=df_IAB['IAB_Label'].to_list()

In [103]:
len(iab_labels)

160

In [104]:
maf=df_MAF['item'].to_list()

In [105]:
len(maf)

942

In [106]:
for i,x in enumerate(maf):
    try: 
        x=x.replace("_"," ")
    except:
        pass
    maf[i]=x

In [107]:
call=iab_labels+maf

In [108]:
len(call)

1102

In [109]:
print(call[0:10])

['Hair Care', 'Shaving', 'Motorcycles', "Women's Formal Wear", "Women's Clothing", 'Retail Industry', 'Comedy Events', 'Household Supplies', 'Home Security', 'Polar Travel']


In [110]:
total_embeddings=[]

In [111]:
total_embeddings=model.encode(call, normalize_embeddings=True)

In [112]:
iab_embeddings=total_embeddings[0:160]

In [113]:
type(iab_embeddings[0][0])

numpy.float32

In [114]:
maf_embeddings=total_embeddings[160:]

In [115]:
len(maf_embeddings)

942

In [116]:
iab_embeddings[0]

array([ 0.01664113, -0.05357603,  0.04887162, ..., -0.06031791,
        0.02185748,  0.03875877], dtype=float32)

In [117]:
maf_embeddings[0]

array([ 0.01973327, -0.04324522,  0.01025429, ..., -0.02802336,
        0.00969677,  0.05434723], dtype=float32)

In [118]:
def calculate_precision_recall(true_labels, predictions):
    true_positives = sum(1 for pred in predictions if pred in true_labels)
    false_positives = len(predictions) - true_positives
    false_negatives = len(true_labels) - true_positives
    
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    
    return precision, recall

In [119]:
def calculate_f1_score(precision, recall):
    if precision + recall == 0:
        return 0
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

In [120]:
nodes=df_MAF['item'].to_list()
node_id=df_MAF['node_id'].to_list()
IAB_Candidates=df_MAF['IAB_Candidates'].to_list()
H_Mappings=df_MAF['H_Scores'].to_list()

In [121]:
cos_sim=cosine_similarity(np.array(iab_embeddings[98]).reshape(1,-1),np.array(maf_embeddings[1]).reshape(1,-1))
print(cos_sim)

[[0.7622761]]


In [122]:
top_n_labels_new=[]
top_n_cosines_new=[]
top_n_nodes=[]
top_n_node_ids=[]
top_n_IAB_Candidates=[]
top_n_H_scores=[]
top_n_labels=[]
top_n_cosines=[]

for i in tqdm(range(len(maf_embeddings))):
    similarities=[]
    for j in range(len(iab_embeddings)):
        cos_sim=cosine_similarity(np.array(iab_embeddings[j]).reshape(1,-1),np.array(maf_embeddings[i]).reshape(1,-1))
        similarities.append([call[160+j],call[j],cos_sim])
    
    similarities.sort(key=lambda x: x[2], reverse=True)


    #threshold logic needed
    similarities=[x for x in similarities if x[2]>=0.7375]

    
    
    if len(similarities)==0:
        similarities.append(['No_Pred','No_Pred',[[0]]])
    
    temp_iab_labels=[]
    temp_cosines=[]


    for items in similarities:
        #print(items[1])
        temp_iab_labels.append(items[1])
        temp_cosines.append(items[2][0])
        
        top_n_labels_new.append(items[1])
        top_n_cosines_new.append(items[2][0])
        top_n_nodes.append(nodes[i])
        top_n_node_ids.append(node_id[i])
        top_n_IAB_Candidates.append(IAB_Candidates[i])
        top_n_H_scores.append(H_Mappings[i])


    
    top_n_labels.append(temp_iab_labels)
    top_n_cosines.append(temp_cosines)


  0%|          | 0/942 [00:00<?, ?it/s]

100%|██████████| 942/942 [01:07<00:00, 13.98it/s]


In [123]:
len(top_n_labels)

942

In [124]:
len(top_n_labels_new)

101008

In [125]:
len(top_n_cosines)

942

In [126]:
# print(top_n_labels)
# print(top_n_cosines)

In [127]:
MAF_predictions=df_MAF['IAB_Candidates'].to_list()

In [128]:
MAF_predictions[230]

"['Business and Finance', 'Education industry', 'Entertainment Industry', 'Financial Industry', 'Healthcare Industry', 'Logistics and Transportation Industry', 'Media Industry', 'Technology Industry', 'Telecommunications Industry', 'Automotive Industry', 'Aviation Industry', 'Events and Attractions', 'Fashion Events', 'Concerts & Music Events', 'Medical Health', 'Travel', 'Africa Travel', 'Asia Travel', 'Australia and Oceania Travel', 'Europe Travel', 'North America Travel', 'South America Travel', 'Air Travel', 'Entertainment Industry', 'Media Industry']"

In [129]:
candidate_count=0
hit_count=0
precision=0
recall=0
count=0
f1=0

for predictions, labels in zip(MAF_predictions,top_n_labels):
    count+=1
    proper=ast.literal_eval(predictions)
    prec, reca=calculate_precision_recall(predictions, labels)

    f=calculate_f1_score(prec, reca)
    precision+=prec
    recall+=reca
    f1+=f


    candidate_count+=len(proper)
    for items in labels:
        if items in proper:
            hit_count+=1

In [130]:
hit_count/candidate_count

0.8713096139288418

In [131]:
precision/942

0.028025503377093076

In [132]:
recall/942

0.06680706646499002

In [133]:
f1/942

0.030797604420095312

In [134]:
count

942

In [135]:
candidate_count

2642

In [136]:
dic={'item':top_n_nodes, 'node_id': top_n_node_ids, 'IAB_Candidates': top_n_IAB_Candidates,
'predictions': top_n_labels_new, 'cosine_predicted': top_n_cosines_new, "H_Mapping_scores": top_n_H_scores }

In [137]:
df_MAF_new=pd.DataFrame.from_dict(dic)

In [139]:
df_MAF_new.to_excel('results/e5_results_0.7375_new.xlsx')

In [140]:
df_MAF_new.shape

(101008, 6)

In [141]:
#Top 100 in predictions but not in embedding

In [142]:
df_MAF_new=df_MAF_new.sort_values(by='cosine_predicted', ascending=False)

In [143]:
df_MAF_new.iloc[0]

item                convertible
node_id                    6584
IAB_Candidates       ['Travel']
predictions         Convertible
cosine_predicted    [1.0000002]
H_Mapping_scores          [1.0]
Name: 11398, dtype: object

In [144]:
top_100_node=[]
top_100_embedding_pred=[]
top_100_H_pred=[]
top_100_cos_pred=[]
top_100_H_score=[]

In [145]:
items=df_MAF_new['item'].to_list()
IAB_Candidates=df_MAF_new['IAB_Candidates'].to_list()
predictions=df_MAF_new['predictions'].to_list()
cosine_predicted=df_MAF_new['cosine_predicted'].to_list()
H_Mapping_scores=df_MAF_new['H_Mapping_scores'].to_list()

In [146]:
for item,IAB,pred,cos,H_map in tqdm(zip(items,IAB_Candidates,predictions,cosine_predicted,H_Mapping_scores)):
    pred_list=ast.literal_eval(IAB)
    if pred not in pred_list:
        top_100_node.append(item)
        top_100_embedding_pred.append(pred)
        top_100_H_pred.append(IAB)
        top_100_cos_pred.append(cos)
        top_100_H_score.append(H_map)
    if len(top_100_node)==100:
        break

240it [00:00, 7508.43it/s]


In [147]:
top_100_dic={'items':top_100_node, 'Embedding Predictions': top_100_embedding_pred,
 "H_Mapping_Pred": top_100_H_pred, 'Predicted Cosine': top_100_cos_pred, 
 'H Mapping Cosine': top_100_H_score}

In [148]:
df_top_100=pd.DataFrame.from_dict(top_100_dic)

In [149]:
df_top_100.to_excel('results/e5_Top_100_Not_In_H.xlsx', index=False)

In [None]:
#Top 100 in H but not in Embedding

In [150]:
print(top_n_labels[5])
print(top_n_cosines[5])

['Dogs', 'Large Animals', 'Cats', 'Pets', 'Commercial Trucks', 'Reptiles', 'Polar Travel', 'Shaving', 'Pickup Trucks', 'Grocery Shopping', 'SUV', 'Smart Home', 'Oral care', 'Single Life', 'Bath and Shower', 'Home Security', 'Perfume and Fragrance', 'Healthy Living', 'Zoos & Aquariums', 'Sedan', 'Fitness and Exercise', 'Wellness', 'Station Wagon', 'Hair Care', 'Parks & Nature', 'Luxury Cars', 'Auto Type', 'North America Travel', 'Parenting', 'Beauty', 'Home Improvement', 'Medical Health', 'Family and Relationships', 'Hobbies & Interests', 'Home Appliances', 'Automotive', 'Auto Repair', 'Graduation', 'Convertible', 'Deodorant and Antiperspirant', 'Smartphones', 'Holiday Shopping', 'Science', 'Minivan', 'Music and Audio', 'Makeup and Accessories', 'Camping', 'Prom', 'Daycare and Pre-School', 'Birthday', 'Auto Parts', 'Birds', 'Style & Fashion', 'Cameras and Camcorders', 'Desktops', 'Home & Garden', 'Video Gaming', 'Laptops', 'Motorcycles', 'Home Entertaining', 'Healthcare Industry', 'Mall

In [151]:
nodes=df_MAF['item'].to_list()
# node_id=df_MAF['node_id'].to_list()
IAB_Candidates=df_MAF['IAB_Candidates'].to_list()
H_Mappings=df_MAF['H_Scores'].to_list()

In [152]:
mid_node=[]
mid_IAB=[]
mid_label=[]
mid_cos=[]
mid_H_map=[]

for node, IAB, label, cos, H_map in tqdm(zip(nodes,IAB_Candidates,top_n_labels,top_n_cosines,H_Mappings)):
    IAB_list=ast.literal_eval(IAB)
    H_Map_list=ast.literal_eval(H_map)
    for IAB_item, H_Map_item in zip(IAB_list,H_Map_list):
        mid_node.append(node)
        mid_IAB.append(IAB_item)
        mid_label.append(label)
        mid_cos.append(cos)
        mid_H_map.append(H_Map_item)

942it [00:00, 16919.32it/s]


In [153]:
dic={'item':mid_node, 'IAB_Candidates': mid_IAB,
'predictions': mid_label, 'cosine_predicted': mid_cos, "H_Mapping_scores": mid_H_map}

In [154]:
df_lol_100=pd.DataFrame.from_dict(dic)

In [155]:
df_lol_100.head()

Unnamed: 0,item,IAB_Candidates,predictions,cosine_predicted,H_Mapping_scores
0,closet,Style & Fashion,"[Shopping, Convertible, Camping, Birthday, Van...","[[0.81433314], [0.8021967], [0.80011946], [0.7...",0.726366
1,closet,Women's Clothing,"[Shopping, Convertible, Camping, Birthday, Van...","[[0.81433314], [0.8021967], [0.80011946], [0.7...",0.718773
2,closet,Men's Clothing,"[Shopping, Convertible, Camping, Birthday, Van...","[[0.81433314], [0.8021967], [0.80011946], [0.7...",0.723078
3,closet,Men's Formal Wear,"[Shopping, Convertible, Camping, Birthday, Van...","[[0.81433314], [0.8021967], [0.80011946], [0.7...",0.710873
4,rosehip,Flower Shopping,"[Flower Shopping, Oral care, Hair Care, Green ...","[[0.77521855], [0.77189094], [0.76413876], [0....",0.742494


In [156]:
df_lol_100=df_lol_100.sort_values(by='H_Mapping_scores', ascending=False)

In [157]:
df_lol_100.head()

Unnamed: 0,item,IAB_Candidates,predictions,cosine_predicted,H_Mapping_scores
2613,wagon,Travel,"[Station Wagon, Van, Sedan, SUV, Coupe, Campin...","[[0.883122], [0.8603968], [0.8480902], [0.8441...",1.0
2614,drop,Real Estate,"[Graduation, Convertible, Dating, Science, Cou...","[[0.8278803], [0.82517684], [0.81389654], [0.8...",1.0
2615,drop,Travel,"[Graduation, Convertible, Dating, Science, Cou...","[[0.8278803], [0.82517684], [0.81389654], [0.8...",1.0
2629,mousetrap,Home & Garden,"[Cats, Convertible, Reptiles, Pets, Station Wa...","[[0.80222684], [0.7876469], [0.7850667], [0.77...",1.0
2626,ring_mail,Events and Attractions,"[Home Security, Smart Home, Wedding, Home Ente...","[[0.7973201], [0.796212], [0.7920143], [0.7918...",1.0


In [158]:
top_100_node=[]
top_100_embedding_pred=[]
top_100_H_pred=[]
top_100_cos_pred=[]
top_100_H_score=[]

In [159]:
items=df_lol_100['item'].to_list()
IAB_Candidates=df_lol_100['IAB_Candidates'].to_list()
predictions=df_lol_100['predictions'].to_list()
cosine_predicted=df_lol_100['cosine_predicted'].to_list()
H_Mapping_scores=df_lol_100['H_Mapping_scores'].to_list()

In [160]:
for item,IAB,pred,cos,H_map in tqdm(zip(items,IAB_Candidates,predictions,cosine_predicted,H_Mapping_scores)):
    #pred_list=ast.literal_eval(pred)
    if IAB not in pred:
        top_100_node.append(item)
        top_100_embedding_pred.append(pred)
        top_100_H_pred.append(IAB)
        top_100_cos_pred.append(cos)
        top_100_H_score.append(H_map)
    if len(top_100_node)==100:
        break

1005it [00:00, 269088.77it/s]


In [161]:
top_100_dic={'items':top_100_node, 'Embedding Predictions': top_100_embedding_pred,
 "H_Mapping_Pred": top_100_H_pred, 'Predicted Cosine': top_100_cos_pred, 
 'H Mapping Cosine': top_100_H_score}

In [162]:
df_top_100=pd.DataFrame.from_dict(top_100_dic)

In [163]:
df_top_100.to_excel('results/e5_Top_100_Not_In_e5.xlsx', index=False)