In [1]:
import requests
import json
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from time import sleep
from tqdm.auto import tqdm
import numpy as np
import ast
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = SentenceTransformer('all-mpnet-base-v2')

In [3]:
df_MAF=pd.read_csv('data/MAF_Evaluation.csv')

In [4]:
df_MAF.columns

Index(['Unnamed: 0', 'item', 'IAB_Candidates', 'scores'], dtype='object')

In [5]:
df_IAB=pd.read_csv('data/IAB_Evaluation.csv')

In [6]:
df_IAB.columns

Index(['IAB_Label', 'node_ids'], dtype='object')

In [7]:
iab_labels=df_IAB['IAB_Label'].to_list()

In [8]:
len(iab_labels)

160

In [9]:
maf=df_MAF['item'].to_list()

In [10]:
len(maf)

1876

In [11]:
for i,x in enumerate(maf):
    try: 
        x=x.replace("_"," ")
    except:
        pass
    maf[i]=x

In [12]:
call=iab_labels+maf

In [13]:
len(call)

2036

In [14]:
print(call[0:10])

['Hair Care', 'Shaving', 'Motorcycles', "Women's Formal Wear", "Women's Clothing", 'Retail Industry', 'Comedy Events', 'Household Supplies', 'Home Security', 'Polar Travel']


In [15]:
total_embeddings=[]

In [16]:
total_embeddings=model.encode(call)

In [None]:
iab_embeddings=total_embeddings[0:160]

In [None]:
type(iab_embeddings[0][0])

numpy.float32

In [None]:
maf_embeddings=total_embeddings[160:]

In [None]:
len(maf_embeddings)

942

In [None]:
def calculate_precision_recall(true_labels, predictions):
    true_positives = sum(1 for pred in predictions if pred in true_labels)
    false_positives = len(predictions) - true_positives
    false_negatives = len(true_labels) - true_positives
    
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    
    return precision, recall

In [None]:
def calculate_f1_score(precision, recall):
    if precision + recall == 0:
        return 0
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

In [None]:
nodes=df_MAF['item'].to_list()
node_id=df_MAF['node_id'].to_list()
IAB_Candidates=df_MAF['IAB_Candidates'].to_list()
H_Mappings=df_MAF['H_Scores'].to_list()

In [162]:
top_n_labels_new=[]
top_n_cosines_new=[]
top_n_nodes=[]
top_n_node_ids=[]
top_n_IAB_Candidates=[]
top_n_H_scores=[]
top_n_labels=[]
top_n_cosines=[]

for i in tqdm(range(len(maf_embeddings))):
    similarities=[]
    for j in range(len(iab_embeddings)):
        cos_sim=cosine_similarity(np.array(iab_embeddings[j]).reshape(1,-1),np.array(maf_embeddings[i]).reshape(1,-1))
        similarities.append([call[160+i],call[j],cos_sim])
    
    similarities.sort(key=lambda x: x[2], reverse=True)


    #threshold logic needed
    similarities=[x for x in similarities if x[2]>=0.3]

    
    
    if len(similarities)==0:
        similarities.append(['No_Pred','No_Pred',[[0]]])
    
    temp_iab_labels=[]
    temp_cosines=[]


    for items in similarities:
        #print(items[1])
        temp_iab_labels.append(items[1])
        temp_cosines.append(items[2][0])
        
        top_n_labels_new.append(items[1])
        top_n_cosines_new.append(items[2][0])
        top_n_nodes.append(nodes[i])
        top_n_node_ids.append(node_id[i])
        top_n_IAB_Candidates.append(IAB_Candidates[i])
        top_n_H_scores.append(H_Mappings[i])


    
    top_n_labels.append(temp_iab_labels)
    top_n_cosines.append(temp_cosines)


100%|██████████| 942/942 [01:08<00:00, 13.77it/s]


In [163]:
len(top_n_labels)

942

In [164]:
len(top_n_labels_new)

67703

In [165]:
len(top_n_cosines)

942

In [166]:
print(top_n_labels)
print(top_n_cosines)

[['Apartments', 'Houses', 'Grocery Shopping', 'Office Property', 'Gardening', 'Wedding', 'Camping', 'Desktops', 'Home Improvement', 'Museums & Galleries', 'SUV', 'Bath and Shower', 'Home & Garden', 'Pets', 'Cats', 'Sedan', 'Shopping', 'Malls & Shopping Centers', 'Retail Property', 'Station Wagon', 'Bars & Restaurants', 'Household Supplies', "Men's Clothing", 'Single Life', 'Hotels and Motels', "Women's Handbags and Wallets", 'Pickup Trucks', 'Hotel Properties', "Women's Clothing", 'Laptops', 'Family and Relationships', 'Real Estate', 'Dating', 'Parks & Nature', 'Prom', 'Home Appliances', 'Holiday Shopping', 'Retail Industry', 'Parenting', 'Style & Fashion', 'Dogs', 'Convertible', "Men's Casual Wear", 'Home Security', 'Cooking', 'Apparel Industry', 'Movies', 'Birds', 'Awards Shows', 'Birthday', 'Auto Rentals', 'Pop Culture', 'Home Entertaining', 'Coupe', 'Dining Out', 'Alcoholic Beverages', 'Books and Literature', 'Sports', 'Van', 'Automotive', 'Cruises', 'Smartphones', 'Reptiles', 'Lan

In [167]:
MAF_predictions=df_MAF['IAB_Candidates'].to_list()

In [168]:
MAF_predictions[230]

"['Business and Finance', 'Education industry', 'Entertainment Industry', 'Financial Industry', 'Healthcare Industry', 'Logistics and Transportation Industry', 'Media Industry', 'Technology Industry', 'Telecommunications Industry', 'Automotive Industry', 'Aviation Industry', 'Events and Attractions', 'Fashion Events', 'Concerts & Music Events', 'Medical Health', 'Travel', 'Africa Travel', 'Asia Travel', 'Australia and Oceania Travel', 'Europe Travel', 'North America Travel', 'South America Travel', 'Air Travel', 'Entertainment Industry', 'Media Industry']"

In [169]:
candidate_count=0
hit_count=0
precision=0
recall=0
count=0
f1=0

for predictions, labels in zip(MAF_predictions,top_n_labels):
    count+=1
    proper=ast.literal_eval(predictions)
    prec, reca=calculate_precision_recall(predictions, labels)

    f=calculate_f1_score(prec, reca)
    precision+=prec
    recall+=reca
    f1+=f


    candidate_count+=len(proper)
    # for items in labels:
    #     if items in proper:
    #         hit_count+=1

    for items in proper:
        if items in labels:
            hit_count+=1

In [170]:
hit_count/candidate_count

0.8345950037850114

In [171]:
precision/942

0.03531567950914199

In [172]:
recall/942

0.06662262266223341

In [173]:
f1/942

0.03858127262125235

In [174]:
count

942

In [175]:
candidate_count

2642

In [180]:
dic={'item':top_n_nodes, 'node_id': top_n_node_ids, 'IAB_Candidates': top_n_IAB_Candidates,
'predictions': top_n_labels_new, 'cosine_predicted': top_n_cosines_new, "H_Mapping_scores": top_n_H_scores }

In [181]:
df_MAF_new=pd.DataFrame.from_dict(dic)

In [182]:
df_MAF_new.to_excel('results/SBERT_results_0.15_new.xlsx')

In [187]:
df_MAF_new.shape

(67703, 6)

In [None]:
#Top 100 in predictions but not in embedding

In [183]:
df_MAF_new=df_MAF_new.sort_values(by='cosine_predicted', ascending=False)

In [184]:
df_MAF_new.iloc[0]

item                                                             pets
node_id                                                          2657
IAB_Candidates              ['Pets', 'Cats', 'Dogs', 'Large Animals']
predictions                                                      Pets
cosine_predicted                                          [1.0000004]
H_Mapping_scores    [0.931252259, 0.840195182, 0.8419054970000001,...
Name: 18053, dtype: object

In [188]:
top_100_node=[]
top_100_embedding_pred=[]
top_100_H_pred=[]
top_100_cos_pred=[]
top_100_H_score=[]

In [189]:
items=df_MAF_new['item'].to_list()
IAB_Candidates=df_MAF_new['IAB_Candidates'].to_list()
predictions=df_MAF_new['predictions'].to_list()
cosine_predicted=df_MAF_new['cosine_predicted'].to_list()
H_Mapping_scores=df_MAF_new['H_Mapping_scores'].to_list()

In [190]:
for item,IAB,pred,cos,H_map in tqdm(zip(items,IAB_Candidates,predictions,cosine_predicted,H_Mapping_scores)):
    pred_list=ast.literal_eval(IAB)
    if pred not in pred_list:
        top_100_node.append(item)
        top_100_embedding_pred.append(pred)
        top_100_H_pred.append(IAB)
        top_100_cos_pred.append(cos)
        top_100_H_score.append(H_map)
    if len(top_100_node)==100:
        break

264it [00:00, 17042.14it/s]


In [191]:
top_100_dic={'items':top_100_node, 'Embedding Predictions': top_100_embedding_pred,
 "H_Mapping_Pred": top_100_H_pred, 'Predicted Cosine': top_100_cos_pred, 
 'H Mapping Cosine': top_100_H_score}

In [192]:
df_top_100=pd.DataFrame.from_dict(top_100_dic)

In [193]:
df_top_100.to_excel('results/SBERT_Top_100_Not_In_H.xlsx', index=False)

In [194]:
#Top 100 in H but not in Embedding

In [195]:
print(top_n_labels[5])
print(top_n_cosines[5])

['Pets', 'Dogs', 'Cats', 'Large Animals', 'Reptiles', 'Birds', 'Pickup Trucks', 'Coupe', 'Sports', 'SUV', 'Sedan', 'Alcoholic Beverages', 'Birthday', 'Science', 'Luxury Cars', 'Television', 'Beauty', 'Fine Art', 'Houses', 'Video Gaming', 'Real Estate', 'World Cuisines', 'Consumer Electronics', 'Station Wagon', 'Automotive', 'Single Life', 'Home Improvement', 'Laptops', 'Parks & Nature', 'Motorcycles', 'Convertible', 'Hair Care', 'Pop Culture', 'Home & Garden', 'Home Appliances', 'Van', 'Musical Instruments', 'Minivan', 'Holiday Shopping', "Men's Clothing", 'Zoos & Aquariums', 'Gardening', 'Prom', 'Food & Drink', 'Retail Property', 'Shaving', "Women's Clothing", 'Polar Travel', 'Grocery Shopping', 'Commercial Trucks', 'Style & Fashion', 'Cooking', 'Auto Parts', 'Smartphones', 'Graduation', 'Wellness', 'Landscaping']
[array([0.498584], dtype=float32), array([0.47203004], dtype=float32), array([0.44442818], dtype=float32), array([0.39049444], dtype=float32), array([0.34085548], dtype=floa

In [196]:
nodes=df_MAF['item'].to_list()
# node_id=df_MAF['node_id'].to_list()
IAB_Candidates=df_MAF['IAB_Candidates'].to_list()
H_Mappings=df_MAF['H_Scores'].to_list()

In [197]:
mid_node=[]
mid_IAB=[]
mid_label=[]
mid_cos=[]
mid_H_map=[]

for node, IAB, label, cos, H_map in tqdm(zip(nodes,IAB_Candidates,top_n_labels,top_n_cosines,H_Mappings)):
    IAB_list=ast.literal_eval(IAB)
    H_Map_list=ast.literal_eval(H_map)
    for IAB_item, H_Map_item in zip(IAB_list,H_Map_list):
        mid_node.append(node)
        mid_IAB.append(IAB_item)
        mid_label.append(label)
        mid_cos.append(cos)
        mid_H_map.append(H_Map_item)

942it [00:00, 36248.69it/s]


In [198]:
dic={'item':mid_node, 'IAB_Candidates': mid_IAB,
'predictions': mid_label, 'cosine_predicted': mid_cos, "H_Mapping_scores": mid_H_map}

In [199]:
df_lol_100=pd.DataFrame.from_dict(dic)

In [200]:
df_lol_100.head()

Unnamed: 0,item,IAB_Candidates,predictions,cosine_predicted,H_Mapping_scores
0,closet,Style & Fashion,"[Apartments, Houses, Grocery Shopping, Office ...","[[0.4207727], [0.40638095], [0.35840812], [0.3...",0.726366
1,closet,Women's Clothing,"[Apartments, Houses, Grocery Shopping, Office ...","[[0.4207727], [0.40638095], [0.35840812], [0.3...",0.718773
2,closet,Men's Clothing,"[Apartments, Houses, Grocery Shopping, Office ...","[[0.4207727], [0.40638095], [0.35840812], [0.3...",0.723078
3,closet,Men's Formal Wear,"[Apartments, Houses, Grocery Shopping, Office ...","[[0.4207727], [0.40638095], [0.35840812], [0.3...",0.710873
4,rosehip,Flower Shopping,"[Cruises, Flower Shopping, Van, Parks & Nature...","[[0.39731506], [0.36966306], [0.3360331], [0.3...",0.742494


In [201]:
df_lol_100=df_lol_100.sort_values(by='H_Mapping_scores', ascending=False)

In [202]:
df_lol_100.head()

Unnamed: 0,item,IAB_Candidates,predictions,cosine_predicted,H_Mapping_scores
2613,wagon,Travel,"[Station Wagon, SUV, Sedan, Minivan, Van, Pick...","[[0.773705], [0.56584173], [0.5654195], [0.559...",1.0
2614,drop,Real Estate,"[Shopping, Grocery Shopping, Birthday, Pickup ...","[[0.30240864], [0.27807486], [0.25866687], [0....",1.0
2615,drop,Travel,"[Shopping, Grocery Shopping, Birthday, Pickup ...","[[0.30240864], [0.27807486], [0.25866687], [0....",1.0
2629,mousetrap,Home & Garden,"[Desktops, Pets, Cats, Video Gaming, Children'...","[[0.30631667], [0.27378374], [0.26512915], [0....",1.0
2626,ring_mail,Events and Attractions,"[Smartphones, Telecommunications Industry, Wom...","[[0.29209846], [0.2710069], [0.25701064], [0.2...",1.0


In [203]:
top_100_node=[]
top_100_embedding_pred=[]
top_100_H_pred=[]
top_100_cos_pred=[]
top_100_H_score=[]

In [204]:
items=df_lol_100['item'].to_list()
IAB_Candidates=df_lol_100['IAB_Candidates'].to_list()
predictions=df_lol_100['predictions'].to_list()
cosine_predicted=df_lol_100['cosine_predicted'].to_list()
H_Mapping_scores=df_lol_100['H_Mapping_scores'].to_list()

In [205]:
for item,IAB,pred,cos,H_map in tqdm(zip(items,IAB_Candidates,predictions,cosine_predicted,H_Mapping_scores)):
    #pred_list=ast.literal_eval(pred)
    if IAB not in pred:
        top_100_node.append(item)
        top_100_embedding_pred.append(pred)
        top_100_H_pred.append(IAB)
        top_100_cos_pred.append(cos)
        top_100_H_score.append(H_map)
    if len(top_100_node)==100:
        break

338it [00:00, 289380.44it/s]


In [206]:
top_100_dic={'items':top_100_node, 'Embedding Predictions': top_100_embedding_pred,
 "H_Mapping_Pred": top_100_H_pred, 'Predicted Cosine': top_100_cos_pred, 
 'H Mapping Cosine': top_100_H_score}

In [207]:
df_top_100=pd.DataFrame.from_dict(top_100_dic)

In [208]:
df_top_100.to_excel('results/SBERT_Top_100_Not_In_SBERT.xlsx', index=False)