In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
DATASET_PATH = '/kaggle/input/influencer-and-brand-dataset-smaller-features'

In [3]:
brands = pd.read_csv(os.path.join(DATASET_PATH, 'brand_combined_features.csv'))
influencers = pd.read_csv(os.path.join(DATASET_PATH, 'influencer_combined_features.csv'))

# for each Instagram account (either that of a brand or of a influencer), from each of their 50 posts, 
# we extract 25088 visual features (from post images, using pretrained VGG-16 model) and
# 300 textual features (from post captions, using spaCy library)

# hence, in all, from each post: 25088 + 300 = 25388 features

# then, over all the 50 posts of every account, we take the average of the features of all the posts
# hence, each account will be represented by 25388 features

# brand_combined_features.csv: contains all the features (as detailed above) for the 90 brand accounts
# influencer_combined_features.csv: contains all the features (as detailed above) for the 1150 influencer accounts

# 101 microinfluencers are getting repeated in the dataframe/excel file!

# hence, the number of unique microinfluencers = total - repeating = 1150 - 101 = 1049

print(brands.shape)
print('')

print(brands.head())
print('')

print(brands.tail())
print('')

print(influencers.shape)
print('')

print(influencers.head())
print('')

print(influencers.tail())
print('')

print('\n')

brands_list = list(brands['username'])

print(brands_list)
print('')

(89, 25389)

        username         0         1         2         3         4         5  \
0        24k.mag  1.449159 -1.649709  0.790643 -1.594844  1.220980  2.190517   
1  acemarksshoes  0.567023  0.509280 -0.946712 -0.164634  1.894553  0.146237   
2    airmaxkicks  0.304050  2.071166 -2.260535  1.493369  1.157146  0.769862   
3  angelusdirect  0.672281 -0.153045 -0.934472 -0.818101  1.403500  0.692561   
4  backtominimal  0.012063 -0.256059 -0.737451 -0.686560  1.484131  1.278554   

          6         7         8  ...     25378     25379     25380     25381  \
0  1.597013 -0.016088 -2.461812  ...  0.849440  0.759347  0.168444  0.290002   
1  1.504280  1.555754 -1.020325  ...  0.063693  0.152824  0.063699  0.531436   
2  0.434513  1.448903 -1.910376  ...  0.312810  0.389381  0.129467  0.077005   
3  1.875513  0.565864 -1.445671  ...  0.218277  0.203993  0.106722  0.054129   
4  0.794310  0.527335 -1.060499  ...  0.496977  0.624787  0.208957  0.156278   

      25382     25383    

In [4]:
# creating a dictionary of those 2 dataframes, the account username (of either brand or influencer)
# being the key, and the array of features being the value

brand_features = dict(zip(brands['username'], brands.drop(columns = ['username']).values))
influencer_features = dict(zip(influencers['username'], influencers.drop(columns = ['username']).values))

print(len(brand_features))
print('')

print(len(influencer_features))
print('')

# getting 1 brand less, and 2 influencers more!
# i.e., we should be getting 90 (unique) brands (instead of 89) and 1049 (unique) influencers (instead of 1051)

# 'pasmag' brand has been left out!!

print(brand_features)
print('')

print(brand_features['24k.mag'])
print('')

print(len(brand_features['24k.mag']))
print('')

89

1051

{'24k.mag': array([ 1.4491594 , -1.6497089 ,  0.790643  , ...,  0.22922634,
        0.15800056,  0.12135798]), 'acemarksshoes': array([ 0.56702286,  0.50928026, -0.9467123 , ...,  0.15017195,
        0.0678497 ,  0.21821186]), 'airmaxkicks': array([ 0.30404958,  2.0711663 , -2.260535  , ...,  0.11611696,
        0.03392881,  0.1468502 ]), 'angelusdirect': array([ 0.67228127, -0.15304472, -0.93447214, ...,  0.01985469,
        0.11686733,  0.03753393]), 'backtominimal': array([ 0.01206339, -0.25605917, -0.73745066, ...,  0.01722349,
        0.02995477,  0.04259473]), 'bellabelleshoes': array([ 0.6441864 ,  0.2713913 , -1.186392  , ...,  0.22106172,
        0.12597741,  0.23150235]), 'boosthaven': array([-0.96492743, -0.19557185, -1.552698  , ...,  0.08951952,
        0.15269665,  0.14787431]), 'bump_official': array([ 0.16063268, -1.4405837 ,  0.34752032, ...,  0.12860177,
        0.38139284,  0.5853847 ]), 'jordanaddict': array([-0.02710151, -0.7226696 , -0.7680729 , ...,  0.

In [5]:
# labels.csv: establishes the mapping between brands and influencers, via label (1: denotes associativity, 0: no associativity)

# labels.csv contains the correct number of brands (90) and influencers (1049)

labels_df = pd.read_csv(os.path.join(DATASET_PATH, 'labels.csv'))

### dropping brand 'pasmag'

labels_df = labels_df.drop(labels_df[labels_df['brand_username'] == 'pasmag'].index)

###

print(labels_df.head(20))
print('')

     brand_username  influencer_username  label
1049    acuracanada       aemelectronics      0
1050    acuracanada             240pauly      0
1051    acuracanada               _crvn_      0
1052    acuracanada        04poswrxwagon      0
1053    acuracanada             4xspower      0
1054    acuracanada        2jz_lelandsti      0
1055    acuracanada     adphotosofficial      0
1056    acuracanada                25thh      0
1057    acuracanada      _elliottpowell_      0
1058    acuracanada             _s2klean      0
1059    acuracanada              1evo2h8      0
1060    acuracanada  1down5upphotography      0
1061    acuracanada           marcustroy      1
1062    acuracanada           importfest      1
1063    acuracanada   pikespeakhillclimb      1
1064    acuracanada      officialmidohio      1
1065    acuracanada       cjwilsonracing      1
1066    acuracanada          drivingline      1
1067    acuracanada           nyautoshow      1
1068    acuracanada      cdnintlautoshow

In [6]:
# categorising brands and microinfluencers as per category

mappings = pd.read_excel('/kaggle/input/influencer-and-brand-dataset-smaller-features/all_positive (mapping).xls')

### dropping brand 'pasmag'

mappings = mappings.drop(mappings[mappings['Brand'] == 'pasmag'].index)

###

print(mappings.head())
print('')

print(mappings.tail())
print('')

brand_to_category_mapping = {}
influencer_to_category_mapping = {}

for category_name, category_dataframe in mappings.groupby('Category'):
  category_specific_brands_array = category_dataframe['Brand'].unique()
  brand_to_category_mapping[category_name] = category_specific_brands_array

  category_specific_influencers_array = category_dataframe['Micro_influencer'].unique()
  influencer_to_category_mapping[category_name] = category_specific_influencers_array
  
print(brand_to_category_mapping)
print('')

print(influencer_to_category_mapping)
print('')

print(type(influencer_to_category_mapping['auto']))

# creating an array of all the 90 brand names

all_brand_names = []

for category_brands in brand_to_category_mapping.values():
    for brand in category_brands:
        all_brand_names.append(brand)

all_brand_names = np.array(all_brand_names)

print(all_brand_names)
print('')

print(len(all_brand_names))
print('')


   Category        Brand    Micro_influencer
12     auto  acuracanada          marcustroy
13     auto  acuracanada          importfest
14     auto  acuracanada  pikespeakhillclimb
15     auto  acuracanada     officialmidohio
16     auto  acuracanada      cjwilsonracing

     Category         Brand Micro_influencer
1145    shoes  streetwearde         air__way
1146    shoes  streetwearde      _ruucakana_
1147    shoes  streetwearde  anddrew_johnson
1148    shoes  streetwearde         717soles
1149    shoes  streetwearde             ae6x

{'auto': array(['acuracanada', 'astonmartinlagonda', 'audi', 'bentleymotors',
       'bmw', 'cadillac', 'chevrolet', 'ford', 'mitsubishimotors',
       'nissan', 'porsche', 'ramtrucks', 'skodamotorsport',
       'smart_worldwide'], dtype=object), 'clothing': array(['aboutyoude', 'abyssbyabby', 'all.streetwear', 'bebe_stores',
       'nellycom', 'nikesb', 'nlymancom', 'oohlaluxe', 'parisianofficial',
       'pepejeans', 'touchdolls', 'twinsetofficial', 'v

# Model

In [7]:
import random
from collections import defaultdict

import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [8]:
class DatasetPrep:
    def __init__(self, labels_df, brand_features, influencer_features, brand_to_category_mapping, influencer_to_category_mapping):
        
        # brand_features,influencer_features: dictionaries containing all the data (account username: key, array of features: value)
        
        self.brand_features = brand_features
        self.influencer_features = influencer_features

        self.brand_to_pos = defaultdict(list) # dictionary mapping all the brands to all of their positive (associated) influencers
        self.brand_to_neg = defaultdict(list) # dictionary mapping all the brands to all of their negative (non-associated) influencers

        for _, row in labels_df.iterrows():
            b = row['brand_username']
            i = row['influencer_username']
            y = row['label']

            # in any case, if the brand or influencer is not present in created dictionaries (they are skipped and the loop continues)
            
            if b not in brand_features or i not in influencer_features:
                continue

            # for brand b, adding positive influencers (y = 1) to brand_to_pos (brand_to_pos[b], key:b) dictionary, 
            # and negative influencers (y = 0) to brand_to_neg (brand_to_neg[b], key:b) dictionary
            
            if y == 1:
                self.brand_to_pos[b].append(i)
            else:
                self.brand_to_neg[b].append(i)

            # retaining all the brands in an array valid_brands

            self.valid_brands = []

            for brand in self.brand_to_pos:
                self.valid_brands.append(brand)


    def candidate_microinfluencers(self, brand_name, brand_features, influencer_features):
        # here, preparing a pool of 25 candidate micro-influencers for brand brand_name 
        # of the pool of 25, 6 positive influencers are chosen, and 19 negative ones are chosen

        # -> try a pool of 10, 3 positives and 7 negatives
        # try a pool of 25, 6 positives and 19 negatives
        # try a pool of 25, 9 positives and 16 negatives
        
        # in this way, while running the model, have to prepare candidate micro-influencers for every brand for each of the 6 categories

        # getting the category of the particular brand
        
        for category in brand_to_category_mapping.keys():
            if brand_name in brand_to_category_mapping[category]:
                category_name = category
                break

        # category_name: the category (out of the 6 categories) to which the brand brand_name belongs to

        # candidate_pool = influencer_to_category_mapping[category_name]

        # candidate_pool = np.random.choice(influencer_to_category_mapping[category_name], size = 25, replace = False)

        positive_influencers = self.brand_to_pos[brand_name]   # contains usernames of all the positive influencers

        sampled_positive_influencers = np.random.choice(positive_influencers, size = 3, replace = False)

        all_category_influencers = influencer_to_category_mapping[category_name]
        
        # all_category_influencers_set = set(all_category_influencers)
        # positive_influencers_set = set(positive_influencers)
        # remaining_influencers_set = all_category_influencers_set.difference(positive_influencers_set)
        # remaining_influencers = np.array(list(remaining_influencers_set))

        negative_influencers = self.brand_to_neg[brand_name]   # contains usernames of all the negative influencers

        sampled_negative_influencers = np.random.choice(negative_influencers, size = 12, replace = False)

        candidate_pool = np.concatenate((sampled_positive_influencers, sampled_negative_influencers))
        
        random.shuffle(candidate_pool)   # shuffling to avoid position bias

        # creating labels to determine if (out of the candidate pool), a particular micro-influencer is associated with the brand or not

        labels_for_this_brand = []

        for inf in candidate_pool:
            if inf in positive_influencers:
                labels_for_this_brand.append(1)
            else:
                labels_for_this_brand.append(0)

        # converting it to an array

        labels_for_this_brand = np.array(labels_for_this_brand)

        # the total number of positive (associated) influencers for this particular brand

        num_of_associated_influencers = len(positive_influencers)

        # preparing feature vectors for the brand and all the influencers in the candidate pool for that brand

        brand_vec = self.brand_features[brand_name]   # a 1-D array

        influencer_vecs = []

        for inf in candidate_pool:
            if inf not in influencer_features:
                continue
            
            inf_vec = self.influencer_features[inf]
            influencer_vecs.append(inf_vec)

        influencer_vecs = np.array(influencer_vecs)   # a 2-D array
        
        return candidate_pool, labels_for_this_brand, brand_vec, influencer_vecs, num_of_associated_influencers
        

In [9]:
class WSim_like_model(nn.Module):

    # a standard WSim class object (for initialization)

    def __init__(self, d_v = 25088, d_t = 300):
        super().__init__()

        ### creating and initializing model parameters with values!

        # visual and textual parameter vectors 𝑾𝒗 and 𝑾𝑡 (columnar vectors/1-D columnar tensors)

        self.W_v = nn.Parameter(torch.randn(d_v) * 0.01)
        self.W_t = nn.Parameter(torch.randn(d_t) * 0.01)

        # also try

        # self.W_v = nn.Parameter(torch.randn(d_v))
        # self.W_t = nn.Parameter(torch.randn(d_t))

        # coefficient parameters
        
        self.w_v = nn.Parameter(torch.tensor(1.0))             
        self.w_t = nn.Parameter(torch.tensor(1.0))             
        
        # also try

        # self.w_v = nn.Parameter(torch.randn(1))             
        # self.w_t = nn.Parameter(torch.randn(1))

        # self.w_v = nn.Parameter(torch.randn(1) * 0.01)             
        # self.w_t = nn.Parameter(torch.randn(1) * 0.01)

        # when w_v + w_t = 1

        # self.w_t = nn.Parameter(torch.tensor(0.5))

        # self.w_t = nn.Parameter(torch.randn(1))

        # self.w_t = nn.Parameter(torch.randn(1) * 0.01)

    
    
    # forward pass

    def forward(self, b_vec, i_vecs, labels_for_this_brand):
        """
        Args:
            b_vec  (1-D tensor): (d_v + d_t) - Brand (visual + textual) features
            i_vecs (2-D tensor): (pool_size, d_v + d_t) - Influencer (visual + textual) features of the micro-influencer candidate pool
            labels_for_this_brand (1-D array): array containing the labels for each candidate influencer w.r.t. the brand (label = 1: positive influencer, label = 0: negative influencer)
        Returns:
            score  (2-D tensor): (batch_size, pool_size) - contains the similarity scores (z_k) of each of the pool_size (or k) influencers w.r.t the particular brand
        """

        ### now, batch_size = number of positive influencers in the candidate pool
        
        # splitting features (assuming concatenated features [visual; textual])
        
        d_v = self.W_v.shape[0]

        pool_size = i_vecs.shape[0]

        b_v = b_vec[:d_v]
        b_t = b_vec[d_v:]

        i_v = i_vecs[:, :d_v]
        i_t = i_vecs[:, d_v:]

        # each micro-influencer in the candidate pool will have a similarity score (both for textual and 
        # visual features) corresponding to the particular brand
        # for each micro-influencer, multiplying both these similarity scores with the corresponding
        # parameter coefficients (w_v and w_t) and then adding them will yield the final similarity score
        # hence, for the particular brand, 'score' will be a vector consisting of all the final
        # similarity scores w.r.t it of each of the micro-influencers in the candidate pool
        # therefore, len(score) = pool size

        list_of_score_of_each_influencer = []
        
        for k in range(pool_size):
            prod_of_visual_features = torch.mul(b_v, i_v[k])
            prod_of_textual_features = torch.mul(b_t, i_t[k])

            score_v = torch.dot(prod_of_visual_features, self.W_v)      # visual similarity
            score_t = torch.dot(prod_of_textual_features, self.W_t)     # textual similarity

            z_k = (self.w_v * score_v) + (self.w_t * score_t)

            # also try (for case/constraint where w_v + w_t = 1)

            # z_k = (self.w_t * score_t) + ((1 - self.w_t) * score_v)

            list_of_score_of_each_influencer.append(z_k)

        
        score = torch.stack(list_of_score_of_each_influencer) 

        # through torch.stack(), directly converting from list to a 1-D tensor ((pool_size) tensor)
        # torch.stack() concatenates a sequence of tensors along a new dimension (creates a new dimension at the specified place and concatenates the argument tensors along it)

        # tensor.repeat(): used to repeat/duplicate/replicate the elements of a tensor along specified 
        # dimensions and returns the resulting tensor

        # if tensor is a k-dimensional tensor, tensor.repeat() will have k arguments, each argument 
        # representing the number of times that the elements of that entire tensor (i.e., the tensor itself)
        # have to be repeated/duplicated along that particular axis/dimension as the index of the argument
        
        # tensor.repeat(n1, n2, n3, ...): creates a new tensor by repeating the entire original tensor
        # n1 times along the 1st dimension (d1), n2 times along d2, n3 times along d3, and so on ...

        # hence, outputs an enlarged/duplicated as specified tensor of k dimensions itself
        # i.e., the number of dimensions in the resulting tensor is same as that in the original tensor

        # first converting 1-D tensor score to a 2-D tensor so that .repeat() can be used to output
        # a resultant repeated 2-D tensor 

        # tensor.unsqueeze(dim): adds a new dimension to 'tensor' at index/axis/dimension 'dim'
        # i.e., increases the number of dimensions of 'tensor' by 1 (from k to k+1)
        
        score = score.unsqueeze(dim = 0)   # now, score: (1, pool_size) tensor (2-D tensor)

        # applying this condition here to ensure that it doesn't cause an obstruction during evaluation!
        # since during evaluation, every single microinfluencer, both positive and negative, is being 
        # evaluated against every single brand, so pool_size = 1,
        # (labels_for_this_brand is an array, writing it as list in shorthand representation)
        # and labels_for_this_brand = [1] or [0] (associativity or no associativity)
        # causes issues if labels_for_this_brand = [0]   (i.e., for negative influencers of the brand)
        # if labels_for_this_brand = [1], at the end, score would remain the exact same without the below snippet

        if pool_size > 1:
            num_of_positive_samples = list(labels_for_this_brand).count(1)
        
            score = score.repeat(num_of_positive_samples, 1)

        
        # essentially, batch_size = num_of_positive_samples

        # now 'score' is a 2-D tensor of (num_of_positive_samples, pool_size) dimensions

        # a scalar tensor is a tensor with no dimensions, i.e, it is literally just a scalar/number!
        # nn.CrossEntropyLoss(score, target) expects 'score' to be a 2-D tensor consisting of
        # batch_size number of samples, and 'target' to be a 1-D tensor containing the true classes of
        # all the batch_size number of samples
        
        device = b_vec.device   # obtaining the device in which variable 'b_vec' is stored

        score = score.to(device)   # moving the variable 'score' to that very device

        # necessary to move 'score' to device like this! since when in the final snippet, when the model is
        # moved to device, all the model parameters and explicit model components only are moved to 
        # device. Any newly created tensor in this 'forward' function is by default on CPU and has to be
        # explicitly moved to device!!

        # here, list_of_score_of_each_influencer is a newly created list, from which 'score' tensor has
        # been created. Hence, due to list_of_score_of_each_influencer being by default on the CPU, its
        # derived tensor 'score' will also be on the CPU and has to be explicitly moved to GPU device!! 
        
        return score
        

In [10]:

# Stochastic Gradient Descent principle!

def train_epoch(model, DatasetPrep_instance, optimizer, device, all_brand_names):
    
    model.train()
    total_loss = 0

    # iterating over all the 90 brands in the dataset, for 10 loops/steps

    # for _ in range(10):

    # for _ in range(50)
    
    for _ in range(100):
        for brand_name in all_brand_names:
            if brand_name not in brand_features.keys():
                continue
        
            # dataset preparation for this particular brand

            candidate_pool, labels_for_this_brand, brand_vec, influencer_vecs, num_of_associated_influencers = DatasetPrep_instance.candidate_microinfluencers(brand_name, brand_features, influencer_features)

            # num_of_associated_influencers: the number of positive (associated) influencers for this particular brand

            # converting arrays to tensors
            # and then, moving to device (to GPU / inside GPU's memory for further computation in GPU)
            
            brand_vec = torch.FloatTensor(brand_vec).to(device)                # (d) 1-D tensor
            influencer_vecs = torch.FloatTensor(influencer_vecs).to(device)    # (pool_size, d) 2-D tensor
            
            true_class_indices = np.where(labels_for_this_brand == 1)[0]           

            # returns an (1-D) array of all the indices of array 'labels_for_this_brand' where the index 
            # elements = 1

            # converting it to a tensor and moving that tensor to device
            
            target = torch.LongTensor(true_class_indices).to(device)     # (num_of_positive_samples) 1-D tensor
        
            # --> Forward pass
            
            score = model(brand_vec, influencer_vecs, labels_for_this_brand)
            loss = nn.CrossEntropyLoss()(score, target)

            ## ! ## also try with pool_size = 25, with 9 positive samples, and 16 negative samples!
            ## create num_of_positive_samples number of duplicate score vectors!! and pass them all as a list of 1-D vectors (a 2-D matrix) to the nn.CrossEntropyLoss() function
            ## the target vector will consist of indices of all the positive samples in the pool 

            ########## diving by the number of positive influencers for this particular brand!

            # loss = loss / num_of_associated_influencers

            ##########
        
            # --> Backward pass
            
            optimizer.zero_grad()    # resets the gradients for a fresh start
            
            loss.backward()        # computes all the gradients of the loss function w.r.t all the
                                   # tensors (w.r.t the model parameters which the tensors contain)
            
            optimizer.step()       # updates all the model parameters (contained within tensors) using the 
                                   # gradients that were calculated during loss.backward()
        
            total_loss = total_loss + loss.item()  # x.item(): where x is a scalar tensor, returns its 
                                                   # standard float value as a regular number

            # w.r.t loss.backward(), i.e., the backward pass step, care should be taken that there isn't
            # any tensor in the code which is not connected to the computation graph, i.e., a tensor which
            # has been detached or created in a way that PyTorch doesn’t know how to compute gradients 
            # from it
            
            # cases can be there in which, at a particular step, a new tensor has been created which is
            # not connected to the computation graph, and hence it loses the grad info

            # for every code component which is used in the computation of model parameters, care has to
            # be taken that the computation graph is preserved, so that the gradients will flow properly
            # and loss.backward(), i.e., the backpropagation process, will work

            # code components should utilise (should be a function of) the model parameters, so that
            # PyTorch can track and govern the gradient flow

            # as long as any intermediate tensor is built using model parameters (which are nn.Parameters
            # with 'requires_grad = True'), and the chain isn't broken using .detach() or torch.no_grad() 
            # or torch.tensor() (i.e., by creating new tensors), the computation graph will remain intact,
            # PyTorch will automatically track everything and loss.backward() will work

            # torch.tensor([]) creates a new leaf tensor and will hence be detached from the computation
            # graph. In such cases, use tensor operations like torch. stack(), torch.cat(), etc., on 
            # tensors with 'requires_grad = True' to keep the computation graph intact

            # to check for a tensor, use print(tensor.requires_grad)


            # print(f'{brand_name} done')

    return (total_loss / 100)

    # return total_loss

    # return (total_loss / 900)
    

In [11]:
def evaluate(model, labels_df, brand_features, influencer_features, device, k_vals = [10, 50]):
    
    model.eval()
    brand_to_scores = defaultdict(list)
    
    with torch.no_grad():
        for index, row in tqdm(labels_df.iterrows(), total = len(labels_df)):

            # dataframe.iterrows(): a method that iterrates over the dataframe rows as (index, row) pairs
            # each row acts as a Pandas series
            
            b = row['brand_username']
            i = row['influencer_username']
            y = row['label']
            
            if b not in brand_features or i not in influencer_features:
                continue
                
            # preparing tensors
            
            b_vec = torch.FloatTensor(brand_features[b]).to(device)                    # (d)
            i_vec = torch.FloatTensor(influencer_features[i]).unsqueeze(0).to(device)  # (d) to (1, d)

            # unsqueeze(0) adds a new dimension of size 1 at the 0th position (i.e., the front)
            # x = torch.tensor([1, 2, 3]), x = x.unsqueeze(0), x: tensor([[1, 2, 3]])
            
            # after finding the parameter values (at the end of an epoch, since in the final training loop code snippet, evaluation is after every single epoch! and not after internal loops in an epoch) 
            # (2 parameter vectors and 2 coefficient parameters), computing the score using them!

            # using these above model parameters obtained at the end of an epoch,
            # computing the WSim scores of each of the 89 brands with each of the 1051 micro-influencers!!
            # essentially, a double loop!

            labels_for_this_brand = np.array([y])
            
            score = model(b_vec, i_vec, labels_for_this_brand).item()
            brand_to_scores[b].append((score, y))
    
    # computing metrics
    
    recall_at_k = {k: [] for k in k_vals}
    aucs = []
    medrs = []
    
    for b, scores in brand_to_scores.items():
        scores.sort(reverse = True, key = lambda x: x[0])
        _, labels = zip(*scores)
        total_pos = sum(labels)
        
        for k in k_vals:
            recall_at_k[k].append(sum(labels[:k]) / total_pos)
        
        if len(set(labels)) > 1:
            aucs.append(roc_auc_score(labels, [s[0] for s in scores]))
        
        ranks = [i for i, (_, label) in enumerate(scores) if label == 1]
        medrs.append(np.median(ranks) + 1 if ranks else np.nan)
    
    return {
        f"Recall@{k}": np.nanmean(recall_at_k[k]) for k in k_vals
    }, {
        "AUC": np.nanmean(aucs),
        "MedR": np.nanmedian(medrs)
    }

In [12]:

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = WSim_like_model(d_v = 25088, d_t = 300).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

# Data preparation

DatasetPrep_instance = DatasetPrep(labels_df, brand_features, influencer_features, brand_to_category_mapping, influencer_to_category_mapping)

# Training loop

for epoch in range(25):
    train_loss = train_epoch(model, DatasetPrep_instance, optimizer, device, all_brand_names)
    print(f"Epoch {epoch+1} | Loss: {train_loss:.4f}")
    
    # evaluation metrics
    
    recall_metrics, rank_metrics = evaluate(model, labels_df, brand_features, influencer_features, device)

    # the tqdm progress bar is being used to denote the evaluation process!! not the forward nor the backward passes
    
    print("Recall: ", recall_metrics)
    print("Rank Metrics: ", rank_metrics)


Epoch 1 | Loss: 169.1841


100%|██████████| 93361/93361 [01:04<00:00, 1441.88it/s]


Recall:  {'Recall@10': 0.255236236244732, 'Recall@50': 0.7235465267765896}
Rank Metrics:  {'AUC': 0.9446611376579445, 'MedR': 24.5}
Epoch 2 | Loss: 141.0897


100%|██████████| 93361/93361 [01:04<00:00, 1438.12it/s]


Recall:  {'Recall@10': 0.36488678221418075, 'Recall@50': 0.8296142845906709}
Rank Metrics:  {'AUC': 0.9531595264446519, 'MedR': 16.25}
Epoch 3 | Loss: 131.4353


100%|██████████| 93361/93361 [01:04<00:00, 1443.82it/s]


Recall:  {'Recall@10': 0.44523091307517376, 'Recall@50': 0.8993377185991572}
Rank Metrics:  {'AUC': 0.9851744465385314, 'MedR': 12.5}
Epoch 4 | Loss: 125.8549


100%|██████████| 93361/93361 [01:04<00:00, 1456.39it/s]


Recall:  {'Recall@10': 0.5500593794014023, 'Recall@50': 0.9794019987172015}
Rank Metrics:  {'AUC': 0.9923333325755844, 'MedR': 9.0}
Epoch 5 | Loss: 121.3766


100%|██████████| 93361/93361 [01:04<00:00, 1452.26it/s]


Recall:  {'Recall@10': 0.6080982195019956, 'Recall@50': 0.9615192707425618}
Rank Metrics:  {'AUC': 0.9921955297539248, 'MedR': 8.0}
Epoch 6 | Loss: 119.3775


100%|██████████| 93361/93361 [01:04<00:00, 1451.67it/s]


Recall:  {'Recall@10': 0.6065802969843056, 'Recall@50': 0.9660847897646027}
Rank Metrics:  {'AUC': 0.9928277558042908, 'MedR': 8.5}
Epoch 7 | Loss: 116.6848


100%|██████████| 93361/93361 [01:04<00:00, 1448.65it/s]


Recall:  {'Recall@10': 0.6485670860855013, 'Recall@50': 0.9868802356966447}
Rank Metrics:  {'AUC': 0.9958003964453696, 'MedR': 8.0}
Epoch 8 | Loss: 115.8035


100%|██████████| 93361/93361 [01:04<00:00, 1452.14it/s]


Recall:  {'Recall@10': 0.7047057058713833, 'Recall@50': 0.9948235677457689}
Rank Metrics:  {'AUC': 0.9971470991638611, 'MedR': 7.0}
Epoch 9 | Loss: 115.0673


100%|██████████| 93361/93361 [01:04<00:00, 1450.60it/s]


Recall:  {'Recall@10': 0.630459611903931, 'Recall@50': 0.9370850006657814}
Rank Metrics:  {'AUC': 0.983626983019703, 'MedR': 8.0}
Epoch 10 | Loss: 114.0951


100%|██████████| 93361/93361 [01:04<00:00, 1456.96it/s]


Recall:  {'Recall@10': 0.6739631312340632, 'Recall@50': 0.9780307949626131}
Rank Metrics:  {'AUC': 0.9930982712609532, 'MedR': 7.25}
Epoch 11 | Loss: 113.5586


100%|██████████| 93361/93361 [01:04<00:00, 1451.78it/s]


Recall:  {'Recall@10': 0.714126838808559, 'Recall@50': 0.9669913419913421}
Rank Metrics:  {'AUC': 0.9883786785345433, 'MedR': 7.0}
Epoch 12 | Loss: 112.5043


100%|██████████| 93361/93361 [01:04<00:00, 1450.30it/s]


Recall:  {'Recall@10': 0.7497261675408076, 'Recall@50': 0.9964962121212121}
Rank Metrics:  {'AUC': 0.9978052608251889, 'MedR': 7.0}
Epoch 13 | Loss: 111.7860


100%|██████████| 93361/93361 [01:04<00:00, 1453.23it/s]


Recall:  {'Recall@10': 0.712589262023499, 'Recall@50': 0.9770876345876346}
Rank Metrics:  {'AUC': 0.9943931629177196, 'MedR': 7.0}
Epoch 14 | Loss: 111.3008


100%|██████████| 93361/93361 [01:04<00:00, 1448.94it/s]


Recall:  {'Recall@10': 0.746791132884952, 'Recall@50': 0.9900033993783995}
Rank Metrics:  {'AUC': 0.9972533126580881, 'MedR': 7.0}
Epoch 15 | Loss: 110.5527


100%|██████████| 93361/93361 [01:04<00:00, 1452.49it/s]


Recall:  {'Recall@10': 0.7476200749989877, 'Recall@50': 0.9882005494505496}
Rank Metrics:  {'AUC': 0.9970246140479659, 'MedR': 7.0}
Epoch 16 | Loss: 109.4207


100%|██████████| 93361/93361 [01:04<00:00, 1446.03it/s]


Recall:  {'Recall@10': 0.7313712172283299, 'Recall@50': 0.9888004642416408}
Rank Metrics:  {'AUC': 0.997148046514364, 'MedR': 7.0}
Epoch 17 | Loss: 109.4825


100%|██████████| 93361/93361 [01:04<00:00, 1448.94it/s]


Recall:  {'Recall@10': 0.765281160095335, 'Recall@50': 0.9991883116883117}
Rank Metrics:  {'AUC': 0.9989305319916941, 'MedR': 6.5}
Epoch 18 | Loss: 109.6161


100%|██████████| 93361/93361 [01:04<00:00, 1454.75it/s]


Recall:  {'Recall@10': 0.7393983135523947, 'Recall@50': 0.9769805194805194}
Rank Metrics:  {'AUC': 0.9938950655452252, 'MedR': 6.5}
Epoch 19 | Loss: 109.0672


100%|██████████| 93361/93361 [01:03<00:00, 1466.11it/s]


Recall:  {'Recall@10': 0.7967838049920306, 'Recall@50': 0.9968073593073593}
Rank Metrics:  {'AUC': 0.9991561142342217, 'MedR': 6.5}
Epoch 20 | Loss: 108.7065


100%|██████████| 93361/93361 [01:03<00:00, 1460.90it/s]


Recall:  {'Recall@10': 0.7730575785669406, 'Recall@50': 0.9947916666666665}
Rank Metrics:  {'AUC': 0.9983298534195115, 'MedR': 6.75}
Epoch 21 | Loss: 107.8136


100%|██████████| 93361/93361 [01:03<00:00, 1462.42it/s]


Recall:  {'Recall@10': 0.7346440823409445, 'Recall@50': 0.9381446930594657}
Rank Metrics:  {'AUC': 0.985855475604419, 'MedR': 6.5}
Epoch 22 | Loss: 107.7347


100%|██████████| 93361/93361 [01:03<00:00, 1458.82it/s]


Recall:  {'Recall@10': 0.754502416779827, 'Recall@50': 0.9929954626478691}
Rank Metrics:  {'AUC': 0.9980930261890663, 'MedR': 7.0}
Epoch 23 | Loss: 107.2334


100%|██████████| 93361/93361 [01:03<00:00, 1471.89it/s]


Recall:  {'Recall@10': 0.7978136983013357, 'Recall@50': 1.0}
Rank Metrics:  {'AUC': 0.9995149694261187, 'MedR': 6.5}
Epoch 24 | Loss: 107.1084


100%|██████████| 93361/93361 [01:03<00:00, 1469.44it/s]


Recall:  {'Recall@10': 0.7955003393118404, 'Recall@50': 0.9946827069988835}
Rank Metrics:  {'AUC': 0.9985855836307626, 'MedR': 6.5}
Epoch 25 | Loss: 106.5093


100%|██████████| 93361/93361 [01:04<00:00, 1446.94it/s]


Recall:  {'Recall@10': 0.7877160055900067, 'Recall@50': 0.9947668997668998}
Rank Metrics:  {'AUC': 0.9987655148728609, 'MedR': 6.5}


In [13]:
# saving model run for 25 epochs

import torch
from pathlib import Path

# defining the directory where the progress/parameters are to be saved

model_dir = Path("/kaggle/working")
model_dir.mkdir(exist_ok = True)

# saving the full checkpoint

checkpoint = {
    "epoch": 25,  # current epoch
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "loss": 106.5093,  # best loss
    "metrics": {  # best metrics
        "Recall@10": 0.7978,
        "AUC": 0.9995,
        'Recall@50': 1.0000,
        'MedR': 6.5
        }
}

torch.save(checkpoint, model_dir / "multi-positive_pool_wsim_like_model_checkpoint_epoch_25_(3+,12-).pth")