In this notebook, we train a neural network that generates embeddings for URLs, based on a contrastive learning framework. The idea is to bring the embeddings of URLs that share a label close together (small distance, high similarity) while pushing apart the embeddings of URLs that have different labels.

First we import the libraries.

In [None]:
import pandas as pd 
import os 
import numpy as np 
import re 
import torch
import random 
import torch.nn as nn
from sklearn.metrics import accuracy_score

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Do not truncate column contents when a DataFrame is displayed.
pd.set_option("display.max_colwidth",None)


In [None]:
# Directory that is scanned for the input data files.
data_path = '/content/drive/MyDrive/challenges/adot/data'
# Text file holding the French stop words.
path_to_stopwords = '/content/drive/MyDrive/challenges/adot/stop_words_french.txt'

Next we load the data.

In [None]:
# Read every parquet file of the data directory and stack them into one frame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows at every iteration.
frames = [
    pd.read_parquet(os.path.join(data_path, filename), engine='pyarrow')
    for filename in sorted(os.listdir(data_path))  # sorted -> deterministic row order
    if filename.endswith('.parquet')
]
if frames:
    df_data = pd.concat(frames, ignore_index=True)
else:
    # No parquet file found: fall back to an empty frame with the expected columns.
    df_data = pd.DataFrame(columns=['url', 'target', 'day'])
        

# Dataset preparation 

Here the preprocessing steps are the same as for the fasttext embedding model. We add one more step: removing the labels that occur only once in the dataset. In the contrastive learning framework we need pairs of URLs that share the same label, so each label must be present in at least two rows.

## Labels cleaning 

In [None]:
from collections import defaultdict
# Count how many times each label occurs over all the rows of df_data.
dict_occ = defaultdict(int)
for label_list in df_data['target']:
    for label in label_list.astype('int64'):
        dict_occ[label] += 1

# Labels seen fewer than two times cannot form an (anchor, positive) pair.
# Stored as a set because it is only used for membership tests, which are then
# O(1) instead of a scan of the whole collection for every label of every row.
single_labels = {label for label, count in dict_occ.items() if count < 2}

In [None]:
def get_mapping_target(df , single_labels) :
    """
    Build the mapping from the original targets to consecutive class ids.

    Every array of the 'target' column is cast to int64; labels contained in
    `single_labels` are ignored. The remaining distinct labels, taken in
    increasing order, are mapped to 0, 1, 2, ...

    Returns a dict {old target: new target}.
    """
    kept_labels = [
        label
        for target in df['target']
        for label in target.astype('int64')
        if label not in single_labels
    ]
    # np.unique returns the distinct labels sorted in increasing order.
    unique_labels = np.unique(np.array(kept_labels))
    return dict(zip(unique_labels , range(len(unique_labels))))

# Mapping {original target: new consecutive class id}, built without the single-occurrence labels.
dict_mapping = get_mapping_target(df_data,single_labels)

Now we create a new column in df_data where for each row we get its new targets based on dict_mapping. 

In [None]:
def get_new_target(target , dict_mapping)  :
    """
    Translate the original targets of one row into their new class ids.

    target       : numpy array of original labels (cast to int64 before lookup).
    dict_mapping : dict {original label: new class id}.

    Returns the list of new class ids, in the order of `target`. Labels that
    are absent from `dict_mapping` are dropped. Since the mapping is built
    without the single-occurrence labels, this removes exactly those labels,
    without relying on a global variable and without risking a KeyError.
    """
    labels = target.astype('int64')
    return [dict_mapping[label] for label in labels if label in dict_mapping]

# New column 'labels': the remapped class ids of each row.
df_data['labels'] = df_data['target'].apply(lambda x : get_new_target(x , dict_mapping))

In [None]:
# Number of classes = number of entries of the target mapping.
n_classes = len(dict_mapping)

print('the number of classes is :', n_classes)

the number of classes is : 1778


## URL preprocessing 

In [None]:
import tldextract
from urllib.parse import urlparse
from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()

def removing_condition(token , stopwords) :
    """
    Tell whether a token must be KEPT (despite the name of the function).

    Returns False when the token contains a digit, has two characters or
    fewer, or belongs to `stopwords`; returns True otherwise.
    """
    if any(char.isdigit() for char in token) :
        return False
    if len(token) <= 2 :
        return False
    return token not in stopwords

# Characters that separate the tokens of a URL path. Same character class as
# before ('-', ' ', '_', '%', ':', ',', '/', '.', '+'), now written as a raw
# string: '\.' and '\+' in a normal string literal are invalid escape sequences.
_PATH_SEPARATORS = re.compile(r'[- _ % : , / \. \+ ]')
# Runs of digits, used to cut tokens such as "peugeot208gti" into word pieces.
_DIGIT_RUNS = re.compile(r'\d+')


def parse_url (url , stopwords_list) :
    """
    Turn a URL into a space-separated string of cleaned tokens.

    Steps:
      1. take the domain name (second field returned by tldextract.extract);
      2. split the URL path on the separator characters, then on digit runs;
      3. lowercase and stem the tokens accepted by removing_condition;
      4. filter again after stemming (a stem can be short or a stop word);
      5. remove duplicates while keeping the first-seen order, domain first.

    url            : the raw URL string.
    stopwords_list : collection of tokens to discard.

    Returns the domain name followed by the unique tokens, joined by spaces.
    """
    domain_name = tldextract.extract(url)[1]
    full_path = urlparse(url).path

    tokens = []
    for chunk in _PATH_SEPARATORS.split(full_path) :
        tokens += _DIGIT_RUNS.split(chunk)

    tokens = [stemmer.stem(token.lower()) for token in tokens
              if removing_condition(token.lower() , stopwords_list)]
    tokens = [token for token in tokens if removing_condition(token , stopwords_list)]

    # dict.fromkeys removes duplicates and preserves insertion order.
    final_sentence = list(dict.fromkeys([domain_name] + tokens))
    return " ".join(final_sentence)

In [None]:
# Read one stop word per line. The encoding is given explicitly so that the
# accented French words are decoded the same way on every platform, and the
# surrounding whitespace (including a possible '\r') is stripped so that the
# words can actually match the URL tokens. Blank lines are skipped.
with open(path_to_stopwords , 'r' , encoding='utf-8') as f :
    lines = [line.strip() for line in f]
lines = [line for line in lines if line]
# Extra tokens that are discarded on top of the stop words read from the file.
stopwords_list = lines + ['search' , 'article' , 'html' , 'htm' , 'about' , 'fr' , 'id' ,
                          'text', 'lid' , 'pgn' , 'pgs' , 'ms' , 'vhc' , 've' , 'cmp' , 'aa' , 'xca' , 'pr' , 'false']

In [None]:
# New column 'text_url': the cleaned token string produced by parse_url for each URL.
df_data['text_url'] = df_data['url'].apply(lambda x : parse_url(x , stopwords_list))


Next we split the data into train and test subsets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set.
# NOTE(review): no random_state is passed, so the split changes from one run to the next.
df_train, df_test = train_test_split( df_data , test_size=0.2 )
# Renumber both subsets from 0 and drop the previous index.
df_train.reset_index(inplace = True , drop = True)
df_test.reset_index(inplace = True , drop = True)

We also define a label-to-text dictionary based on the training data. It is used by the training loader to sample URLs that share the same label.

In [None]:
# Group the training URL texts by label.
grouped_texts = defaultdict(list)
for url_labels , url_text in zip(df_train['labels'] , df_train['text_url']) :
    for label in url_labels :
        grouped_texts[label].append(url_text)

# Remove duplicated texts and keep only the labels that still have at least
# two distinct texts, i.e. the labels from which a pair can be sampled.
labels_to_text = {}
for label , texts in grouped_texts.items() :
    unique_texts = list(set(texts))
    if len(unique_texts) > 1 :
        labels_to_text[label] = unique_texts

# define torch dataset and dataloader 


In [None]:
from torch.utils.data import Dataset , DataLoader

class CustomDataset(Dataset):
    """
    Torch dataset that yields one (anchor, positive) pair of URL texts per label.

    Item `idx` corresponds to the idx-th key of `labels_to_text`; two distinct
    texts of that label are drawn at random each time the item is requested.
    """
    def __init__(self , df , labels_to_text , num_classes = None ) :
        """
        df             : DataFrame with the columns 'text_url' and 'labels'.
        labels_to_text : dict {label: list of at least two distinct texts}.
        num_classes    : value stored in self.n_classes; when omitted, the
                         module-level `n_classes` is used, as before.
        """
        self.labels_to_text = labels_to_text
        self.keys = list(self.labels_to_text.keys())
        self.df = df
        self.n_classes = n_classes if num_classes is None else num_classes

        # Text -> space-separated labels, built once so that __getitem__ does
        # not scan the whole DataFrame for every sample. For a text present in
        # several rows the first row wins, like `.values[0]` did.
        # NOTE: the lookup is a snapshot; later changes to `df` are not seen.
        self._labels_by_text = {}
        for text , labels in zip(df['text_url'] , df['labels']) :
            if text not in self._labels_by_text :
                self._labels_by_text[text] = ' '.join(str(tar) for tar in labels)

    def __len__(self):
        # One item per label that can produce a pair.
        return len(self.labels_to_text)

    def __getitem__(self, idx):
        """
        Sample a pair of URLs sharing the label of rank `idx` in the keys list.

        Returns (anchor, positive, labels_anchor, labels_pos), where both
        label fields are the space-separated class ids of the matching text.
        Raises KeyError when a sampled text is not present in `df`.
        """
        key = self.keys[idx]
        anchor , positive = random.sample(self.labels_to_text[key] , k = 2 )
        labels_anchor = self._labels_by_text[anchor]
        labels_pos = self._labels_by_text[positive]
        return (anchor , positive , labels_anchor , labels_pos)
  

In the contrastive setting that we implement, we consider an anchor URL $a_i$, a positive URL $p_i$ that shares a label with $a_i$, and negative URLs $n_i^1, \dots, n_i^m$ whose labels are all different from those of $a_i$.
For a given anchor $a_i$, the negative URLs are the positives $p_1, \dots, p_n$ (except $p_i$) associated with the other anchors of the batch. This is called the in-batch setting, and it has the advantage of being fast and simple to implement. However, we need to ensure that the negatives associated with $a_i$ do not share any label with it, hence the next collate function.

In [None]:
def collate_fn(data) :
    """
    Build a batch in which in-batch negatives never share a label with an anchor.

    `data` is a sequence of (anchor, positive, labels_anchor, labels_pos)
    tuples whose label fields are space-separated integers. Pairs are examined
    in order; a pair is dropped when its positive shares a label with an
    already accepted anchor, or when its anchor shares a label with an already
    accepted positive.

    Returns (anchors, positives): two lists of texts of the same length.
    """
    anchors , positives = [] , []
    seen_anchor_labels = set()
    seen_positive_labels = set()
    for anchor , positive , labels_anchor , labels_pos in data :
        anchor_ids = {int(value) for value in labels_anchor.split()}
        positive_ids = {int(value) for value in labels_pos.split()}
        compatible = (positive_ids.isdisjoint(seen_anchor_labels)
                      and anchor_ids.isdisjoint(seen_positive_labels))
        if compatible :
            anchors.append(anchor)
            positives.append(positive)
            seen_anchor_labels |= anchor_ids
            seen_positive_labels |= positive_ids
    return anchors , positives

def get_loader (df_train   , labels_to_text , batch_size = 32 ) :
    """
    Create the training DataLoader.

    A CustomDataset is built from `df_train` and `labels_to_text`, and wrapped
    in a shuffled DataLoader that assembles its batches with collate_fn.
    """
    return DataLoader(
        CustomDataset(df_train , labels_to_text) ,
        batch_size = batch_size ,
        collate_fn = collate_fn ,
        shuffle = True ,
    )

# define model 

First we create our vocabulary 

In [None]:
def get_vocab(training_data) :
    """
    Build the word -> index vocabulary from an iterable of sentences.

    Index 0 is reserved for the key 'pad'; every other word receives the next
    free index the first time it is met.
    NOTE(review): a real token spelled "pad" would share index 0 with the padding.
    """
    word_to_ix = {'pad' : 0 }
    for sentence in training_data :
        for token in sentence.split() :
            # setdefault only inserts words that have no index yet.
            word_to_ix.setdefault(token , len(word_to_ix))
    return word_to_ix

# The vocabulary is built from the training texts only.
vocab = get_vocab (df_train['text_url'])

In [None]:
len(vocab)

19588

In [None]:
def get_words_indices(sent , vocab , train) :
    """
    Convert a list of words into their vocabulary indices.

    sent  : list of words.
    vocab : dict {word: index}.
    train : when True every word must be in `vocab` (KeyError otherwise);
            when False an unknown word receives an index drawn at random
            among the values of `vocab`.

    Returns the list of indices, in the order of `sent`.
    """
    if train :
        return [vocab[word] for word in sent]

    ids = []
    fallback_ids = None  # built lazily, once, instead of once per unknown word
    for word in sent :
        if word in vocab :
            ids.append(vocab[word])
        else :
            if fallback_ids is None :
                fallback_ids = list(vocab.values())
            ids.append(random.choice(fallback_ids))
    return ids
    
def encode (list_sentences , train , vocab) :
    """
    Encode a list of sentences into a padded tensor of vocabulary indices.

    list_sentences : sequence of space-separated sentences.
    train          : forwarded to get_words_indices (handling of unknown words).
    vocab          : dict {word: index}; vocab['pad'] is used for the padding.

    Returns an int64 tensor of shape (number of sentences, longest sentence).
    The words of each sentence are stored in reversed order and the rows are
    right-padded. An empty input gives a tensor of shape (0, 0).
    """
    token_lists = [list(reversed(sent.split())) for sent in list_sentences]
    # default=0 keeps an empty batch from crashing max().
    max_length = max((len(tokens) for tokens in token_lists) , default = 0)
    batch_inputs = torch.full((len(token_lists) , max_length) , vocab['pad'] , dtype = torch.int64)
    for row , tokens in enumerate(token_lists) :
        if tokens :
            batch_inputs[row , :len(tokens)] = torch.tensor(
                get_words_indices(tokens , vocab , train) , dtype = torch.int64)
    return batch_inputs


# modeling

In [None]:
class EmbeddingModule(nn.Module):
    """
    Sentence embedding module: average of the word embeddings, followed by a
    linear projection and a tanh.
    """
    def __init__(self, embedding_dim, vocab_size , sentence_embed_dim = 100 ):
        """
        embedding_dim      : size of the word embeddings.
        vocab_size         : number of rows of the embedding table.
        sentence_embed_dim : size of the returned sentence embedding.
        """
        super(EmbeddingModule, self).__init__()

        # Index 0 is the padding index: its embedding is a zero vector.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim , padding_idx=0)
        self.fc = nn.Linear(embedding_dim , sentence_embed_dim)
        self.tanh = nn.Tanh()

    def forward(self, inputs_ ):
        """
        inputs_ : integer tensor of word indices, one padded sentence per row.

        Returns a tensor with one `sentence_embed_dim`-sized row per sentence.
        """
        embeds = self.word_embeddings(inputs_)
        # Average over the real tokens only. A plain mean over dim 1 divides by
        # the padded length, so the embedding of a sentence would depend on the
        # longest sentence of its batch.
        mask = (inputs_ != self.word_embeddings.padding_idx).unsqueeze(-1).to(embeds.dtype)
        n_tokens = mask.sum(dim = 1).clamp(min = 1.0)  # avoids 0/0 on all-padding rows
        embeds = (embeds * mask).sum(dim = 1) / n_tokens
        embeds = self.tanh(self.fc(embeds))
        return embeds

In [None]:
# Size of the word embeddings; the sentence embedding keeps its default size.
embedding_dim = 512 
model = EmbeddingModule(embedding_dim=embedding_dim , vocab_size=len(vocab))
# Move the parameters to the selected device.
model.to(device)

EmbeddingModule(
  (word_embeddings): Embedding(19588, 512, padding_idx=0)
  (fc): Linear(in_features=512, out_features=100, bias=True)
  (tanh): Tanh()
)

In [None]:
# Adam optimizer over all the parameters of the model.
learning_rate =  1e-3
optimizer = torch.optim.Adam(model.parameters() , lr=learning_rate )


In [None]:
from tqdm.notebook import tqdm 
    
def compute_loss(embeddings_anchors , embeddings_pos) :
    """
    Compute the in-batch contrastive loss.

    embeddings_anchors : tensor (n, d), one row per anchor.
    embeddings_pos     : tensor (n, d); row i is the positive of anchor i and
                         the other rows are the negatives of anchor i.

    The similarity of two embeddings is the NEGATIVE of their Euclidean
    distance. torch.cdist returns distances, and feeding them to the softmax
    unchanged would reward moving the positive away from its anchor.
    The loss is the mean over the anchors of
        -log( exp(s_ii) / sum_j exp(s_ij) ),
    computed with cross_entropy for numerical stability.

    Returns a scalar tensor.
    """
    sim_matrix = - torch.cdist(embeddings_anchors, embeddings_pos)
    # The positive of anchor i sits in column i.
    targets = torch.arange(sim_matrix.size(0) , device = sim_matrix.device)
    mean_loss = torch.nn.functional.cross_entropy(sim_matrix , targets)
    return mean_loss

def train (loader , vocab ) :
    """
    Run one training epoch over `loader` with the module-level model and optimizer.

    loader : iterable of (anchors, positives) batches of texts.
    vocab  : dict {word: index} used to encode the texts.

    Prints the mean batch loss of the epoch and returns it (nan when the
    loader yields no batch, instead of failing on an undefined batch index).
    """
    model.train()
    total_loss = 0.0
    n_batches = 0
    for anchors , positives in tqdm(loader , total = len(loader)) :
        anchors = encode(anchors , vocab = vocab , train = True ).to(device)
        positives = encode(positives , vocab = vocab , train = True ).to(device)
        optimizer.zero_grad()

        embeddings_anchors = model(anchors)
        embeddings_positives =  model(positives)
        loss = compute_loss (embeddings_anchors , embeddings_positives)
        loss.backward()
        optimizer.step()
        total_loss += float(loss)
        n_batches += 1

    mean_loss = total_loss / n_batches if n_batches else float('nan')
    print('loss : {} '.format(mean_loss))
    return mean_loss

# Main

In [None]:
# Up to 128 pairs are drawn per batch; collate_fn may then drop some of them.
trainloader  = get_loader(df_train , labels_to_text=labels_to_text, batch_size = 128)

In [None]:
# Train for 20 epochs; train() prints the mean loss of each epoch.
for epoch in range(20) : 
    print('##### training ######')
    train(loader = trainloader , vocab = vocab)


##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.9374119554247176 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.903118371963501 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.9241632223129272 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.877462863922119 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.893211688314165 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.9254194498062134 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.8944704702922275 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.872221282550267 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.863709960665022 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.8816500902175903 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.8608482565198625 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.882218905857631 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.8327911411012923 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.883323975971767 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.8737313406808034 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.8503018617630005 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.896999852997916 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.8603588342666626 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.8819555044174194 
##### training ######


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


loss : 3.836083378110613 


In [None]:
def get_embeddings(df , vocab , train , batch_size = 128 , size= 100 ) :
    """
    Compute the embeddings of the 'text_url' column with the module-level model.

    df         : DataFrame with a 'text_url' column.
    vocab      : dict {word: index} used to encode the texts.
    train      : forwarded to encode (handling of unknown words).
    batch_size : number of texts encoded at once.
    size       : width of the embeddings; must match the output of the model.

    Returns a numpy array with one row per text ((0, size) for an empty frame).
    """
    list_urls = df['text_url'].values
    # The batches are collected and concatenated once: stacking inside the loop
    # re-copies all the previous embeddings at every step. The empty seed keeps
    # the (0, size) result for an empty frame.
    chunks = [torch.empty((0,size))]
    for start in range(0 , len(list_urls) , batch_size) :
        batch = list_urls[start:start + batch_size]
        batch = encode(batch , vocab = vocab , train = train ).to(device)
        with torch.no_grad() :
            embeddings = model(batch)
        chunks.append(embeddings.cpu())
    return torch.cat(chunks , dim = 0).numpy()

# Embeddings used as features by the classifier. With train = False, a word
# missing from the vocabulary is replaced by a randomly drawn vocabulary index.
X_train = get_embeddings(df = df_train , vocab = vocab , train = True)
X_test = get_embeddings(df = df_test , vocab = vocab , train = False)

# classifier 

In [None]:
def extract_one_hots(targets , num_classes = None) :
    """
    Build the multi-hot vector of a list of class ids.

    targets     : list of class ids (may be empty).
    num_classes : length of the vector; when omitted, the module-level
                  `n_classes` is used, as before.

    Returns a float numpy array of length `num_classes` holding 1 at the
    positions listed in `targets` and 0 elsewhere.
    """
    if num_classes is None :
        num_classes = n_classes
    one_hot_targets = np.zeros(num_classes)
    one_hot_targets[targets] = 1
    return one_hot_targets

def get_one_hot_labels(df) :
    """
    Build the multi-hot label matrix of a DataFrame.

    df : DataFrame with a 'labels' column holding lists of class ids.

    Returns an integer numpy array with one row per row of `df` and
    `n_classes` columns (shape (0, n_classes) for an empty frame).
    """
    one_hot_labels = [extract_one_hots(labels) for labels in df["labels"]]
    if not one_hot_labels :
        # np.stack refuses an empty list.
        return np.zeros((0 , n_classes) , dtype = int)
    # `np.int` was an alias of the builtin int and was removed in NumPy 1.24.
    return np.stack(one_hot_labels , axis = 0 ).astype(int)

In [None]:
# Multi-hot label matrices: one row per URL, one column per class.
y_train = get_one_hot_labels(df_train) 
y_test = get_one_hot_labels(df_test) 

In [None]:
from skmultilearn.adapt import MLARAM 

# MLARAM multi-label classifier (skmultilearn.adapt), trained on the URL embeddings.
classifier = MLARAM(threshold=5 * 1e-5, vigilance=0.95)
# fit on the training embeddings and their multi-hot labels
classifier.fit(X_train, y_train)
# predict the label vectors of the test embeddings
predictions = classifier.predict(X_test)

In [None]:
# With multi-label targets, accuracy_score counts a sample as correct only
# when its whole predicted label vector matches the target.
print('exact accuracy score is : ',  accuracy_score(y_test,predictions))

In [None]:
def get_IoU_score(y_test, predictions):
    """
    Given the target vectors y_test and the predictions of the multi-label
    classifier, return the mean IoU (intersection over union) score.

    For each sample, the IoU is the number of labels present in both the
    target and the prediction divided by the number of labels present in at
    least one of them. A sample with no label on either side counts as a
    perfect match (1.0). An empty input gives 0.0.
    """
    n_samples = len(y_test)
    if n_samples == 0:
        return 0.0

    score = 0.0
    for target, pred in zip(y_test, predictions):
        target_ones = set(np.where(np.asarray(target) == 1)[0])
        pred_ones = set(np.where(np.asarray(pred) == 1)[0])
        union = target_ones | pred_ones
        if not union:
            # Nothing expected and nothing predicted: avoid 0/0.
            score += 1.0
            continue
        score += len(target_ones & pred_ones) / len(union)

    return score / n_samples

In [None]:
# Mean per-sample intersection over union between the targets and the predictions.
print('IoU score is  : ',  get_IoU_score(y_test, predictions))